def _reset(self):
  self._state = 0
  self.last_call_thread_id = threading.current_thread().ident
  return ts.restart([self._state], batch_size=1, reward_spec=self._reward_spec)
def testSparseObs(self, batch_size, actions_from_reward_layer):
  obs_spec = {
      'global': {'sport': tensor_spec.TensorSpec((), tf.string)},
      'per_arm': {
          'name': tensor_spec.TensorSpec((3,), tf.string),
          'fruit': tensor_spec.TensorSpec((3,), tf.string)
      }
  }
  columns_a = tf.feature_column.indicator_column(
      tf.feature_column.categorical_column_with_vocabulary_list(
          'name', ['bob', 'george', 'wanda']))
  columns_b = tf.feature_column.indicator_column(
      tf.feature_column.categorical_column_with_vocabulary_list(
          'fruit', ['banana', 'kiwi', 'pear']))
  columns_c = tf.feature_column.indicator_column(
      tf.feature_column.categorical_column_with_vocabulary_list(
          'sport', ['bridge', 'chess', 'snooker']))
  dummy_net = arm_network.create_feed_forward_common_tower_network(
      obs_spec,
      global_layers=(3, 4, 5),
      arm_layers=(3, 2),
      common_layers=(4, 3),
      output_dim=self._encoding_dim,
      global_preprocessing_combiner=tf.compat.v2.keras.layers.DenseFeatures(
          [columns_c]),
      arm_preprocessing_combiner=tf.compat.v2.keras.layers.DenseFeatures(
          [columns_a, columns_b]))
  time_step_spec = ts.time_step_spec(obs_spec)
  reward_layer = get_per_arm_reward_layer(encoding_dim=self._encoding_dim)
  policy = neural_linucb_policy.NeuralLinUCBPolicy(
      dummy_net,
      self._encoding_dim,
      reward_layer,
      actions_from_reward_layer=tf.constant(
          actions_from_reward_layer, dtype=tf.bool),
      cov_matrix=self._a[0:1],
      data_vector=self._b[0:1],
      num_samples=self._num_samples_per_arm[0:1],
      epsilon_greedy=0.0,
      time_step_spec=time_step_spec,
      accepts_per_arm_features=True,
      emit_policy_info=('predicted_rewards_mean',))
  observations = {
      'global': {
          'sport': tf.constant(['snooker', 'chess'])
      },
      'per_arm': {
          'name': tf.constant([['george', 'george', 'george'],
                               ['bob', 'bob', 'bob']]),
          'fruit': tf.constant([['banana', 'banana', 'banana'],
                                ['kiwi', 'kiwi', 'kiwi']])
      }
  }
  time_step = ts.restart(observations, batch_size=2)
  action_step = policy.action(time_step, seed=1)
  self.assertEqual(action_step.action.shape.as_list(), [2])
  self.assertEqual(action_step.action.dtype, tf.int32)
  # Initialize all variables.
  self.evaluate([tf.compat.v1.global_variables_initializer(),
                 tf.compat.v1.tables_initializer()])
  action = self.evaluate(action_step.action)
  self.assertAllEqual(action.shape, [2])
  p_info = self.evaluate(action_step.info)
  self.assertAllEqual(p_info.predicted_rewards_mean.shape, [2, 3])
  self.assertAllEqual(p_info.chosen_arm_features['name'].shape, [2])
  self.assertAllEqual(p_info.chosen_arm_features['fruit'].shape, [2])
  first_action = action[0]
  first_arm_name_feature = observations[
      bandit_spec_utils.PER_ARM_FEATURE_KEY]['name'][0]
  self.assertAllEqual(p_info.chosen_arm_features['name'][0],
                      first_arm_name_feature[first_action])
def _reset(self):
  self.set_state(self._initial_state)
  self._episode_ended = False
  return ts.restart(self._to_observation())
def testDistributionRaisesNotImplementedError(self):
  mock_tf_py_policy = tf_py_policy.TFPyPolicy(self._get_mock_py_policy())
  observation = tf.ones([5], tf.float32)
  time_step = ts.restart(observation)
  with self.assertRaises(NotImplementedError):
    mock_tf_py_policy.distribution(time_step=time_step)
def testRestartIsFirst(self):
  observation = tf.constant(-1)
  time_step = ts.restart(observation)
  is_first = time_step.is_first()
  self.assertEqual(True, self.evaluate(is_first))
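# Companion sketch, not from the original suite: the inverse property should
# hold for mid-episode steps, i.e. a TimeStep built with ts.transition (the
# standard tf_agents helper for non-initial steps) reports is_first() False.
def testTransitionIsNotFirst(self):
  observation = tf.constant(-1)
  time_step = ts.transition(observation, reward=0.0)
  is_first = time_step.is_first()
  self.assertEqual(False, self.evaluate(is_first))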
def _time_step(self):
  return ts.restart(tf.constant([1, 2], dtype=tf.float32))
def _reset(self):
  if self._current_time_step and self._current_time_step.is_last():
    self._episodes += 1
  self._steps = 0
  return ts.restart(self._get_observation())
def setUp(self):
  super(RandomPyPolicyTest, self).setUp()
  self._time_step_spec = time_step.time_step_spec(
      observation_spec=array_spec.ArraySpec((1,), np.int32))
  self._time_step = time_step.restart(observation=np.array([1]))
def _reset(self):
  self._count = np.array(0, dtype=np.int32)
  return ts.restart(self._count.copy())
def setUp(self):
  super(ScriptedPyPolicyTest, self).setUp()
  self._obs_spec = array_spec.ArraySpec((), np.int32, 'obs')
  self._time_step_spec = ts.time_step_spec(self._obs_spec)
  self._time_step = ts.restart(observation=1)  # pytype: disable=wrong-arg-types
def _reset(self):
  self._state = np.zeros(2, dtype=np.int32)
  self._counter = 0
  self._done = False
  return ts.restart(self._state)
def testPerArmRewardsSparseObs(self):
  if not tf.executing_eagerly():
    return
  tf.compat.v1.set_random_seed(3000)
  obs_spec = {
      'global': {'sport': tensor_spec.TensorSpec((), tf.string)},
      'per_arm': {
          'name': tensor_spec.TensorSpec((3,), tf.string),
          'fruit': tensor_spec.TensorSpec((3,), tf.string)
      }
  }
  columns_a = tf.feature_column.indicator_column(
      tf.feature_column.categorical_column_with_vocabulary_list(
          'name', ['bob', 'george', 'wanda']))
  columns_b = tf.feature_column.indicator_column(
      tf.feature_column.categorical_column_with_vocabulary_list(
          'fruit', ['banana', 'kiwi', 'pear']))
  columns_c = tf.feature_column.indicator_column(
      tf.feature_column.categorical_column_with_vocabulary_list(
          'sport', ['bridge', 'chess', 'snooker']))
  reward_network = (
      global_and_arm_feature_network.create_feed_forward_common_tower_network(
          observation_spec=obs_spec,
          global_layers=(4, 3, 2),
          arm_layers=(6, 5, 4),
          common_layers=(7, 6, 5),
          global_preprocessing_combiner=(
              tf.compat.v2.keras.layers.DenseFeatures([columns_c])),
          arm_preprocessing_combiner=tf.compat.v2.keras.layers.DenseFeatures(
              [columns_a, columns_b])))
  time_step_spec = ts.time_step_spec(obs_spec)
  action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 2)
  policy = greedy_reward_policy.GreedyRewardPredictionPolicy(
      time_step_spec,
      action_spec,
      reward_network=reward_network,
      accepts_per_arm_features=True,
      emit_policy_info=('predicted_rewards_mean',))
  observations = {
      'global': {
          'sport': tf.constant(['snooker', 'chess'])
      },
      'per_arm': {
          'name': tf.constant([['george', 'george', 'george'],
                               ['bob', 'bob', 'bob']]),
          'fruit': tf.constant([['banana', 'banana', 'banana'],
                                ['kiwi', 'kiwi', 'kiwi']])
      }
  }
  time_step = ts.restart(observations, batch_size=2)
  action_step = policy.action(time_step, seed=1)
  self.assertEqual(action_step.action.shape.as_list(), [2])
  self.assertEqual(action_step.action.dtype, tf.int32)
  # Initialize all variables.
  self.evaluate(tf.compat.v1.global_variables_initializer())
  action = self.evaluate(action_step.action)
  self.assertAllEqual(action.shape, [2])
  p_info = self.evaluate(action_step.info)
  self.assertAllEqual(p_info.predicted_rewards_mean.shape, [2, 3])
  self.assertAllEqual(p_info.chosen_arm_features['name'].shape, [2])
  self.assertAllEqual(p_info.chosen_arm_features['fruit'].shape, [2])
  first_action = action[0]
  first_arm_name_feature = observations[
      bandit_spec_utils.PER_ARM_FEATURE_KEY]['name'][0]
  self.assertAllEqual(p_info.chosen_arm_features['name'][0],
                      first_arm_name_feature[first_action])
def _reset(self):
  observations = self._parallel_env.reset()
  self._dones = [False] * self._num_envs
  timesteps = ts.restart(observations, batch_size=self._num_envs)
  return timesteps
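# Illustrative sketch, not from the original sources: with batch_size=N and a
# numpy observation that already carries a leading batch dimension, ts.restart
# tiles step_type so every entry in the batch is StepType.FIRST. The shapes
# asserted here are assumptions about that behavior, not library tests.
import numpy as np
from tf_agents.trajectories import time_step as ts

batched = ts.restart(np.zeros((4, 2), dtype=np.float32), batch_size=4)
assert batched.step_type.shape == (4,)
assert (batched.step_type == ts.StepType.FIRST).all()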
def _reset(self):
  self._state = 0
  self.resets += 1
  self.last_call_thread_id = threading.current_thread().ident
  return ts.restart([self._state])
def _reset(self) -> ts.TimeStep:
  self._pasture_engine.reset()
  return ts.restart(self._pasture_engine.state())
def testGetEpochLoss(self):
  agent = ppo_agent.PPOAgent(
      self._time_step_spec,
      self._action_spec,
      tf.compat.v1.train.AdamOptimizer(),
      actor_net=DummyActorNet(self._obs_spec, self._action_spec),
      value_net=DummyValueNet(self._obs_spec),
      normalize_observations=False,
      normalize_rewards=False,
      value_pred_loss_coef=1.0,
      policy_l2_reg=1e-4,
      value_function_l2_reg=1e-4,
      entropy_regularization=0.1,
      importance_ratio_clipping=10)
  observations = tf.constant([[1, 2], [3, 4], [1, 2], [3, 4]],
                             dtype=tf.float32)
  time_steps = ts.restart(observations, batch_size=2)
  actions = tf.constant([[0], [1], [0], [1]], dtype=tf.float32)
  returns = tf.constant([1.9, 1.0, 1.9, 1.0], dtype=tf.float32)
  sample_action_log_probs = tf.constant([0.9, 0.3, 0.9, 0.3],
                                        dtype=tf.float32)
  advantages = tf.constant([1.9, 1.0, 1.9, 1.0], dtype=tf.float32)
  weights = tf.constant([1.0, 1.0, 0.0, 0.0], dtype=tf.float32)
  sample_action_distribution_parameters = {
      'loc': tf.constant([[9.0], [15.0], [9.0], [15.0]], dtype=tf.float32),
      'scale': tf.constant([[8.0], [12.0], [8.0], [12.0]], dtype=tf.float32),
  }
  train_step = tf.compat.v1.train.get_or_create_global_step()

  loss_info = agent.get_epoch_loss(
      time_steps, actions, sample_action_log_probs, returns, advantages,
      sample_action_distribution_parameters, weights, train_step,
      debug_summaries=False)

  self.evaluate(tf.compat.v1.global_variables_initializer())
  total_loss, extra_loss_info = self.evaluate(loss_info)
  (policy_gradient_loss, value_estimation_loss, l2_regularization_loss,
   entropy_reg_loss, kl_penalty_loss) = extra_loss_info

  # Check loss values are as expected. The factor of 2/4 is because four
  # timesteps were included in the data but two were masked out: the
  # reduce_means in the losses divide by 4, while the computed loss values
  # come from the first 2 timesteps only.
  expected_pg_loss = -0.0164646133 * 2 / 4
  expected_ve_loss = 123.205 * 2 / 4
  expected_l2_loss = 1e-4 * 12 * 2 / 4
  expected_ent_loss = -0.370111 * 2 / 4
  expected_kl_penalty_loss = 0.0
  self.assertAllClose(
      expected_pg_loss + expected_ve_loss + expected_l2_loss +
      expected_ent_loss + expected_kl_penalty_loss,
      total_loss,
      atol=0.001,
      rtol=0.001)
  self.assertAllClose(expected_pg_loss, policy_gradient_loss)
  self.assertAllClose(expected_ve_loss, value_estimation_loss)
  self.assertAllClose(
      expected_l2_loss, l2_regularization_loss, atol=0.001, rtol=0.001)
  self.assertAllClose(expected_ent_loss, entropy_reg_loss)
  self.assertAllClose(expected_kl_penalty_loss, kl_penalty_loss)
def _reset(self):
  self._state = np.array(self.env.reset(), dtype=np.float32)
  self._episode_ended = False
  return ts.restart(self._state)
def _reset(self):
  self._state = 0
  return ts.restart(self._state)
def _reset(self):
  self._state = np.int32(0)
  return ts.restart(self._state)
def _reset(self):
  self._game.reset()
  return timeStep.restart(self._game.game_state())
def _reset(self):
  if self._current_time_step and self._current_time_step.is_last():
    self._episodes += 1
  self._current_step = np.array(0, dtype=self._dtype)
  return ts.restart(self._get_observation())
def _reset(self):
  self.env.reset()
  self._state = self.env.encoded_state()
  self._episode_ended = False
  return ts.restart(self._state)
def _reset(self):
  self._done = False
  self._ready_state = self._env.reset()
  self._prev_time_steps = [[None for _ in g] for g in self._ready_state]
  self._prev_actions = [[None for _ in g] for g in self._ready_state]
  return ts.restart(self._ready_state[0][0])
def _reset(self):
  self.game.game_reset()
  self._episode_ended = False
  return ts.restart(self.obs)
def testRestartIsFirst(self):
  observation = -1
  time_step = ts.restart(observation)
  self.assertTrue(time_step.is_first())
def _reset(self):
  self._state = self.job_info.get_observation()
  self._episode_ended = False
  self.step_count = 0
  self.assigned_job = []
  return ts.restart(np.array(self._state, dtype=np.float32))
def _reset(self):
  self.reset_board()
  return ts.restart(self._state)
def _reset(self): """Resets the wrapper.""" self._state = np.array(self._env.reset(), dtype=np.float32) self._episode_ended = False return ts.restart(self._state)
def _reset(self):
  self.prev_bx = 0.0
  self.prev_bdy = 0.0
  return ts.restart(self.env.reset())
# `dados1`, `dados2` (market data frames) and `saved_policy` are assumed to be
# defined earlier; this snippet only shows the evaluation loop.
import tensorflow as tf

from Trade import Trade
from tf_agents.trajectories import time_step as ts

trader = Trade()
stop = -500
gain = 500
trader.reset()
action = 0
for i in range(len(dados1)):
  compra, venda, neg, ficha, comprado, vendido, recompensa = trader.agente(
      dados1.values[i], action, stop, gain, 0)
  # print('state: ', dados2.values[i])
  observations = tf.constant([[dados2.values[i]]])
  time_step = ts.restart(observations, batch_size=1)
  action2 = saved_policy.action(time_step)
  # time_step = ts.transition(observations, 1)
  # action2 = agent.policy.action(time_step)
  action = action2.action.numpy()[0]
  print(i, '------------------')
  print('action: ', action)
  print('bought: ', comprado)
  print('sold: ', vendido)
  print('reward: ', recompensa)
  print('reward: ', time_step.reward.numpy(), ' action: ',
        action2.action.numpy()[0])
print(sum(neg.ganhofinal))
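# Note on the loop above, not part of the original snippet: calling ts.restart
# on every bar marks every policy input as StepType.FIRST, so an episode-aware
# policy never sees a mid-episode step. The commented-out lines hint at the
# intended fix. A hedged sketch of one way to build subsequent steps with
# ts.transition (feeding `recompensa` as the reward is an assumption about
# the author's intent, not something the original confirms):
#
#   if i == 0:
#     time_step = ts.restart(observations, batch_size=1)
#   else:
#     time_step = ts.transition(observations, reward=recompensa,
#                               outer_dims=[1])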