def _step(self, action):
  if self._done:
    return self.reset()

  if self._action_spec:
    tf.nest.assert_same_structure(self._action_spec, action)

  self._num_steps += 1

  observation = self._get_observation()
  if self._num_steps < self._min_duration:
    self._done = False
  elif self._max_duration and self._num_steps >= self._max_duration:
    self._done = True
  else:
    self._done = self._rng.uniform() < self._episode_end_probability

  if self._done:
    reward = self._reward_fn(ts.StepType.LAST, action, observation)
    self._check_reward_shape(reward)
    time_step = ts.termination(observation, reward)
    self._num_steps = 0
  else:
    reward = self._reward_fn(ts.StepType.MID, action, observation)
    self._check_reward_shape(reward)
    time_step = ts.transition(observation, reward, self._discount)

  return time_step
def _terminate(self, reward):
  plog(
      "Player: {} -> {}. Dealer: {} -> {}. Reward: {}.",
      self._player_cards, self._player_cards.sum(),
      self._dealer_cards, self._dealer_cards.sum(),
      reward)
  self._episode_ended = True
  return time_step.termination(self._state(), reward)
def testTermination(self):
  observation = -1
  reward = 2.0
  time_step = ts.termination(observation, reward)
  self.assertEqual(ts.StepType.LAST, time_step.step_type)
  self.assertEqual(-1, time_step.observation)
  self.assertEqual(2.0, time_step.reward)
  self.assertEqual(0.0, time_step.discount)
def testTermination(self):
  observation = tf.constant(-1)
  reward = tf.constant(2.0)
  time_step = ts.termination(observation, reward)
  time_step_ = self.evaluate(time_step)
  self.assertEqual(ts.StepType.LAST, time_step_.step_type)
  self.assertEqual(-1, time_step_.observation)
  self.assertEqual(2.0, time_step_.reward)
  self.assertEqual(0.0, time_step_.discount)
def testTerminationBatched(self):
  observation = np.array([[-1], [-1]])
  reward = np.array([2., 2.])
  time_step = ts.termination(observation, reward)
  self.assertItemsEqual([ts.StepType.LAST] * 2, time_step.step_type)
  self.assertItemsEqual(observation, time_step.observation)
  self.assertItemsEqual(reward, time_step.reward)
  self.assertItemsEqual([0., 0.], time_step.discount)
def _step(self, action):
  self._state = (self._state + 1) % 3
  self.steps += 1
  self.actions_taken.append(action)

  observation = [self._state]
  if self._state == 0:
    return ts.restart(observation)
  elif self._state == 2:
    self.episodes += 1
    return ts.termination(observation, reward=1.0)
  return ts.transition(observation, reward=0.0)
def _step(self, action):
  if action < self._action_spec.minimum or action > self._action_spec.maximum:
    raise ValueError('Action should be in [{0}, {1}], but saw: {2}'.format(
        self._action_spec.minimum, self._action_spec.maximum, action))

  if self._state >= self._final_state:
    # Start a new episode. Ignore action.
    self._state = 0
    return ts.restart(self._state)

  self._state += action
  if self._state < self._final_state:
    return ts.transition(self._state, 1.)
  else:
    return ts.termination(self._state, 1.)
def _step(self, action):
  # Automatically reset the environments on step if they need to be reset.
  if self._auto_reset and self._done:
    return self.reset()

  # TODO(oars): Figure out how tuple or dict actions will be generated by the
  # agents and if we can pass them through directly to gym.
  observation, reward, self._done, self._info = self._gym_env.step(action)

  if self._match_obs_space_dtype:
    observation = self._to_obs_space_dtype(observation)

  if self._done:
    return ts.termination(observation, reward)
  else:
    return ts.transition(observation, reward, self._discount)
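# In practice this Gym wrapper is rarely constructed by hand; TF-Agents exposes
# it through its suite loaders. A minimal usage sketch, assuming the standard
# suite_gym loader; the environment id and the fixed action are illustrative:
import numpy as np
from tf_agents.environments import suite_gym

env = suite_gym.load('CartPole-v1')  # returns a PyEnvironment wrapping the Gym env
time_step = env.reset()
while not time_step.is_last():
  # Step with a constant action, cast to the dtype the spec expects.
  action = np.array(1, dtype=env.action_spec().dtype)
  time_step = env.step(action)
# The final time_step carries StepType.LAST and discount 0.0 via ts.termination.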
def _step(self, action):
  if self._episode_ended:
    return self.reset()

  self.move(action)

  if self.game_over():
    self._episode_ended = True

  if self._episode_ended:
    if self.game_over():
      reward = 100
    else:
      reward = 0
    return ts.termination(np.array(self._state, dtype=np.int32), reward)
  else:
    return ts.transition(
        np.array(self._state, dtype=np.int32), reward=0, discount=0.9)
def _step(self, action):
  if self._do_record:
    self._write_log_entry(action)

  if self._episode_ended:
    # The last action ended the episode. Ignore the current action and start
    # a new episode.
    return self.reset()

  iscore = self._game.get_score()

  # Input agent action.
  if action == self._UP:
    self._game.move_up()
  elif action == self._DOWN:
    self._game.move_down()
  elif action == self._LEFT:
    self._game.move_left()
  elif action == self._RIGHT:
    self._game.move_right()
  else:
    raise ValueError('`action` should be between 0 and 3 (inclusive).')

  # Get state after the agent action is taken.
  state_buffer = self._state
  self._state = self._game.get_flat_board()
  if self._game.is_game_over() or np.array_equal(state_buffer, self._state):
    self._episode_ended = True
  reward = self._game.get_score() - iscore

  # Set rewards.
  if self._episode_ended:
    # Return with a reward of 0.
    return ts.termination(self._state, 0.0)
  else:
    return ts.transition(self._state, reward=reward, discount=1.0)
def _step(self, action):
  if self._episode_ended:
    return self.reset()

  if action == self.ACTION_END_GAME:
    self._episode_ended = True
  elif action == self.ACTION_GET_NEW_CARD:
    new_card = np.random.randint(1, 11)
    self._state += new_card
    print("New card: {}, Sum: {}".format(new_card, self._state))
  else:
    raise ValueError("`action` should be {} or {}".format(
        self.ACTION_GET_NEW_CARD, self.ACTION_END_GAME))

  if self._episode_ended or self._state >= self.LIMIT_STATE:
    reward = self._state if self._state <= self.LIMIT_STATE else -99
    print("End of game, rewarded", reward)
    return time_step.termination(
        np.array([self._state], dtype=np.int32), reward)

  return time_step.transition(
      np.array([self._state], dtype=np.int32), reward=0.0, discount=1.0)
def step(unused_time_step):
  if rng.rand() < 0.10:
    return ts.termination(sample_fn(), 0.0)
  else:
    return ts.transition(sample_fn(), 1.0)
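# The callable above closes over an `rng` and a `sample_fn` from its enclosing
# scope; a sketch of those bindings under assumed, illustrative definitions
# (any observation sampler with a fixed shape would do):
import numpy as np

rng = np.random.RandomState(0)  # seeded so the 10% termination draw is reproducible
sample_fn = lambda: rng.rand(2).astype(np.float32)  # hypothetical observation sampler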
def testTerminationIsLast(self):
  observation = -1
  reward = 2.0
  time_step = ts.termination(observation, reward)
  self.assertTrue(time_step.is_last())
def testTerminationIsLast(self):
  observation = tf.constant(-1)
  reward = tf.constant(2.0)
  time_step = ts.termination(observation, reward)
  is_last = time_step.is_last()
  self.assertEqual(True, self.evaluate(is_last))
def ts_termination(observation):
  return ts.termination(
      observation=observation, reward=np.array(1, dtype=np.float32))
def _step(self, action):
  # The last action ended the episode.
  # Ignore the current action and start a new episode.
  if self._episode_ended:
    return self.reset()

  action_value = action - MAX_AMOUNT / 2
  step = self._step_num
  amount = self._amount
  row = self._df.iloc[[step]]
  buy = row['buy'].item()
  sale = row['sale'].item()

  reward = 0
  if action_value > 0:
    # Buying currency.
    reward = -sale * action_value
  elif action_value < 0 and amount >= np.abs(action_value):
    # Selling currency.
    reward = buy * np.abs(action_value)

  new_amount = amount + action_value

  # Take action.
  self._step_num += 1
  if self._step_num == self.env_size:
    self._episode_ended = True

  if 0 <= new_amount <= MAX_AMOUNT:
    amount = new_amount
  else:
    # Penalize the invalid trade before surfacing the policy error.
    reward = WRONG_ACTION_REWARD
    raise RuntimeError(
        'Wrong action is produced by policy: {}, {}: a{}, s{}'.format(
            action, action_value, amount, step))

  if VERBOSE:
    print(
        '#{step} ({amount}->{new_amount}): '
        '{buy}/{sale}; {action}; {reward}'.format(
            step=step, amount=self._amount, new_amount=amount,
            buy=buy, sale=sale, action=action_value, reward=reward))

  # Update amount after action taken and return the observation.
  self._amount = amount
  observation = self._get_observation(step)

  if self._episode_ended:
    return ts.termination(observation=observation, reward=reward)
  return ts.transition(observation=observation, reward=reward, discount=1.0)
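# The `_step` implementations above all follow the same contract: return
# ts.restart() after a reset, ts.transition() mid-episode, and ts.termination()
# (which forces the discount to 0.0) on the final step. A minimal,
# self-contained sketch of that pattern, assuming the TF-Agents PyEnvironment
# API; the CountdownEnv name, specs, and rewards are hypothetical, for
# illustration only.
import numpy as np
from tf_agents.environments import py_environment
from tf_agents.specs import array_spec
from tf_agents.trajectories import time_step as ts


class CountdownEnv(py_environment.PyEnvironment):
  """Toy environment that terminates after a fixed number of steps."""

  def __init__(self, episode_length=5):
    super(CountdownEnv, self).__init__()
    self._episode_length = episode_length
    self._count = 0
    self._episode_ended = False

  def action_spec(self):
    return array_spec.BoundedArraySpec(
        shape=(), dtype=np.int32, minimum=0, maximum=1, name='action')

  def observation_spec(self):
    return array_spec.ArraySpec(shape=(1,), dtype=np.int32, name='observation')

  def _reset(self):
    self._count = 0
    self._episode_ended = False
    return ts.restart(np.array([self._count], dtype=np.int32))

  def _step(self, action):
    if self._episode_ended:
      # The previous action ended the episode; start a new one.
      return self.reset()
    self._count += 1
    observation = np.array([self._count], dtype=np.int32)
    if self._count >= self._episode_length:
      self._episode_ended = True
      return ts.termination(observation, reward=1.0)  # StepType.LAST, discount 0.0
    return ts.transition(observation, reward=0.0, discount=1.0)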