def _step(self, action):
    self._counter += 1
    if self._episode_ended:
        # The last action ended the episode. Ignore the current action and start
        # a new episode.
        return self.reset()

    self._state[self._counter - 1, 0] = action[0]
    self._state[self._counter - 1, 1] = action[1]

    # Make sure episodes don't go on forever.
    if self._counter > self._END:
        self._episode_ended = True

    s1 = self._state[11, 0]
    a1 = self._state[self._counter - 1, 0] * s1
    s2 = self._state[11, 1]
    a2 = self._state[self._counter - 1, 1] * s2
    self._state[11, 0] = s1 - a1 + (a1 + a2) * self._MULT / 2
    self._state[11, 1] = s2 - a2 + (a1 + a2) * self._MULT / 2

    if self._episode_ended:
        reward_self = (self._state[11, 0] - s1) / s1
        reward_other = (self._state[11, 1] - s2) / s2
        my_reward = reward_fun(reward_self, reward_other)
        ret = ts.termination(np.array(self._state, dtype=np.float32), my_reward)
        self._counter += 1
        return ret
    else:
        reward_self = (self._state[11, 0] - s1) / s1
        reward_other = (self._state[11, 1] - s2) / s2
        my_reward = reward_fun(reward_self, reward_other)
        ret = ts.transition(np.array(self._state, dtype=np.float32),
                            reward=my_reward, discount=1.0)
        for i in range(2):
            self._state[self._counter + 1, i] += self._state[self._counter, i]
        # print("transition:", ret)
        return ret
def step_adversary(self, action):
    action = action.item() if self._action_is_discrete else action
    observation, reward, self._done, self._info = self._gym_env.step_adversary(
        action)
    if self._match_obs_space_dtype:
        observation = self._adversary_to_obs_space_dtype(observation)
    reward = np.asarray(reward, dtype=self.reward_spec().dtype)
    outer_dims = nest_utils.get_outer_array_shape(reward, self.reward_spec())

    if self._done:
        return ts_lib.termination(observation, reward, outer_dims=outer_dims)
    else:
        return ts_lib.transition(observation, reward, self._discount,
                                 outer_dims=outer_dims)
def _step(self, action):
    if self._game.is_episode_finished():
        # The last action ended the episode. Ignore the current action and start a new episode.
        return self.reset()

    for i in range(self._frame_skip):
        if i == 0:
            self.take_action(action)
        else:
            self.take_action(0)
        if self._game.is_episode_finished():
            return ts.termination(self.get_screen_buffer_preprocessed(), self.get_reward())

    return ts.transition(self.get_screen_buffer_preprocessed(), self.get_reward())
def _step(self, action): """ Performs the action as a move on the board. :param action: the move to perform, a color from 0-4 referring to the REGULARIZED board. :return: the next time_step. """ if self._episode_ended: return self.reset() if self.state.move_count > 200: print("Canceling game due to stalling.") reward = -1 self._episode_ended = True else: # We need to map to the normal colors using the reverse mapping. new_owned_count = self.state.move(self.mapping.index(action)) if self.state.last_move_illegal: reward = -1 print("INFO: Illegal move!") else: reward = new_owned_count self.total_score += new_owned_count if not self.state.is_final_state: self.state.move(random.randint(0, 5)) if self.state.is_final_state: self._episode_ended = True if self.total_score > 28: # print("Won!") pass # reward += 10 elif self.total_score < 28: pass # print("Lost!") # elif self.total_score < 28: # reward -= 10 # print(str_board(self.state.board)) # print(self.total_score) self.rewards.append(reward) self.__update_mapping() self.__regularize_board() if self._episode_ended: return ts.termination(self.regularized_board, reward) else: # TODO was fürn discount? return ts.transition(self.regularized_board, reward, 0.7)
def visualize(self, num_episodes=1):
    # Ticket (https://github.com/tensorflow/agents/issues/59) recommends
    # doing the rendering in the original environment.
    if self._unwrapped_runtime is not None:
        for _ in range(0, num_episodes):
            state = self._unwrapped_runtime.reset()
            is_terminal = False
            while not is_terminal:
                print(state)
                action_step = self._agent._eval_policy.action(
                    ts.transition(state, reward=0.0, discount=1.0))
                # print(action_step)
                # TODO(@hart); make generic for multi agent planning
                state, reward, is_terminal, _ = self._unwrapped_runtime.step(
                    action_step.action.numpy())
                # print(reward)
                self._unwrapped_runtime.render()
def _step(self, action):
    if action < self._action_spec.minimum or action > self._action_spec.maximum:
        raise ValueError(
            'Action should be in [{0}, {1}], but saw: {2}'.format(
                self._action_spec.minimum, self._action_spec.maximum, action))

    if self._state >= self._final_state:
        # Start a new episode. Ignore action.
        self._state = 0
        return ts.restart(self._state)

    self._state += action
    if self._state < self._final_state:
        return ts.transition(self._state, 1.)
    else:
        return ts.termination(self._state, 1.)
def _step(self, action): """ Step, action is velocities of left/right wheel """ # Reset if ended if self._episode_ended: return self.reset() self._num_steps += 1 # Step the environment self._env.step(self._actions[action]) done, reward = self._compute_reward() # Compute and save states self.set_state() self._episode_ended = done # Transition if self._episode_ended: return ts.termination(self._state, reward) else: return ts.transition(self._state, reward)
def _step(self, action): """ Execute one time step in the environment. Input: action -- dictionary of batched actions Output: TimeStep object (see tf-agents docs) """ self._state, info, obs = self.control_circuit(self._state, action) self.info['psi_cached'] = info # Calculate rewards self._elapsed_steps += 1 self._episode_ended = (self._elapsed_steps == self.episode_length) # Add dummy time dimension to tensors and append them to history for a in action.keys(): self.history[a].append(action[a]) self.history['msmt'].append(obs) # Make observations of 'msmt' of horizon H, shape=[batch_size,H] # measurements are selected with hard-coded attention step. # Also add clock of period 'T' to observations, shape=[batch_size,T] observation = {} H = [ self.history['msmt'][-self.attn_step * i - 1] for i in range(self.H) ] H.reverse() # to keep chronological order of measurements observation['msmt'] = tf.concat(H, axis=1) C = tf.one_hot([self._elapsed_steps % self.T] * self.batch_size, self.T) observation['clock'] = C observation['const'] = tf.ones(shape=[self.batch_size, 1]) reward = self.calculate_reward(action) self._episode_return += reward if self._episode_ended: self._epoch += 1 self._current_time_step_ = ts.termination(observation, reward) else: self._current_time_step_ = ts.transition(observation, reward) return self.current_time_step()
def _step(self, action):
    # Automatically reset the environments on step if they need to be reset.
    if self._auto_reset and self._done:
        return self.reset()

    # TODO(oars): Figure out how tuple or dict actions will be generated by the
    # agents and if we can pass them through directly to gym.
    observation, reward, self._done, self._info = self._gym_env.step(action)

    if self._match_obs_space_dtype:
        observation = self._to_obs_space_dtype(observation)

    if self._done:
        return ts.termination(observation, reward)
    else:
        return ts.transition(observation, reward, self._discount)
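# Hedged usage sketch for the gym-wrapper step above: it loads a Gym task through
# the standard tf_agents suite and steps it until the episode terminates. The
# 'CartPole-v0' task and the constant action are illustrative choices only, not
# part of the original code.
from tf_agents.environments import suite_gym

py_env = suite_gym.load('CartPole-v0')  # step() ultimately dispatches to a _step like the one above
time_step = py_env.reset()
while not time_step.is_last():
    # Mid-episode calls return ts.transition; the final call returns ts.termination.
    time_step = py_env.step(1)
print(time_step.step_type, time_step.reward)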
def _step(self, action):
    if action < self._action_spec.minimum or action > self._action_spec.maximum:
        raise ValueError('Action should be in [{0}, {1}], but saw: {2}'.format(
            self._action_spec.minimum, self._action_spec.maximum, action))
    if action.shape != ():  # pylint: disable=g-explicit-bool-comparison
        raise ValueError('Action should be a scalar.')

    if self._state >= self._final_state:
        # Start a new episode. Ignore action.
        return self.reset()

    self._state += action
    self._state = np.int32(self._state)
    if self._state < self._final_state:
        return ts.transition(self._state, 1.)
    else:
        return ts.termination(self._state, 1.)
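# A minimal validation sketch for a hand-written PyEnvironment such as the
# counting environment whose _step is shown above. 'CountingEnv' is a
# hypothetical name for the enclosing class; the validation call itself is the
# standard tf_agents helper.
import numpy as np
from tf_agents.environments import utils

env = CountingEnv()  # hypothetical class owning the _step above
utils.validate_py_environment(env, episodes=3)  # runs random episodes and checks them against the specs

time_step = env.reset()
while not time_step.is_last():
    time_step = env.step(np.int32(1))  # scalar action, matching the shape check in _step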
def _step(self, action):
    if self._episode_ended:
        # The last action ended the episode. Ignore the current action and start a new episode.
        return self.reset()

    # Make sure episodes don't go on forever.
    instructionint = action[0]
    self.game.instruction = gl.decode[instructionint]
    status = self.game.dqn_update()
    if status == "end_episode" or self.game.stepcount == 1000:
        self._episode_ended = True

    # Score for clearing lines is weighted 10x compared to just keeping the
    # game going for one more move.
    reward = self.game.stepcount + 10 * self.game.score

    if self._episode_ended:
        return ts.termination(self.game.screenmat.astype("int32"), reward)
    else:
        return ts.transition(self.game.screenmat.astype("int32"), reward, discount=1.0)
def _step(self, action):
    if self._episode_ended:
        self._send(Action(value=action, phase=self._phase))
        return self.reset()

    self._send(Action(value=action, phase=self._phase))
    message = self._receive()

    finished = message.finished
    reward = message.reward

    if finished:
        self._episode_ended = True
        return ts.termination(np.array(self._state, dtype=np.float), reward)

    observation = np.array(message.observable, dtype=np.float)
    return ts.transition(observation, reward, discount=self.discount)
def testCriticLossWithMaskedActions(self):
    # Observations are now a tuple of the usual observation and an action mask.
    observation_spec_with_mask = (self._obs_spec,
                                  tensor_spec.BoundedTensorSpec([2], tf.int32, 0, 1))
    time_step_spec = ts.time_step_spec(observation_spec_with_mask)
    dummy_categorical_net = DummyCategoricalNet(self._obs_spec)
    agent = categorical_dqn_agent.CategoricalDqnAgent(
        time_step_spec,
        self._action_spec,
        dummy_categorical_net,
        self._optimizer,
        observation_and_action_constraint_splitter=lambda x: (x[0], x[1]))

    # For `observations`, the masks are set up so that only one action is valid
    # for each element in the batch.
    observations = (tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
                    tf.constant([[1, 0], [0, 1]], dtype=tf.int32))
    time_steps = ts.restart(observations, batch_size=2)

    actions = tf.constant([0, 1], dtype=tf.int32)
    action_steps = policy_step.PolicyStep(actions)

    rewards = tf.constant([10, 20], dtype=tf.float32)
    discounts = tf.constant([0.9, 0.9], dtype=tf.float32)

    # For `next_observations`, the masks are set up so the opposite actions as
    # before are valid.
    next_observations = (tf.constant([[5, 6], [7, 8]], dtype=tf.float32),
                         tf.constant([[0, 1], [1, 0]], dtype=tf.int32))
    next_time_steps = ts.transition(next_observations, rewards, discounts)

    experience = test_utils.stacked_trajectory_from_transition(
        time_steps, action_steps, next_time_steps)

    # Due to the constant initialization of the DummyCategoricalNet, we can
    # expect the same loss every time. Note this is different from the loss in
    # testCriticLoss above due to previously optimal actions being masked out.
    expected_loss = 5.062895
    loss_info = agent._loss(experience)
    self.evaluate(tf.compat.v1.global_variables_initializer())
    evaluated_loss = self.evaluate(loss_info).loss
    self.assertAllClose(evaluated_loss, expected_loss, atol=1e-4)
def step(self, action): """Apply action and return new time_step.""" data = [] for df in self.dfs: data.append(np.array([df['volume'].values[range(self.current_step-self.look_back_window, self.current_step)], df['open'].values[range(self.current_step-self.look_back_window, self.current_step)], df['high'].values[range(self.current_step-self.look_back_window, self.current_step)], df['low'].values[range(self.current_step-self.look_back_window, self.current_step)], df['close'].values[range(self.current_step-self.look_back_window, self.current_step)]])) self._state = np.array(data) #seperate our action list to different variables coin = action[0] action_type = action[1] amount = action[2]/10.0 #initiliaze the reward to 0 for each timestep reward = 0 if self.wallet[0]<0.01*self.initial_balance: self._episode_ended = True return ts.termination(self._state, reward) if action_type==0: #Buy coin current_price = data[coin][1, self.look_back_window-1] usd_val = amount*self.wallet[0] self.wallet[0] -= usd_val self.wallet[coin+1] += usd_val/current_price self.moves[coin].append(['buy', self.current_step, current_price]) if action_type==1: #Sell coin current_price = data[coin][1, self.look_back_window-1] coin_val = amount*self.wallet[coin+1] self.wallet[coin+1] -= coin_val self.wallet[0] += coin_val*current_price self.moves[coin].append(['sell', self.current_step, current_price]) #increment the step in our environment self.current_step+=1 #return the timestep as a transition containing the state, reward, and discount value return ts.transition(self._state, reward, 1.0)
def testLossWithChangedOptimalActions(self, agent_class):
    q_net = DummyNet(self._observation_spec, self._action_spec)
    agent = agent_class(
        self._time_step_spec,
        self._action_spec,
        q_network=q_net,
        optimizer=None)

    observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
    time_steps = ts.restart(observations, batch_size=2)

    actions = tf.constant([0, 1], dtype=tf.int32)
    action_steps = policy_step.PolicyStep(actions)

    rewards = tf.constant([10, 20], dtype=tf.float32)
    discounts = tf.constant([0.9, 0.9], dtype=tf.float32)

    # Note that instead of [[5, 6], [7, 8]] as before, we now have -5 and -7.
    next_observations = tf.constant([[-5, 6], [-7, 8]], dtype=tf.float32)
    next_time_steps = ts.transition(next_observations, rewards, discounts)

    experience = trajectories_test_utils.stacked_trajectory_from_transition(
        time_steps, action_steps, next_time_steps)

    # Using the kernel initializer [[2, 1], [1, 1]] and bias initializer
    # [[1], [1]] from DummyNet above, we can calculate the following values:
    # Q-value for first observation/action pair: 2 * 1 + 1 * 2 + 1 = 5
    # Q-value for second observation/action pair: 1 * 3 + 1 * 4 + 1 = 8
    # (Here we use the second row of the kernel initializer above, since the
    # chosen action is now 1 instead of 0.)
    #
    # For the target Q-values here, note that since we've replaced 5 and 7 with
    # -5 and -7, it is better to use action 1 with a kernel of [1, 1] instead of
    # action 0 with a kernel of [2, 1].
    # Target Q-value for first next_observation: 1 * -5 + 1 * 6 + 1 = 2
    # Target Q-value for second next_observation: 1 * -7 + 1 * 8 + 1 = 2
    # TD targets: 10 + 0.9 * 2 = 11.8 and 20 + 0.9 * 2 = 21.8
    # TD errors: 11.8 - 5 = 6.8 and 21.8 - 8 = 13.8
    # TD loss: 6.3 and 13.3 (Huber loss subtracts 0.5)
    # Overall loss: (6.3 + 13.3) / 2 = 9.8
    expected_loss = 9.8
    loss, _ = agent._loss(experience)

    self.evaluate(tf.compat.v1.global_variables_initializer())
    self.assertAllClose(self.evaluate(loss), expected_loss)
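# A quick numerical recheck of the expected-loss arithmetic spelled out in the
# comment block above (element-wise Huber loss with delta=1, then the batch
# mean). Purely illustrative; it does not exercise the agent under test.
import numpy as np

def huber(err, delta=1.0):
    abs_err = np.abs(err)
    return np.where(abs_err <= delta, 0.5 * err**2, delta * (abs_err - 0.5 * delta))

q_values = np.array([5.0, 8.0])
td_targets = np.array([10 + 0.9 * 2, 20 + 0.9 * 2])  # [11.8, 21.8]
print(huber(td_targets - q_values).mean())  # -> 9.8, matching expected_loss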
def testCriticLoss(self, include_critic_entropy_term, reward_noise_variance,
                   use_tf_variable, td_targets):
    if use_tf_variable:
        reward_noise_variance = tf.Variable(reward_noise_variance)
    agent = cql_sac_agent.CqlSacAgent(
        self._time_step_spec,
        self._action_spec,
        critic_network=DummyCriticNet(),
        actor_network=None,
        actor_optimizer=None,
        critic_optimizer=None,
        alpha_optimizer=None,
        cql_alpha=1.0,
        num_cql_samples=1,
        include_critic_entropy_term=include_critic_entropy_term,
        use_lagrange_cql_alpha=False,
        reward_noise_variance=reward_noise_variance,
        actor_policy_ctor=DummyActorPolicy)

    observations = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
    time_steps = ts.restart(observations, batch_size=2)
    actions = tf.constant([[5], [6]], dtype=tf.float32)

    rewards = tf.constant([10, 20], dtype=tf.float32)
    discounts = tf.constant([0.9, 0.9], dtype=tf.float32)
    next_observations = tf.constant([[5, 6], [7, 8]], dtype=tf.float32)
    next_time_steps = ts.transition(next_observations, rewards, discounts)

    pred_td_targets = [7., 10.]

    self.evaluate(tf.compat.v1.global_variables_initializer())

    # Expected critic loss has factor of 2, for the two TD3 critics.
    expected_loss = self.evaluate(
        2 * tf.compat.v1.losses.mean_squared_error(
            tf.constant(td_targets), tf.constant(pred_td_targets)))

    loss = agent._critic_loss_with_optional_entropy_term(
        time_steps,
        actions,
        next_time_steps,
        td_errors_loss_fn=tf.math.squared_difference)

    self.evaluate(tf.compat.v1.global_variables_initializer())
    loss_ = self.evaluate(loss)
    self.assertAllClose(loss_, expected_loss)
def _step(self, actions):
    next_actions = transform_actions(actions, self.obs, self.config)
    self.last_obs = self.obs
    self.obs, reward, done, info = self.env.step(next_actions)

    x_obs = transform_observation(self.obs, self.config)
    x_reward = transform_reward(done, self.last_obs, self.obs, self.config)

    # final
    if x_reward <= REWARD_LOST:
        done, info = True, {}
        return_object = ts.termination(np.array(x_obs, dtype=np.int32), x_reward)
        return return_object
    else:
        return_object = ts.transition(np.array(x_obs, dtype=np.int32),
                                      reward=x_reward, discount=1.0)
        return return_object
def _step(self, action):
    if self._episode_ended:
        # The last action ended the episode. Ignore the current action and start
        # a new episode.
        return self.reset()

    if self._player.pod.nextCheckId > 1 or self._player.pod.turns > 100:
        # That's enough for training...
        self._episode_ended = True
    else:
        # Play the given action
        self._player.controller.set_play(action, self._player.pod)
        self._player.step(self._board)

    if self._episode_ended:
        return ts.termination(self._to_observation(), self._get_reward())
    else:
        return ts.transition(
            self._to_observation(),
            reward=self._get_reward(),
            discount=np.asarray(100, dtype=np.float32))
def _step(self, action):
    prices = self.productsCosts * action
    observation = np.round(
        (self.placeSize * self.productsUsualBuyingRates) *
        (self.productsPriceFlexibility**
         ((self.productsUsualPrices - prices) / self.productsUsualPrices)))
    marginPerProduct = (prices - self.productsCosts) * observation
    reward = marginPerProduct.sum()

    # convert to numpy array of float32, otherwise not accepted by specs
    observation = np.array(observation, dtype=np.float32)

    if self._state < self.duration:
        self._state += 1
        return ts.transition(observation, reward)
    else:
        return ts.termination(observation, reward)
def _step(self, action):
    if self._episode_ended:
        return self.reset()

    cumulative_reward = 0
    cumulative_done = False

    obs, reward, done, info = self._env.step(action)
    cumulative_reward += reward
    self._state = obs
    if done:
        cumulative_done = True

    if cumulative_done:
        self._episode_ended = True
        return ts.termination(self._state, reward)
    else:
        return ts.transition(self._state, reward, discount=0.98)
def _step(self, action):
    self._state = self.states[self.time_flow][:-1]
    raw_reward = self.states[self.time_flow][-1]
    reward = 1. + raw_reward * tf.sign(action)

    if self.delay:
        if self.delay_counter < self.delay_threshold:
            self.delay_counter += 1
            self.total_reward *= (1. + (raw_reward * self.curr_position))
        else:
            if self.curr_position == tf.sign(action):
                # action not changed
                self.total_reward *= (1. + (raw_reward * self.curr_position))
            else:
                # action changed, so the delay counter is reset
                self.curr_position = tf.sign(action)
                self.actions.append(
                    [self.curr_position.numpy(), self.time_flow])
                self.delay_counter = 0
                self.total_reward *= reward
                self.action_counter += 1
    else:
        self.total_reward *= reward
        if self.curr_position * tf.sign(action) < 0:
            # position changed
            self.action_counter += 1
            self.actions.append([tf.sign(action).numpy(), self.time_flow])
            self.curr_position = tf.sign(action)  # current position changed

    self.time_flow += 1

    if self.time_flow == self.states.shape[0]:
        termination = ts.termination(
            np.array(self._state, dtype=np.float32), reward)
        print('action counter is ', self.action_counter)
        print('total reward during training', self.total_reward)
        # if self.eval == True:
        #     print('action counter is ', self.action_counter)
        print('where actions have done?', self.actions)
        self.reset()
        return termination
    else:
        return ts.transition(np.array(self._state, dtype=np.float32),
                             reward, discount=1.0)
def testLoss(self, agent_class, run_mode):
    if tf.executing_eagerly() and run_mode == context.graph_mode:
        self.skipTest('b/123778560')

    with run_mode(), tf.compat.v2.summary.record_if(False):
        q_net = DummyNet(self._observation_spec, self._action_spec)
        agent = agent_class(
            self._time_step_spec,
            self._action_spec,
            q_network=q_net,
            optimizer=None)

        observations = [tf.constant([[1, 2], [3, 4]], dtype=tf.float32)]
        time_steps = ts.restart(observations, batch_size=2)

        actions = [tf.constant([[0], [1]], dtype=tf.int32)]
        action_steps = policy_step.PolicyStep(actions)

        rewards = tf.constant([10, 20], dtype=tf.float32)
        discounts = tf.constant([0.9, 0.9], dtype=tf.float32)
        next_observations = [tf.constant([[5, 6], [7, 8]], dtype=tf.float32)]
        next_time_steps = ts.transition(next_observations, rewards, discounts)

        experience = test_utils.stacked_trajectory_from_transition(
            time_steps, action_steps, next_time_steps)

        # Using the kernel initializer [[2, 1], [1, 1]] and bias initializer
        # [[1], [1]] from DummyNet above, we can calculate the following values:
        # Q-value for first observation/action pair: 2 * 1 + 1 * 2 + 1 = 5
        # Q-value for second observation/action pair: 1 * 3 + 1 * 4 + 1 = 8
        # (Here we use the second row of the kernel initializer above, since the
        # chosen action is now 1 instead of 0.)
        # Q-value for first next_observation: 2 * 5 + 1 * 6 + 1 = 17
        # Q-value for second next_observation: 2 * 7 + 1 * 8 + 1 = 23
        # TD targets: 10 + 0.9 * 17 = 25.3 and 20 + 0.9 * 23 = 40.7
        # TD errors: 25.3 - 5 = 20.3 and 40.7 - 8 = 32.7
        # TD loss: 19.8 and 32.2 (Huber loss subtracts 0.5)
        # Overall loss: (19.8 + 32.2) / 2 = 26
        expected_loss = 26.0
        loss, _ = agent._loss(experience)

        self.evaluate(tf.compat.v1.initialize_all_variables())
        self.assertAllClose(self.evaluate(loss), expected_loss)
def _step(self, action):
    # If done, reset.
    if self._episode_ended:
        return self.reset()

    # Behavior for valid input.
    if action == 0 or self._state[8] != 0:
        # Stand, or the maximum of 4 hits has been reached.
        self._episode_ended = True
    elif action == 1:
        newCard = self.drawNewCard()
        self._state[0] += newCard
        # Start looking at the 4th spot since the first 3 are set by default.
        for i in range(3, 9):
            # Once you find the first empty card slot, add the new card, then break.
            if self._state[i] == 0:
                self._state[i] = newCard
                break
        # When over 21, change any 11s (aces) to 1s (skip state[0] because it's a sum).
        if self._state[0] > 21:
            for i in range(1, 9):
                if self._state[i] == 11:
                    self._state[i] = 1    # fix card
                    self._state[0] -= 10  # fix sum
                    # Break if successfully reduced.
                    if self._state[0] <= 21:
                        break
    else:
        raise ValueError('Invalid action, non-binary action detected')

    # If the game is over, grant rewards; otherwise just transition.
    if self._episode_ended or self._state[0] >= 21:
        # score - 21 is the reward value; if bust, -21.
        resultReward = self._state[0] - 21 if self._state[0] <= 21 else -21
        # A sum of 11 or less means we could have safely hit again, so penalize it.
        if self._state[0] <= 11:
            resultReward = -100
        return ts.termination(np.array([self._state], dtype=np.int32), resultReward)
    else:
        return ts.transition(np.array([self._state], dtype=np.int32), reward=0.0, discount=1.0)
def _step(self, action):
    if self._game.is_episode_finished():
        # The last action ended the episode. Ignore the current action and start a new episode.
        return self.reset()

    # construct one hot encoded action as required by ViZDoom
    one_hot = [0] * self._num_actions
    one_hot[action] = 1

    # execute action and receive reward
    reward = self._game.make_action(one_hot)

    # return transition depending on game state
    if self._game.is_episode_finished():
        return time_step.termination(self.get_screen_buffer_preprocessed(), reward)
    else:
        return time_step.transition(self.get_screen_buffer_preprocessed(), reward)
def _step(self, action: types.NestedArray) -> ts.TimeStep:
    if self._current_time_step.is_last():
        return self.reset()

    obs, legal_moves, rewards, done = self._env.step(action)
    self._cumulative_rewards += rewards
    observations_and_legal_moves = {
        'observations': obs,
        'cumulative_rewards': self._cumulative_rewards,
        'legal_moves': legal_moves
    }
    reward = self._utility_function(self._cumulative_rewards)

    if done:
        return ts.termination(observations_and_legal_moves, reward)
    else:
        return ts.transition(observations_and_legal_moves, reward, self.gamma)
def _step(self, player_actions):
    if self._episode_ended:
        # print("game already ended resetting")
        return self.reset()

    player_index = self.current_player_index
    player_action = Action(int(player_actions[0]))
    structure_index = int(player_actions[1])

    player = self.players[player_index]
    player_deck = self.player_deck(player_index)
    success_reward_modifier = 0

    if structure_index >= len(player_deck):
        structure_index = len(player_deck) - 1
    structure = player_deck.pop(structure_index)

    try:
        if player_action == Action.BUILD_STRUCTURE:
            player.build_structure(structure)
        elif player_action == Action.BUILD_WONDER_STAGE:
            player.build_wonder_stage()
        elif player_action == Action.DISCARD:
            player.discard_structure()
        # print("Player " + str(player_index) + " chose to " + player_action.name + " " + structure['name'])
    except ImpossibleBuildException:
        player.discard_structure()
        success_reward_modifier = -3

    self.finish_player_turn()

    observation = self.to_observation()
    reward = self.calculate_score_difference(
        player_index) + success_reward_modifier

    if self._episode_ended:
        # print("game terminated at age " + str(self.age) + " reward " + str(reward))
        return time_step.termination(observation, reward)
    else:
        # print("transition player " + str(self.current_player_index) + " action " + str(player_action.name)
        #       + " turn " + str(self.turn) + " age " + str(self.age) + " reward " + str(reward))
        return time_step.transition(observation, reward)
def Visualize(self, num_episodes=1):
    self._agent._training = False
    for _ in range(0, num_episodes):
        state = self._environment.reset()
        is_terminal = False
        while not is_terminal:
            action_step = self._agent._eval_policy.action(
                ts.transition(state, reward=0.0, discount=1.0))
            action_shape = action_step.action.shape
            expected_shape = self._agent._eval_policy.action_spec.shape
            action = action_step.action.numpy()
            if action_shape != expected_shape:
                logging.warning("Action shape " + str(action_shape) +
                                " does not match expected shape " + str(expected_shape) +
                                " -> attempting to reshape")
                action = np.reshape(action, expected_shape)
            logging.info(action)
            state, reward, is_terminal, _ = self._environment.step(action)
            self._environment.render()
def _step(self, action):
    trade_seq = np.argsort(np.floor(100 * action[N:, :])).reshape(N, N)
    trade_ratio = action[:N, :]

    # iterate over all possible buying/selling pairs
    for i in np.arange(0, N * N):
        # determine buyer, seller and amount to trade
        buyer = int(trade_seq / N)
        seller = trade_seq % N
        trade_ratio_current = trade_ratio[buyer, seller]
        # stock to be traded
        s_tbt = min(np.floor(self._state[self._step_counter, buyer] / D),
                    self._state[N + self._step_counter, seller])
        w_tbt = s_tbt * D
        self._state[self._step_counter, buyer] -= w_tbt
        self._state[N + self._step_counter, buyer] += s_tbt
        self._state[self._step_counter, seller] += w_tbt
        self._state[N + self._step_counter, seller] -= s_tbt

    if self._step_counter == 10:
        # The last action ended the episode. Ignore the current action and start
        # a new episode.
        self._episode_ended = True
        return self.reset()

    # Make sure episodes don't go on forever.
    if action == 1:
        self._episode_ended = True
    elif action == 0:
        new_card = np.random.randint(1, 11)
        self._state += new_card
    else:
        raise ValueError('`action` should be 0 or 1.')

    if self._episode_ended or self._state >= 21:
        reward = np.sum(self._state[N - 1, :]) + d * (1 + r) / r * np.sum(self._state[2 * N, :])
        return ts.termination(np.array([self._state], dtype=np.int32), reward)
    else:
        return ts.transition(
            np.array([self._state], dtype=np.int32), reward=0.0, discount=1.0)
def test_resets_after_limit(self):
    max_steps = 5
    base_env = mock.MagicMock()
    wrapped_env = atari_wrappers.AtariTimeLimit(base_env, max_steps)
    base_env.gym.game_over = False
    base_env.reset.return_value = ts.restart(1)
    base_env.step.return_value = ts.transition(2, 0)

    action = 1
    for _ in range(max_steps + 1):
        wrapped_env.step(action)

    self.assertTrue(wrapped_env.game_over)
    self.assertEqual(1, base_env.reset.call_count)

    wrapped_env.step(action)
    self.assertFalse(wrapped_env.game_over)
    self.assertEqual(2, base_env.reset.call_count)
def _success(self, action, action_letter):
    for index, l in enumerate(self.word_to_guess):
        if action == (ord(l) - 97):
            self._state[index] = action

    # 26 corresponds to a letter that has not been found yet.
    if 26 not in self._state:
        self._episode_ended = True
        logging.debug(f"You found {self.word_to_guess}")
        return ts.termination(
            np.array([self._state], dtype=np.int32),
            self.number_of_life * self.reward_map["game_success_reward"],
        )
    else:
        self.render()
        return ts.transition(
            np.array([self._state], dtype=np.int32),
            reward=self.reward_map["guess_success_reward"],
            discount=1.0,
        )