def testTerminationMultiRewards(self):
    observation = np.array([[-1], [-1]])
    reward = [np.array([[2.], [2.]]), np.array([[3., 3.], [4., 4.]])]
    time_step = ts.termination(observation, reward)

    self.assertItemsEqual([ts.StepType.LAST] * 2, time_step.step_type)
    self.assertItemsEqual(observation, time_step.observation)
    self.assertAllEqual(reward[0], time_step.reward[0])
    self.assertAllEqual(reward[1], time_step.reward[1])
    self.assertItemsEqual([0., 0.], time_step.discount)

    reward = np.array([[2., 2., 2.], [3., 3., 3.]])
    reward_spec = [array_spec.ArraySpec((3,), np.float32, 'multi_r')]
    outer_dims = nest_utils.get_outer_array_shape(reward, reward_spec)
    time_step_batch = ts.termination(observation, reward, outer_dims)

    # Check that passing outer_dims works
    self.assertItemsEqual([ts.StepType.LAST] * 2, time_step_batch.step_type)
    self.assertItemsEqual(observation, time_step_batch.observation)
    self.assertAllEqual(reward[0], time_step_batch.reward[0])
    self.assertAllEqual(reward[1], time_step_batch.reward[1])
    self.assertItemsEqual([0., 0.], time_step_batch.discount)

    # Check that it gets a different result with no outer_dims
    time_step_no_batch = ts.termination(observation, reward, outer_dims=[])
    self.assertEqual(ts.StepType.LAST, time_step_no_batch.step_type)
    self.assertItemsEqual(observation, time_step_no_batch.observation)
    self.assertAllEqual(reward[0], time_step_no_batch.reward[0])
    self.assertAllEqual(reward[1], time_step_no_batch.reward[1])
    self.assertEqual(0., time_step_no_batch.discount)
def _step(self, action):
    self._counter += 1
    if self._done:
        self.reset()

    if action == 0:
        if self._state[0] < 3:
            self._state[0] += 1
    elif action == 1:
        if self._state[0] > 0:
            self._state[0] -= 1
    elif action == 2:
        if self._state[1] < 10:
            self._state[1] += 1
    elif action == 3:
        if self._state[1] > 0:
            self._state[1] -= 1
    else:
        raise ValueError(f'Unrecognized action {action}')

    if self._counter >= 100:
        self._done = True
        return ts.termination(self._state, reward=-1)

    if self._state[0] == 0 and 1 <= self._state[1] <= 9:
        self._done = True
        return ts.termination(self._state, reward=-100)
    elif self._state[0] == 0 and self._state[1] == 10:
        self._done = True
        return ts.termination(self._state, reward=0)
    else:
        return ts.transition(self._state, reward=-1, discount=1.0)
def _step(self, action_index) -> ts.TimeStep:
    """Step the environment forward by taking the action.

    :param action_index: index of the action in the action mapper
    :return: TimeStep object after taking the action
    """
    if self._game_ended:
        return self.reset()

    # illegal action terminates the game, todo: remove
    if self._legal_action_mask[action_index] == 0:
        self._game_ended = True
        return ts.termination(self._get_observation(), GameEnv.REWARD_ILLEGAL_ACTION)

    # update grid data after the swap
    row1, col1, row2, col2 = self.action_mapper[action_index.item()]
    self._states[row1, col1, GameEnv.STEP], self._states[row2, col2, GameEnv.STEP] = \
        self._states[row2, col2, GameEnv.STEP] - 1, self._states[row1, col1, GameEnv.STEP] - 1
    self._states[row1, col1, GameEnv.TARGET], self._states[row2, col2, GameEnv.TARGET] = \
        self._states[row2, col2, GameEnv.TARGET], self._states[row1, col1, GameEnv.TARGET]
    self._update_legal_actions()
    self._update_complete_state(row1, col1, row2, col2)

    # check game state and reward
    state, reward = self._check_state()
    if state != GameEnv.STATE_ONGOING:
        self._game_ended = True
        return ts.termination(self._get_observation(), reward)

    return ts.transition(self._get_observation(), reward, discount=self._discount)
def _step(self, action):
    if self._episode_ended:
        # The last action ended the episode. Ignore the current action and start
        # a new episode.
        return self.reset()

    if self.current_time_step().is_first():
        # take turns until the AI player needs to take its turn
        self._clue.take_turns_until_player(self._ai_player)
        if self._clue.game_status == GameStatus.ENDED:
            # someone won, terminate
            self._episode_ended = True
            return ts.termination(self._state, self._calc_reward())

    # take the agent's turn
    self._turn(action)

    # take turns until the AI player needs to take its turn
    self._clue.take_turns_until_player(self._ai_player)
    if self._clue.game_status == GameStatus.ENDED:
        # someone won, terminate
        self._episode_ended = True
        return ts.termination(self._state, self._calc_reward())

    self._update_state()

    if self._clue.game_status == GameStatus.ENDED or self._tries == self._max_tries:
        # AI player won
        self._episode_ended = True
        return ts.termination(self._state, self._calc_reward())

    return ts.transition(self._state, reward=0.0, discount=1.0)
def _step(self, action):
    if keyboard.is_pressed('p'):
        sys.exit()

    # if reached max playtime or game is minimized or game is not running:
    # ignore action and return last timestep
    if (time.time() - self.start_time > self.max_playtime
            or not self.process_handler.is_running()
            or self.screen_handler.is_minimized()):
        return ts.termination(self._observe(), 0)

    self._action(action)
    reward = self._get_reward()
    if reward is None:
        return ts.termination(self._observe(), 0)

    # update training timestamps
    if reward > 0:
        self.last_progress_time = time.time()

    # if not progressing: stop session
    if time.time() - self.last_progress_time > self.score_timeout:
        return ts.termination(self._observe(), reward)

    return ts.transition(self._observe(), reward)
def _step(self, action: types.NestedArray) -> ts.TimeStep:
    if self._pasture_engine.goal_achieved:
        return self.reset()

    action = action.item()
    direction = self._action_values.get(action)
    was_move_legit = self._pasture_engine.move_shepherd(direction)
    if not was_move_legit:
        return ts.termination(self._pasture_engine.state(), -10)

    # UPDATE SHEEP HERE
    self._pasture_engine.sheep_controller.move_animals()

    # CALCULATE REWARD DEPENDING ON SHEEP POSITIONING
    gcm_diff = self._pasture_engine.calc_gcm_difference()
    gcm_diff_delta = abs(gcm_diff - self.past_gcm_diff)
    lower, upper = 0, 1
    reward = lower + (upper - lower) * gcm_diff_delta
    # print('reward: ' + str(reward))

    if self._pasture_engine.goal_achieved:
        return ts.termination(self._pasture_engine.state(), 20)

    self.past_gcm_diff = gcm_diff
    return ts.transition(self._pasture_engine.state(), reward=reward - 0.2, discount=1.0)
def _step(self, action):
    if self._game.game_ended():
        return self.reset()

    action = action.item()
    next_agent_position_direction = self._action_values.get(action)
    current_agent_position = np.where(self._game.game_state() == 1)[0].item()
    new_agent_position = current_agent_position + next_agent_position_direction

    response = self._game.move_dog(current_agent_position, new_agent_position)
    if response == ActionResult.GAME_COMPLETE:
        return timeStep.termination(self._game.game_state(), 10)
    elif response == ActionResult.ILLEGAL_MOVE:
        return timeStep.termination(self._game.game_state(), -0.3)
    elif response == ActionResult.FOUND_ROBOT:
        return timeStep.termination(self._game.game_state(), -0.3)
    elif response == ActionResult.FOUND_BONE:
        return timeStep.transition(self._game.game_state(), reward=1, discount=1.0)

    return timeStep.transition(self._game.game_state(), reward=-0.3, discount=1.0)
def _step(self, action):
    if self._episode_ended:
        return self.reset()

    self._step_count += 1
    obs, reward, terminal = self._game.step(action)
    obs = obs.flatten()

    if terminal:
        self._episode_ended = True

    self._reward_count += reward

    # Stop if we have gotten 1000 treats
    if self._reward_count >= 1000:
        return ts.termination(obs, reward)

    # Reset how long we have to live if we get a treat
    if reward != 0:
        self._step_count = 0

    if self._step_limit is not None and self._step_count > self._step_limit:
        self._episode_ended = True

    if self._episode_ended:
        return ts.termination(obs, reward)

    return ts.transition(obs, reward, discount=1.0)
def _step(self, action):
    if self._game_state.winner != -1:
        # The last action ended the episode. Ignore the current action and start
        # a new episode.
        return self.reset()

    action = int(action)
    player = self._game_state.players[self._game_state.player_to_move]
    if self._lose_on_illegal_move:
        if action not in player.hand:
            next_obs = self._get_observation()
            return ts.termination(next_obs, reward=-1.0)
    elif action not in player.hand:
        action = random.choice(list(player.hand.keys()))

    self._action_player_controller.set_action(action)
    game_over = self._game_state.step()
    while self._game_state.player_to_move != self._agent_player and game_over == -1:
        game_over = self._game_state.step()

    next_obs = self._get_observation()
    if game_over != -1:
        if game_over == 0:
            reward = 0.0
        elif game_over - 1 == self._agent_player:
            reward = 1.0
        elif game_over - 1 == 1 - self._agent_player:
            reward = -1.0
        else:
            assert False
        return ts.termination(next_obs, reward=reward)

    return ts.transition(next_obs, reward=0.0)
def _step(self, action): """Apply action and return new time_step.""" if self._hit_count == self._plane_size: self._episode_ended = True return self.reset() if self._strike_count + 1 == self._max_steps: self.reset() return ts.termination( np.array(self._visible_board, dtype=np.float32), UNFINISHED_GAME_REWARD) self._strike_count += 1 action_x = action // self._board_size action_y = action % self._board_size # Hit if self._hidden_board[action_x][ action_y] == HIDDEN_BOARD_CELL_OCCUPIED: # Non-repeat move if self._visible_board[action_x][ action_y] == VISIBLE_BOARD_CELL_UNTRIED: self._hit_count += 1 self._visible_board[action_x][ action_y] = VISIBLE_BOARD_CELL_HIT # Successful strike if self._hit_count == self._plane_size: # Game finished self._episode_ended = True return ts.termination( np.array(self._visible_board, dtype=np.float32), FINISHED_GAME_REWARD, ) else: self._episode_ended = False return ts.transition( np.array(self._visible_board, dtype=np.float32), HIT_REWARD, self._discount, ) # Repeat strike else: self._episode_ended = False return ts.transition( np.array(self._visible_board, dtype=np.float32), REPEAT_STRIKE_REWARD, self._discount, ) # Miss else: # Unsuccessful strike self._episode_ended = False self._visible_board[action_x][action_y] = VISIBLE_BOARD_CELL_MISS return ts.transition( np.array(self._visible_board, dtype=np.float32), MISS_REWARD, self._discount, )
def _step(self, action):
    self.time_stamp += 1
    if self.time_stamp >= self.max_stamp:
        self._episode_ended = True
    else:
        self._episode_ended = False

    if self._episode_ended:
        self.max_fidelity = 0
        return ts.termination(np.array([0, 0, 0, 0, 0, 0, 0], dtype=np.float32), 0)

    H, c_ops = self.get_hamiltonian(action)

    # newly added
    c_ops = []
    # print(self.gamma)
    if self.gamma[0] > 0.0:
        H += np.sqrt(self.gamma[0]) * sigmam()
    if self.gamma[1] > 0.0:
        H += np.sqrt(self.gamma[1]) * sigmaz()
    ######

    t = np.arange(0, self.get_interval_width(), .01)
    transition_state = mesolve(H, self._state, t, c_ops=c_ops)
    self._state = transition_state.states[-1]
    new_fidelity = self.get_transition_fidelity(self._state)

    # reward = 2*new_fidelity - self.fidelity - self.max_fidelity
    # reward = reward if reward > 0 else 0
    reward = new_fidelity - self.fidelity
    self.fidelity = new_fidelity
    self.max_fidelity = new_fidelity if new_fidelity > self.max_fidelity else self.max_fidelity

    observation = [self.fidelity,
                   expect(sigmax(), self._state),
                   expect(sigmay(), self._state),
                   expect(sigmaz(), self._state),
                   reward,
                   action[0],
                   action[1]]

    if self.fidelity > 0.9995 or self.time_stamp >= self.max_stamp:
        self._episode_ended = True
        self.max_fidelity = 0
        self.fidelity = 0
        # self.gamma = [random.uniform(0, 1), random.uniform(0, 1)]
        return ts.termination(np.array(observation, dtype=np.float32), reward=reward)
    else:
        return ts.transition(
            np.array(observation, dtype=np.float32), reward=reward, discount=0.9)
def _step(self, action): """Progress the agent one step in the environment. The agent moves until the reward is decreasing. The number of sequences that can be evaluated at each episode is capped to `self.max_num_steps`. """ # if we've exceeded the maximum number of steps, terminate if self.num_steps >= self.max_num_steps: return ts.termination(self._state, 0) # `action` is an integer representing which residue to mutate to 1 # along the flattened one-hot representation of the sequence pos = action // len(self.alphabet) res = action % len(self.alphabet) self.num_steps += 1 # if we are trying to modify the sequence with a no-op, then stop if self._state["sequence"][pos, res] == 1: return ts.termination(self._state, 0) self._state["sequence"][pos] = 0 self._state["sequence"][pos, res] = 1 state_string = s_utils.one_hot_to_string(self._state["sequence"], self.alphabet) if self.fitness_model_is_gt: self._state["fitness"] = self.landscape.get_fitness( [state_string]).astype(np.float32) else: self._state["fitness"] = self.model.get_fitness( [state_string]).astype(np.float32) self.all_seqs[state_string] = self._state["fitness"].item() reward = self._state["fitness"].item( ) - self.lam * self.sequence_density(state_string) # if we have seen the sequence this episode, # terminate episode and punish # (to prevent going in loops) if state_string in self.episode_seqs: return ts.termination(self._state, -1) self.episode_seqs.add(state_string) # if the reward is not increasing, then terminate if reward < self.previous_fitness: return ts.termination(self._state, reward=reward) self.previous_fitness = reward return ts.transition(self._state, reward=reward)
def _step(self, action):
    if self._episode_ended:
        # The last action ended the episode. Ignore the current action and start
        # a new episode.
        return self.reset()

    row = self.find_row(action)

    # Make sure it is a valid move
    if row is None:
        self._episode_ended = True
        return ts.termination(np.array(self._state1, dtype=np.int32), reward=-10)

    # Update the boards
    win, full = self.update(self.IDENTIFIER, self.OPPOSITE_IDENTIFIER, action, row)
    if win:
        return ts.termination(np.array(self._state1, dtype=np.int32), reward=1)
    elif full:
        return ts.termination(np.array(self._state1, dtype=np.int32), reward=0.5)

    # Make a second action
    agent2_time_step = self._tf_time_step()
    if self.agent2_policy is None:
        raise Exception('Error: agent2 does not exist')
    action = self.agent2_policy.action(agent2_time_step).action

    # Update the boards
    win, full = self.update(self.OPPOSITE_IDENTIFIER, self.IDENTIFIER, action)
    if win:
        return ts.termination(np.array(self._state1, dtype=np.int32), reward=-1)
    elif full:
        return ts.termination(np.array(self._state1, dtype=np.int32), reward=0.5)

    time_step = ts.transition(np.array(self._state1, dtype=np.int32), reward=0.01, discount=1.0)
    # print("temp:", temp_time_step)
    # print("tf_temp:", tf_temp_time_step)
    # print("real:", time_step)
    return time_step
def _step(self, action):
    if self._episode_ended:
        return self.reset()

    if action == 1:
        self._n_queries += 1
        self.reward = -0.1
    elif action == 0:
        # do not query
        pass
    else:
        raise ValueError('action should be 0 or 1.')

    image = self.dm.get_x_unl(self._counter)  # !TODO change to iterable object

    # prepare next observation (aka next state)
    final_output, intermediate_output = self.aux_model.predict(np.array([image]))
    self._state = np.append(intermediate_output, final_output)
    print('STATE:', self._state.shape)
    self._counter += 1

    # end of episode
    if self._n_queries >= MAX_QUERIES or self._counter >= len(self.dm.x_unl):
        self.reward = 1.0  # !TODO
        return ts.termination(np.array(self._state, dtype=np.int32), self.reward)
    else:
        self.reward = 0.5
        return ts.transition(np.array(self._state, dtype=np.float32),
                             reward=self.reward, discount=1.0)
def _step(self, action_input):
    if self._reset_next_step:
        return self.reset()

    action = action_input.copy()
    self._curr_step += 1

    new_state = deepcopy(self._state)
    noise = np.dot(self._ValFct.alphaAtZ(self._wealth),
                   np.random.normal(scale=1. / 252, size=self._dRiskyAsset)) / (self._wealth / self._scale)
    new_state += noise
    new_state += action

    if not array_spec.check_arrays_nest(new_state, self._observation_spec):
        reward_step = -2
    else:
        self._state = new_state
        reward_step = self._reward_fn(action * self._wealth / self._scale)

    self._avg_reward += self._learningRate_AR * (reward_step - self._avg_reward)
    reward_step -= self._avg_reward
    reward_step = np.clip(self._reward_scale * reward_step, -1., 1.)

    if self._curr_step >= self._Nsteps:
        self._reset_next_step = True
        print('EPISODE AVERAGE REWARD: ', self._avg_reward)
        return ts.termination(observation=self._state, reward=reward_step)

    return ts.transition(observation=self._state, reward=reward_step, discount=1.)
def _step(self, action):
    '''Step function, can be called with env._step()'''
    # TODO: complete implementation of flood logic
    if self._episode_ended:
        # The last action ended the episode. Ignore the current action and start
        # a new episode.
        return self.reset()

    # Make sure episodes don't go on forever.
    self._state.current_color = action
    self._state.owned_blocks += np.random.randint(
        1, self._state.width * self._state.height - self._state.owned_blocks + 1)
    self._state.num_turns += 1

    if self._episode_ended or self._state.owned_blocks >= self._state.height * self._state.width:
        reward = -1 * self._state.num_turns
        return ts.termination(
            np.array(self._state.state_array, dtype=np.int32), reward)
    else:
        return ts.transition(
            np.array(self._state.state_array, dtype=np.int32), reward=0.0, discount=1)
def _step(self, actions): """Progress the agent one step in the environment.""" actions = actions.flatten() self.states[:, self.partial_seq_len, -1] = 0 self.states[np.arange(self._batch_size), self.partial_seq_len, actions] = 1 self.partial_seq_len += 1 # We have not generated the last residue in the sequence, so continue if self.partial_seq_len < self.seq_length - 1: return nest_utils.stack_nested_arrays( [ts.transition(seq_state, 0) for seq_state in self.states]) # If sequence is of full length, score the sequence and end the episode # We need to take off the column in the matrix (-1) representing the mask token complete_sequences = [ s_utils.one_hot_to_string(seq_state[:, :-1], self.alphabet) for seq_state in self.states ] if self.fitness_model_is_gt: fitnesses = self.landscape.get_fitness(complete_sequences) else: fitnesses = self.model.get_fitness(complete_sequences) self.all_seqs.update(zip(complete_sequences, fitnesses)) # Reward = fitness - lambda * sequence density rewards = np.array([ f - self.lam * self.sequence_density(seq) for seq, f in zip(complete_sequences, fitnesses) ]) return nest_utils.stack_nested_arrays([ ts.termination(seq_state, r) for seq_state, r in zip(self.states, rewards) ])
def _step(self, action):
    if self._episode_ended:
        return self._reset()

    reward = 0
    if not (action == 1 or action == 0):
        raise ValueError('`action` should be 0 or 1.')

    self._state += 1
    self.observation += 1

    if action == 0:
        reward = 4 / self._time_between_squiggles_beats
    elif action == 1:
        if (self._state - 1) % self._time_between_squiggles_beats == 0:
            reward = 10
            if self.observation[0] == 1:
                reward = 5
        else:
            reward = -4 / self._time_between_squiggles_beats

    if action == 1:
        # Clicking agent action to discrete squiggle hearing
        current_closeness_to_real_beat = self._state % self._time_between_squiggles_beats
        closest_beat = self._state - current_closeness_to_real_beat
        current_closeness_to_real_beat -= self._time_between_squiggles_beats
        current_i = int((closest_beat / self._time_between_squiggles_beats) % 16)
        self._squiggles_input[current_i] = 1

    # Is it time for squiggles to get updated?
    if self._state % (self._time_between_squiggles_beats * 16) == 0:
        self._squiggles_list[0].update_h(self._squiggles_input)
        for i in range(1, len(self._squiggles_list)):
            self._squiggles_list[i - 1].update_o()
            out_in = self._squiggles_list[i - 1].o
            self._squiggles_list[i].update_h(out_in)
        self._squiggles_list[-1].update_o()
        # print(self._squiggles_input)
        self._squiggles_input = [0 for i in range(16)]

    play = False
    if self._state % self._time_between_squiggles_beats == 0 and \
            self._squiggles_list[-1].o[int((self._state / self._time_between_squiggles_beats) % 16)] == 1:
        play = True

    # Is it time for squig to play, and did it?
    if self._state % self._time_between_squiggles_beats == 0 and \
            self._squiggles_list[-1].o[int((self._state / self._time_between_squiggles_beats) % 16)] == 1:
        self.observation[1:] = self.observation[:-1]
        self.observation[0] = 0

    if self._state >= self._states_until_termination:
        self._episode_ended = True
        return ts.termination(self.observation, reward)

    return ts.transition(self.observation, reward=reward, discount=0.9)
def testAutomaticReset(self):
    observation_spec = tensor_spec.TensorSpec([1], tf.float32)
    action_spec = tensor_spec.TensorSpec([1], tf.float32)
    policy_state_spec = tensor_spec.TensorSpec([1], tf.float32)
    time_step_spec = ts.time_step_spec(observation_spec)

    policy = TfPassThroughPolicy(
        time_step_spec,
        action_spec,
        policy_state_spec=policy_state_spec,
        automatic_state_reset=True)

    observation = tf.constant(1, dtype=tf.float32, shape=(1, 1))
    reward = tf.constant(1, dtype=tf.float32, shape=(1,))

    time_step = tf.nest.map_structure(lambda *t: tf.concat(t, axis=0),
                                      ts.restart(observation, batch_size=1),
                                      ts.transition(observation, reward),
                                      ts.termination(observation, reward))

    state = self.evaluate(
        policy.action(time_step, policy_state=policy.get_initial_state(3) + 1).state)
    self.assertEqual(0, state[0])
    self.assertEqual(1, state[1])
    self.assertEqual(1, state[2])

    state = self.evaluate(
        policy.distribution(time_step, policy_state=policy.get_initial_state(3) + 1).state)
    self.assertEqual(0, state[0])
    self.assertEqual(1, state[1])
    self.assertEqual(1, state[2])
def _success(self, action):
    for index, l in enumerate(self.word_to_guess):
        if action == (ord(l) - 97):
            self._state[index] = action

    legal_moves = self._format_legal_moves(self.guessed_letters)
    observations_and_legal_moves = {
        'observations': np.array([self._state], dtype=np.float32),
        'legal_moves': legal_moves
    }

    # 26 corresponds to "not found"
    if 26 not in self._state:
        self._episode_ended = True
        logging.debug(f"You Found {self.word_to_guess}")
        print(f"You Found {self.word_to_guess}")
        return ts.termination(
            observations_and_legal_moves,
            self.number_of_life * self.reward_map["game_success_reward"],
        )
    else:
        self.render()
        return ts.transition(
            observations_and_legal_moves,
            reward=self.reward_map["guess_success_reward"],
            discount=1.0,
        )
def _step(self, action):
    try:
        prices = self.productsCosts * action[0]
        observation = np.round(
            (self.placeSize * self.productsUsualBuyingRates) *
            (self.productsPriceFlexibility**(
                (self.productsUsualPrices - prices) / self.productsUsualPrices)))
        marginPerProduct = (prices - self.productsCosts) * observation
        reward = marginPerProduct.sum()
        if self._state < 10:
            reward *= 1 + self._state * 0.1

        # convert to numpy array of float32, otherwise not accepted by specs
        observation = np.array(np.stack(
            (observation, np.zeros((self.size), dtype=np.float32))),
            dtype=np.float32)

        if self._state < self.duration:
            self._state += 1
            return ts.transition(observation, reward)
        else:
            return ts.termination(observation, reward)
    except Exception as e:
        logging.error(e)
        # re-raise the original error (raising a plain string is invalid in Python 3)
        raise
def _get_ts_invalid_move(self, move_result: ResultOfAMove):
    reward = -20
    self._episode_ended = True
    return_ts = ts.termination(self._get_observation(self._player_id), reward=reward)
    return return_ts
def _step(self, action):
    # Automatically reset the environments on step if they need to be reset.
    if self._handle_auto_reset and self._done:
        return self.reset()

    # Some environments (e.g. FrozenLake) use the action as a key to the
    # transition probability so it has to be hashable. In the case of discrete
    # actions we have a numpy scalar (e.g array(2)) which is not hashable
    # in this case, we simply pull out the scalar value which will be hashable.
    try:
        action = action.item() if self._action_is_discrete else action
    except AttributeError:
        action = action[0]  # Remove ListWrapper for single-agent compatibility

    observation, reward, self._done, self._info = self._gym_env.step(action)

    if self._match_obs_space_dtype:
        observation = self._to_obs_space_dtype(observation)

    reward = np.asarray(reward, dtype=self.reward_spec().dtype)
    outer_dims = nest_utils.get_outer_array_shape(reward, self.reward_spec())

    if self._done:
        return ts_lib.termination(observation, reward, outer_dims=outer_dims)
    else:
        return ts_lib.transition(observation, reward, self._discount, outer_dims=outer_dims)
def _step(self, action):
    # Automatically reset the environments on step if they need to be reset.
    if self._auto_reset and self._done:
        return self.reset_agent()

    action = action.item() if self._action_is_discrete else action
    observation, reward, self._done, self._info = self._gym_env.step(action)

    if self._match_obs_space_dtype:
        observation = self._to_obs_space_dtype(observation)

    reward = np.asarray(reward, dtype=self.reward_spec().dtype)
    outer_dims = nest_utils.get_outer_array_shape(reward, self.reward_spec())

    if self._done:
        return ts_lib.termination(observation, reward, outer_dims=outer_dims)
    else:
        return ts_lib.transition(observation, reward, self._discount, outer_dims=outer_dims)
def _step(self, action):
    try:
        if action.shape != self.productsCosts.shape:
            # action shape doesn't match action_spec; it matches observation_spec.
            # Bug or usual for SAC agent?
            # print(action.shape)
            action = np.sum(action, axis=0)

        prices = self.productsCosts * action
        quantities = np.round(
            (self.placeSize * self.productsUsualBuyingRates) *
            (self.productsPriceFlexibility**(
                (self.productsUsualPrices - prices) / self.productsUsualPrices)))
        marginPerProduct = (prices - self.productsCosts) * quantities
        reward = marginPerProduct.sum()

        observation = np.stack(
            (self.productsCosts, self.productsUsualMarginRates,
             self.productsUsualBuyingRates, self.productsUsualPrices,
             self.productsPriceFlexibility, quantities))
        # convert to numpy array of float32, otherwise not accepted by specs
        observation = np.array(observation, dtype=np.float32)

        if self._state < self.duration:
            self._state += 1
            return ts.transition(observation, reward)
        else:
            return ts.termination(observation, reward)
    except Exception:
        # re-raise instead of masking the error with a ZeroDivisionError
        raise
def _step(self, action):
    if self.isEgoViolatedTraffic():
        # The last action ended the episode. Ignore the current action and start a new episode.
        return self._reset()

    self._send_command(action)
    # needed for ros msg to sync.
    self.rate.sleep()
    self.ego.step()

    if action[0] <= 0.5 or self.isEgoViolatedTraffic():
        reward = -1
        self.total_reward = self.total_reward + reward
    else:
        reward = action[0] / self._action_spec.maximum[0] - abs(self.last_steering - action[1])
        reward = reward / 1000.0
        self.last_steering = action[1]
        self.total_reward = self.total_reward + reward

    # return transition depending on game state
    # self.render()
    if self.isEgoViolatedTraffic():
        return time_step.termination(self.data_cam_front_rgb, reward)
    else:
        return time_step.transition(self.data_cam_front_rgb, reward)
def _step(self, action):
    if self._episode_ended:
        # The last action ended the episode. Ignore the current action and start
        # a new episode.
        return self.reset()

    # check if the move is valid and reward accordingly
    if 0 <= action <= 8:
        if self.isSpotEmpty(action):
            self._grid = self.mark
            reward = self.calcReward()
            print("agent goes in spot {} for reward {}".format(action, reward))
        else:
            # punish for picking a spot that has already been picked
            reward = -10
            self._episode_ended = True
    else:
        raise ValueError('`action` should be 0 - 8.')

    if not self.isGridFull():
        self.takeOppTurn()
    else:
        self._episode_ended = True

    if self._episode_ended:
        return ts.termination(self._grid, reward)
    else:
        return ts.transition(self._grid, reward=2, discount=1.0)
def _step(self, action):
    if self._episode_ended:
        return self._reset()

    if self.sim_time < 4.05 and self.sim_time > 4.03:
        reward = -np.square(self._state[0]) * 0.001
        self._episode_ended = True
    else:
        self._state, self.sim_time, self.s_road = Server(self.tcp_port).server_step(action)
        if self._state is None:
            return self._reset()
        reward = (self.s_road - self.s_road_old)
        # reward = self._state[0] * np.cos(self._state[3])
        self.s_road_old = self.s_road

    if self.time_counter > self.max_EpSteps:
        self._episode_ended = True
    self.time_counter += 1

    if self._episode_ended:
        return ts.termination(self._state, reward)
    else:
        return ts.transition(self._state, reward, discount=self.gamma)
def _step(self, action):
    if self._action_spec:
        nest_utils.assert_same_structure(self._action_spec, action)

    self._num_steps += 1

    observation = self._get_observation()
    if self._num_steps < self._min_duration:
        self._done = False
    elif self._max_duration and self._num_steps >= self._max_duration:
        self._done = True
    else:
        self._done = self._rng.uniform() < self._episode_end_probability

    if self._done:
        reward = self._reward_fn(ts.StepType.LAST, action, observation)
        self._check_reward_shape(reward)
        time_step = ts.termination(observation, reward)
        self._num_steps = 0
    else:
        reward = self._reward_fn(ts.StepType.MID, action, observation)
        self._check_reward_shape(reward)
        time_step = ts.transition(observation, reward, self._discount)

    return time_step
def _step(self, action: types.NestedArray) -> ts.TimeStep:
    if self._current_time_step.is_last():
        return self.reset()

    state_obs, rewards, _ = self._env.step(action)
    done = self._is_done()

    self._cumulative_rewards += rewards
    self._obs_stacker.add_observation(state_obs / 255)  # Normalizing obs in range [0, 1]
    stacked_obs = self._obs_stacker.get_observation_stack()

    obs = {
        'state_obs': stacked_obs,
        'utility_representation': self._utility_func.agent_utility_repr,
    }
    if self._cumulative_rewards_flag:
        obs['cumulative_rewards'] = self._cumulative_rewards

    # The scalar reward on which to train is equal to the delta in the utility between the
    # previous time step and the current one.
    current_utility = self._utility_func(self._cumulative_rewards)
    reward = current_utility - self._prev_step_utility
    self._prev_step_utility = current_utility

    if done:
        return ts.termination(obs, reward)
    else:
        return ts.transition(obs, reward, self.gamma)