Example #1
  def testTerminationMultiRewards(self):
    observation = np.array([[-1], [-1]])
    reward = [np.array([[2.], [2.]]),
              np.array([[3., 3.], [4., 4.]])]
    time_step = ts.termination(observation, reward)

    self.assertItemsEqual([ts.StepType.LAST] * 2, time_step.step_type)
    self.assertItemsEqual(observation, time_step.observation)
    self.assertAllEqual(reward[0], time_step.reward[0])
    self.assertAllEqual(reward[1], time_step.reward[1])
    self.assertItemsEqual([0., 0.], time_step.discount)

    reward = np.array([[2., 2., 2.], [3., 3., 3.]])
    reward_spec = [array_spec.ArraySpec((3,), np.float32, 'multi_r')]
    outer_dims = nest_utils.get_outer_array_shape(reward, reward_spec)
    time_step_batch = ts.termination(observation, reward, outer_dims)

    # Check that passing outer_dims works
    self.assertItemsEqual([ts.StepType.LAST] * 2, time_step_batch.step_type)
    self.assertItemsEqual(observation, time_step_batch.observation)
    self.assertAllEqual(reward[0], time_step_batch.reward[0])
    self.assertAllEqual(reward[1], time_step_batch.reward[1])
    self.assertItemsEqual([0., 0.], time_step_batch.discount)

    # Check that it gets a different result with no outer_dims
    time_step_no_batch = ts.termination(observation, reward, outer_dims=[])
    self.assertEqual(ts.StepType.LAST, time_step_no_batch.step_type)
    self.assertItemsEqual(observation, time_step_no_batch.observation)
    self.assertAllEqual(reward[0], time_step_no_batch.reward[0])
    self.assertAllEqual(reward[1], time_step_no_batch.reward[1])
    self.assertEqual(0., time_step_no_batch.discount)
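Throughout the examples below, ts.termination and ts.transition simply build TimeStep namedtuples (step_type, reward, discount, observation). A minimal sketch of the difference between the two, assuming only NumPy and tf_agents.trajectories.time_step are available:

import numpy as np
from tf_agents.trajectories import time_step as ts

obs = np.array([0.0, 1.0], dtype=np.float32)

mid = ts.transition(obs, reward=1.0)     # StepType.MID (1); discount defaults to 1.0
last = ts.termination(obs, reward=-1.0)  # StepType.LAST (2); discount is forced to 0.0

print(mid.step_type, mid.discount)    # 1 1.0
print(last.step_type, last.discount)  # 2 0.0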
Example #2
    def _step(self, action):
        self._counter += 1
        if self._done:
            return self.reset()

        if action == 0:
            if self._state[0] < 3:
                self._state[0] += 1
        elif action == 1:
            if self._state[0] > 0:
                self._state[0] -= 1
        elif action == 2:
            if self._state[1] < 10:
                self._state[1] += 1
        elif action == 3:
            if self._state[1] > 0:
                self._state[1] -= 1
        else:
            raise ValueError(f'Unrecognized action {action}')

        if self._counter >= 100:
            self._done = True
            return ts.termination(self._state, reward=-1)

        if self._state[0] == 0 and 1 <= self._state[1] <= 9:
            self._done = True
            return ts.termination(self._state, reward=-100)
        elif self._state[0] == 0 and self._state[1] == 10:
            self._done = True
            return ts.termination(self._state, reward=0)
        else:
            return ts.transition(self._state, reward=-1, discount=1.0)
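The self._done guard at the top of this _step is the usual TF-Agents pattern; the examples on this page only show _step, so the rest of the environment is implied. A minimal, hypothetical skeleton of the surrounding PyEnvironment (the specs, _reset, and toy step logic below are assumptions, not part of the original project):

import numpy as np
from tf_agents.environments import py_environment
from tf_agents.specs import array_spec
from tf_agents.trajectories import time_step as ts


class MinimalGridEnv(py_environment.PyEnvironment):
    """Hypothetical skeleton pairing a _step like the one above with _reset and specs."""

    def action_spec(self):
        return array_spec.BoundedArraySpec((), np.int32, minimum=0, maximum=3)

    def observation_spec(self):
        return array_spec.ArraySpec((2,), np.int32)

    def _reset(self):
        self._counter = 0
        self._done = False
        self._state = np.array([0, 5], dtype=np.int32)  # assumed starting state
        return ts.restart(self._state)

    def _step(self, action):
        if self._done:
            return self.reset()
        self._counter += 1
        if self._counter >= 3:  # toy cutoff in place of the real game logic
            self._done = True
            return ts.termination(self._state, reward=-1)
        return ts.transition(self._state, reward=-1, discount=1.0)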
Example #3
    def _step(self, action_index) -> ts.TimeStep:
        """
        Step the environment forward by applying the given action.
        :param action_index: index of the action in the action mapper
        :return: TimeStep object after taking the action
        """
        if self._game_ended:
            return self.reset()

        # illegal action terminates the game, todo: remove
        if self._legal_action_mask[action_index] == 0:
            self._game_ended = True
            return ts.termination(self._get_observation(),
                                  GameEnv.REWARD_ILLEGAL_ACTION)

        # update grid data after the swap
        row1, col1, row2, col2 = self.action_mapper[action_index.item()]
        self._states[row1, col1, GameEnv.STEP], self._states[row2, col2, GameEnv.STEP] = \
            self._states[row2, col2, GameEnv.STEP] - 1, self._states[row1, col1, GameEnv.STEP] - 1
        self._states[row1, col1, GameEnv.TARGET], self._states[row2, col2, GameEnv.TARGET] = \
            self._states[row2, col2, GameEnv.TARGET], self._states[row1, col1, GameEnv.TARGET]

        self._update_legal_actions()
        self._update_complete_state(row1, col1, row2, col2)

        # check game state and reward
        state, reward = self._check_state()
        if state != GameEnv.STATE_ONGOING:
            self._game_ended = True
            return ts.termination(self._get_observation(), reward)

        return ts.transition(self._get_observation(),
                             reward,
                             discount=self._discount)
Example #4
    def _step(self, action):
        if self._episode_ended:
            # The last action ended the episode. Ignore the current action and start
            # a new episode.
            return self.reset()               

        if self.current_time_step().is_first():
            # take turns until it is the AI player's turn
            self._clue.take_turns_until_player(self._ai_player)
        
            if self._clue.game_status == GameStatus.ENDED:
                # someone won, terminate
                self._episode_ended = True
                return ts.termination(self._state, self._calc_reward())
        
        # take agent turn
        self._turn(action)

        # take turns until it is the AI player's turn
        self._clue.take_turns_until_player(self._ai_player)

        if self._clue.game_status == GameStatus.ENDED:
            # someone won, terminate
            self._episode_ended = True
            return ts.termination(self._state, self._calc_reward())

        self._update_state()

        if self._clue.game_status == GameStatus.ENDED or self._tries == self._max_tries:
            # game ended or max tries reached; terminate
            self._episode_ended = True
            return ts.termination(self._state, self._calc_reward())

        return ts.transition(self._state, reward=0.0, discount=1.0)
Example #5
    def _step(self, action):
        if keyboard.is_pressed('p'):
            sys.exit()

        # if reached max playtime or game is minimized or game is not running: ignore action and return last timestep
        if (time.time() - self.start_time > self.max_playtime
                or not self.process_handler.is_running()
                or self.screen_handler.is_minimized()):
            return ts.termination(self._observe(), 0)

        self._action(action)

        reward = self._get_reward()
        if reward is None:
            return ts.termination(self._observe(), 0)

        # update training timestamps
        if reward > 0:
            self.last_progress_time = time.time()

        # if not progressing: stop session
        if time.time() - self.last_progress_time > self.score_timeout:
            return ts.termination(self._observe(), reward)

        return ts.transition(self._observe(), reward)
Example #6
    def _step(self, action: types.NestedArray) -> ts.TimeStep:
        if self._pasture_engine.goal_achieved:
            return self.reset()

        action = action.item()
        direction = self._action_values.get(action)

        was_move_legit = self._pasture_engine.move_shepherd(direction)

        if not was_move_legit:
            return ts.termination(self._pasture_engine.state(), -10)

        # UPDATE SHEEP HERE
        self._pasture_engine.sheep_controller.move_animals()

        # CALCULATE REWARD DEPENDING ON SHEEP POSITIONING
        gcm_diff = self._pasture_engine.calc_gcm_difference()
        gcm_diff_delta = abs(gcm_diff - self.past_gcm_diff)
        lower, upper = 0, 1
        reward = lower + (upper - lower) * gcm_diff_delta
        #print('reward: ' + str(reward))
        if self._pasture_engine.goal_achieved:
            return ts.termination(self._pasture_engine.state(), 20)

        self.past_gcm_diff = gcm_diff
        return ts.transition(self._pasture_engine.state(),
                             reward=reward - 0.2,
                             discount=1.0)
Example #7
    def _step(self, action):    

        if self._game.game_ended():
            return self.reset()

        action = action.item()

        next_agent_position_direction = self._action_values.get(action)
        current_agent_position = np.where(self._game.game_state() == 1)[0].item()
        new_agent_position = current_agent_position + next_agent_position_direction

        response = self._game.move_dog(current_agent_position, new_agent_position)

        if response == ActionResult.GAME_COMPLETE:
            return timeStep.termination(self._game.game_state(), 10)

        elif response == ActionResult.ILLEGAL_MOVE:
            return timeStep.termination(self._game.game_state(), -0.3)

        elif response == ActionResult.FOUND_ROBOT:
            return timeStep.termination(self._game.game_state(), -0.3)

        elif response == ActionResult.FOUND_BONE:
            return timeStep.transition(self._game.game_state(), reward=1, discount=1.0)

        return timeStep.transition(self._game.game_state(), reward=-0.3, discount=1.0)
Example #8
    def _step(self, action):
        if self._episode_ended:
            return self.reset()

        self._step_count += 1
        obs, reward, terminal = self._game.step(action)
        obs = obs.flatten()

        if terminal:
            self._episode_ended = True

        self._reward_count += reward
        # Stop if we have gotten 1000 treats
        if self._reward_count >= 1000:
            return ts.termination(obs, reward)

        # Reset how long we have to live if we get a treat
        if reward != 0:
            self._step_count = 0

        if self._step_limit is not None and self._step_count > self._step_limit:
            self._episode_ended = True

        if self._episode_ended:
            return ts.termination(obs, reward)

        return ts.transition(obs, reward, discount=1.0)
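Example #8 ends step-limited episodes with ts.termination, which zeroes the discount. TF-Agents also provides ts.truncation for episodes cut off purely by a time limit, where the discount is kept so value estimates can still bootstrap past the cutoff; a minimal sketch of the difference (an aside, not part of the snippet above):

import numpy as np
from tf_agents.trajectories import time_step as ts

obs = np.zeros(4, dtype=np.float32)

ended_by_goal = ts.termination(obs, reward=1.0)    # StepType.LAST, discount == 0.0
ended_by_timeout = ts.truncation(obs, reward=1.0)  # StepType.LAST, discount stays at 1.0

print(ended_by_goal.discount, ended_by_timeout.discount)  # 0.0 1.0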
Example #9
    def _step(self, action):
        if self._game_state.winner != -1:
            # The last action ended the episode. Ignore the current action and start
            # a new episode.
            return self.reset()

        action = int(action)
        player = self._game_state.players[self._game_state.player_to_move]

        if self._lose_on_illegal_move:
            if action not in player.hand:
                next_obs = self._get_observation()
                return ts.termination(next_obs, reward=-1.0)
        elif action not in player.hand:
            action = random.choice(list(player.hand.keys()))

        self._action_player_controller.set_action(action)
        game_over = self._game_state.step()
        while self._game_state.player_to_move != self._agent_player and game_over == -1:
            game_over = self._game_state.step()
        next_obs = self._get_observation()
        if game_over != -1:
            if game_over == 0:
                reward = 0.0
            elif game_over - 1 == self._agent_player:
                reward = 1.0
            elif game_over - 1 == 1 - self._agent_player:
                reward = -1.0
            else:
                assert False
            return ts.termination(next_obs, reward=reward)

        return ts.transition(next_obs, reward=0.0)
Example #10
    def _step(self, action):
        """Apply action and return new time_step."""
        if self._hit_count == self._plane_size:
            self._episode_ended = True
            return self.reset()

        if self._strike_count + 1 == self._max_steps:
            self.reset()
            return ts.termination(
                np.array(self._visible_board, dtype=np.float32),
                UNFINISHED_GAME_REWARD)

        self._strike_count += 1
        action_x = action // self._board_size
        action_y = action % self._board_size
        # Hit
        if self._hidden_board[action_x][
                action_y] == HIDDEN_BOARD_CELL_OCCUPIED:
            # Non-repeat move
            if self._visible_board[action_x][
                    action_y] == VISIBLE_BOARD_CELL_UNTRIED:
                self._hit_count += 1
                self._visible_board[action_x][
                    action_y] = VISIBLE_BOARD_CELL_HIT
                # Successful strike
                if self._hit_count == self._plane_size:
                    # Game finished
                    self._episode_ended = True
                    return ts.termination(
                        np.array(self._visible_board, dtype=np.float32),
                        FINISHED_GAME_REWARD,
                    )
                else:
                    self._episode_ended = False
                    return ts.transition(
                        np.array(self._visible_board, dtype=np.float32),
                        HIT_REWARD,
                        self._discount,
                    )
            # Repeat strike
            else:
                self._episode_ended = False
                return ts.transition(
                    np.array(self._visible_board, dtype=np.float32),
                    REPEAT_STRIKE_REWARD,
                    self._discount,
                )
        # Miss
        else:
            # Unsuccessful strike
            self._episode_ended = False
            self._visible_board[action_x][action_y] = VISIBLE_BOARD_CELL_MISS
            return ts.transition(
                np.array(self._visible_board, dtype=np.float32),
                MISS_REWARD,
                self._discount,
            )
Example #11
    def _step(self, action):
        self.time_stamp += 1
        if self.time_stamp >= self.max_stamp:
            self._episode_ended = True
        else:
            self._episode_ended = False

        if self._episode_ended:
            self.max_fidelity = 0
            return ts.termination(np.array([0, 0, 0, 0, 0, 0, 0], dtype=np.float32), 0)

        H, c_ops = self.get_hamiltonian(action)

        # newly added

        c_ops = []
        # print(self.gamma)
        if self.gamma[0] > 0.0:
            H += np.sqrt(self.gamma[0]) * sigmam()

        if self.gamma[1] > 0.0:
            H += np.sqrt(self.gamma[1]) * sigmaz()

        ######

        t = np.arange(0, self.get_interval_width(), .01)
        transition_state = mesolve(H, self._state, t, c_ops=c_ops)
        self._state = transition_state.states[-1]
        new_fidelity = self.get_transition_fidelity(self._state)

        #reward = 2*new_fidelity - self.fidelity - self.max_fidelity
        #reward = reward if reward > 0 else 0
        reward = new_fidelity - self.fidelity

        self.fidelity = new_fidelity
        self.max_fidelity = new_fidelity if new_fidelity > self.max_fidelity else self.max_fidelity

        observation = [self.fidelity,
                       expect(sigmax(), self._state),
                       expect(sigmay(), self._state),
                       expect(sigmaz(), self._state),
                       reward,
                       action[0],
                       action[1]
                       ]

        if (self.fidelity > 0.9995 or self.time_stamp >= self.max_stamp):
            self._episode_ended = True
            self.max_fidelity = 0
            self.fidelity = 0
            #self.gamma = [random.uniform(0, 1), random.uniform(0, 1)]
            return ts.termination(np.array(observation, dtype=np.float32), reward=reward)

        else:
            return ts.transition(
                np.array(observation, dtype=np.float32), reward=reward, discount=0.9)
Example #12
File: dyna_ppo.py Project: samsinai/FLEXS
    def _step(self, action):
        """Progress the agent one step in the environment.

        The agent keeps mutating the sequence until the reward decreases. The number
        of sequences evaluated per episode is capped at `self.max_num_steps`.
        """
        # if we've exceeded the maximum number of steps, terminate
        if self.num_steps >= self.max_num_steps:
            return ts.termination(self._state, 0)

        # `action` is an integer representing which residue to mutate to 1
        # along the flattened one-hot representation of the sequence
        pos = action // len(self.alphabet)
        res = action % len(self.alphabet)
        self.num_steps += 1

        # if we are trying to modify the sequence with a no-op, then stop
        if self._state["sequence"][pos, res] == 1:
            return ts.termination(self._state, 0)

        self._state["sequence"][pos] = 0
        self._state["sequence"][pos, res] = 1
        state_string = s_utils.one_hot_to_string(self._state["sequence"],
                                                 self.alphabet)

        if self.fitness_model_is_gt:
            self._state["fitness"] = self.landscape.get_fitness(
                [state_string]).astype(np.float32)
        else:
            self._state["fitness"] = self.model.get_fitness(
                [state_string]).astype(np.float32)
        self.all_seqs[state_string] = self._state["fitness"].item()

        reward = self._state["fitness"].item(
        ) - self.lam * self.sequence_density(state_string)

        # if we have seen the sequence this episode,
        # terminate episode and punish
        # (to prevent going in loops)
        if state_string in self.episode_seqs:
            return ts.termination(self._state, -1)
        self.episode_seqs.add(state_string)

        # if the reward is not increasing, then terminate
        if reward < self.previous_fitness:
            return ts.termination(self._state, reward=reward)

        self.previous_fitness = reward
        return ts.transition(self._state, reward=reward)
Example #13
    def _step(self, action):

        if self._episode_ended:
            # The last action ended the episode. Ignore the current action and start
            # a new episode.
            return self.reset()

        row = self.find_row(action)
        # Make sure it is a valid move
        if row is None:
            self._episode_ended = True
            return ts.termination(np.array(self._state1, dtype=np.int32),
                                  reward=-10)

        # Update the boards
        win, full = self.update(self.IDENTIFIER, self.OPPOSITE_IDENTIFIER,
                                action, row)
        if win:
            return ts.termination(np.array(self._state1, dtype=np.int32),
                                  reward=1)
        elif full:
            return ts.termination(np.array(self._state1, dtype=np.int32),
                                  reward=0.5)

        # Make a second action
        agent2_time_step = self._tf_time_step()

        if self.agent2_policy is None:
            raise Exception('Error! agent2 does not exist')

        action = self.agent2_policy.action(agent2_time_step).action

        # Update the boards
        win, full = self.update(self.OPPOSITE_IDENTIFIER, self.IDENTIFIER,
                                action)
        if win:
            return ts.termination(np.array(self._state1, dtype=np.int32),
                                  reward=-1)
        elif full:
            return ts.termination(np.array(self._state1, dtype=np.int32),
                                  reward=0.5)

        time_step = ts.transition(np.array(self._state1, dtype=np.int32),
                                  reward=0.01,
                                  discount=1.0)
        # print("temp:", temp_time_step)
        # print("tf_temp:", tf_temp_time_step)
        # print("real:", time_step)
        return time_step
Example #14
    def _step(self, action):
        if self._episode_ended:
            return self.reset()

        if action == 1:
            self._n_queries += 1
            self.reward = -0.1
        elif action == 0:
            # do not query
            pass
        else:
            raise ValueError('action should be 0 or 1.')

        image = self.dm.get_x_unl(self._counter)  # !TODO change to iterable object
        # prepare next observation (aka next state)
        final_output, intermediate_output = self.aux_model.predict(
            np.array([image]))
        self._state = np.append(intermediate_output, final_output)
        print('STATE:', self._state.shape)
        self._counter += 1
        # end of episode
        if self._n_queries >= MAX_QUERIES or self._counter >= len(self.dm.x_unl):
            self.reward = 1.0  # !TODO
            # use float32 so the terminal observation matches the dtype used below
            return ts.termination(np.array(self._state, dtype=np.float32),
                                  self.reward)
        else:
            self.reward = 0.5
            return ts.transition(np.array(self._state, dtype=np.float32),
                                 reward=self.reward,
                                 discount=1.0)
Example #15
    def _step(self, action_input):
        if self._reset_next_step:
            return self.reset()

        
        action = action_input.copy()
        
        self._curr_step += 1
        new_state = deepcopy(self._state)
        noise = np.dot(self._ValFct.alphaAtZ(self._wealth), np.random.normal(scale=1./252,size=self._dRiskyAsset))/(self._wealth/self._scale)
        new_state += noise
        new_state += action
        
        if  not array_spec.check_arrays_nest(new_state, self._observation_spec):
            reward_step = -2
        else:
            self._state = new_state
            reward_step = self._reward_fn(action*self._wealth/self._scale)
            
            self._avg_reward += self._learningRate_AR*(reward_step-self._avg_reward)
            reward_step -= self._avg_reward
            reward_step = np.clip(self._reward_scale*reward_step, -1., 1.)
        
        if self._curr_step >= self._Nsteps:
            self._reset_next_step = True
            print('EPISODE AVERAGE REWARD: ', self._avg_reward)
            return ts.termination(observation=self._state, reward=reward_step)

        return ts.transition(observation=self._state, reward=reward_step, discount=1.)
Example #16
    def _step(self, action):
        '''Step function; invoked via env.step().'''
        # TODO: complete implementation of flood logic

        if self._episode_ended:
            # The last action ended the episode. Ignore the current action and start
            # a new episode.
            return self.reset()

        # Apply the chosen color; the flood update below is still a random placeholder.
        self._state.current_color = action
        self._state.owned_blocks += np.random.randint(
            1, self._state.width * self._state.height -
            self._state.owned_blocks + 1)
        self._state.num_turns += 1

        if self._episode_ended or self._state.owned_blocks >= self._state.height * self._state.width:
            reward = -1 * self._state.num_turns
            return ts.termination(
                np.array(self._state.state_array, dtype=np.int32), reward)
        else:
            return ts.transition(np.array(self._state.state_array,
                                          dtype=np.int32),
                                 reward=0.0,
                                 discount=1)
Example #17
File: dyna_ppo.py Project: samsinai/FLEXS
    def _step(self, actions):
        """Progress the agent one step in the environment."""
        actions = actions.flatten()
        self.states[:, self.partial_seq_len, -1] = 0
        self.states[np.arange(self._batch_size), self.partial_seq_len,
                    actions] = 1
        self.partial_seq_len += 1

        # We have not generated the last residue in the sequence, so continue
        if self.partial_seq_len < self.seq_length - 1:
            return nest_utils.stack_nested_arrays(
                [ts.transition(seq_state, 0) for seq_state in self.states])

        # If sequence is of full length, score the sequence and end the episode
        # We need to take off the column in the matrix (-1) representing the mask token
        complete_sequences = [
            s_utils.one_hot_to_string(seq_state[:, :-1], self.alphabet)
            for seq_state in self.states
        ]
        if self.fitness_model_is_gt:
            fitnesses = self.landscape.get_fitness(complete_sequences)
        else:
            fitnesses = self.model.get_fitness(complete_sequences)
        self.all_seqs.update(zip(complete_sequences, fitnesses))

        # Reward = fitness - lambda * sequence density
        rewards = np.array([
            f - self.lam * self.sequence_density(seq)
            for seq, f in zip(complete_sequences, fitnesses)
        ])
        return nest_utils.stack_nested_arrays([
            ts.termination(seq_state, r)
            for seq_state, r in zip(self.states, rewards)
        ])
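This batched variant builds one TimeStep per sequence and stacks them into a single batched TimeStep. A minimal sketch of what nest_utils.stack_nested_arrays does with two unbatched terminal steps (illustrative shapes only):

import numpy as np
from tf_agents.trajectories import time_step as ts
from tf_agents.utils import nest_utils

steps = [ts.termination(np.zeros(3, dtype=np.float32), reward=r) for r in (0.5, -1.0)]
batched = nest_utils.stack_nested_arrays(steps)

print(batched.step_type)          # [2 2]  (StepType.LAST for both)
print(batched.reward)             # [ 0.5 -1. ]
print(batched.observation.shape)  # (2, 3)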
Example #18
    def _step(self, action):
        if self._episode_ended:
            return self._reset()

        reward = 0

        if not (action == 1 or action == 0):
            raise ValueError('`action` should be 0 or 1.')

        self._state += 1
        self.observation += 1

        if action == 0:
            reward = 4 / self._time_between_squiggles_beats
        elif action == 1:
            if (self._state - 1) % self._time_between_squiggles_beats == 0:
                reward = 10
                if self.observation[0] == 1:
                    reward = 5
            else:
                reward = -4 / self._time_between_squiggles_beats

        if action == 1:
            # Clicking agent action to discrete squiggle hearing
            current_closeness_to_real_beat = self._state % self._time_between_squiggles_beats
            closest_beat = self._state - current_closeness_to_real_beat
            current_closeness_to_real_beat -= self._time_between_squiggles_beats

            current_i = int(
                (closest_beat / self._time_between_squiggles_beats) % 16)
            self._squiggles_input[current_i] = 1

        # Is it time for squiggles to get update?
        if self._state % (self._time_between_squiggles_beats * 16) == 0:
            self._squiggles_list[0].update_h(self._squiggles_input)
            for i in range(1, len(self._squiggles_list)):
                self._squiggles_list[i - 1].update_o()
                out_in = self._squiggles_list[i - 1].o
                self._squiggles_list[i].update_h(out_in)
            self._squiggles_list[-1].update_o()
            #print(self._squiggles_input)
            self._squiggles_input = [0 for i in range(16)]

        # Is it time for squig to play, and did it?
        play = (self._state % self._time_between_squiggles_beats == 0 and
                self._squiggles_list[-1].o[
                    int((self._state / self._time_between_squiggles_beats) % 16)] == 1)

        if play:
            self.observation[1:] = self.observation[:-1]
            self.observation[0] = 0

        if self._state >= self._states_until_termination:
            self._episode_ended = True
            return ts.termination(self.observation, reward)

        return ts.transition(self.observation, reward=reward, discount=0.9)
Example #19
  def testAutomaticReset(self):
    observation_spec = tensor_spec.TensorSpec([1], tf.float32)
    action_spec = tensor_spec.TensorSpec([1], tf.float32)
    policy_state_spec = tensor_spec.TensorSpec([1], tf.float32)
    time_step_spec = ts.time_step_spec(observation_spec)

    policy = TfPassThroughPolicy(
        time_step_spec,
        action_spec,
        policy_state_spec=policy_state_spec,
        automatic_state_reset=True)

    observation = tf.constant(1, dtype=tf.float32, shape=(1, 1))
    reward = tf.constant(1, dtype=tf.float32, shape=(1,))
    time_step = tf.nest.map_structure(lambda *t: tf.concat(t, axis=0),
                                      ts.restart(observation, batch_size=1),
                                      ts.transition(observation, reward),
                                      ts.termination(observation, reward))

    state = self.evaluate(
        policy.action(time_step,
                      policy_state=policy.get_initial_state(3) + 1).state)

    self.assertEqual(0, state[0])
    self.assertEqual(1, state[1])
    self.assertEqual(1, state[2])

    state = self.evaluate(
        policy.distribution(
            time_step, policy_state=policy.get_initial_state(3) + 1).state)

    self.assertEqual(0, state[0])
    self.assertEqual(1, state[1])
    self.assertEqual(1, state[2])
Example #20
    def _success(self, action):
        for index, l in enumerate(self.word_to_guess):
            if action == (ord(l) - 97):
                self._state[index] = action
        legal_moves = self._format_legal_moves(self.guessed_letters)
        observations_and_legal_moves = {
            'observations': np.array([self._state], dtype=np.float32),
            'legal_moves': legal_moves
        }

        # a state value of 26 means the letter has not yet been found
        if 26 not in self._state:
            self._episode_ended = True
            logging.debug(f"You Found {self.word_to_guess}")
            print(f"You Found {self.word_to_guess}")
            return ts.termination(
                observations_and_legal_moves,
                self.number_of_life * self.reward_map["game_success_reward"],
            )
        else:
            self.render()
            return ts.transition(
                observations_and_legal_moves,
                reward=self.reward_map["guess_success_reward"],
                discount=1.0,
            )
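Example #20 returns a dict observation that carries an explicit legal-move mask. In TF-Agents this style is typically paired with an observation_and_action_constraint_splitter passed to the agent or Q-policy; a minimal sketch assuming the dict keys used above:

def observation_and_action_constraint_splitter(observation):
    # Split the dict built above into (network input, action mask).
    return observation['observations'], observation['legal_moves']

# Hypothetical usage when constructing an agent:
# agent = dqn_agent.DqnAgent(
#     ..., observation_and_action_constraint_splitter=observation_and_action_constraint_splitter)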
Example #21
    def _step(self, action):
        try:
            prices = self.productsCosts * action[0]
            observation = np.round(
                (self.placeSize * self.productsUsualBuyingRates) *
                (self.productsPriceFlexibility**(
                    (self.productsUsualPrices - prices) /
                    self.productsUsualPrices)))

            marginPerProduct = (prices - self.productsCosts) * observation

            reward = marginPerProduct.sum()
            if self._state < 10:
                reward *= 1 + self._state * 0.1
            # convert to numpy array of float32, otherwise not accepted by specs
            observation = np.array(np.stack(
                (observation, np.zeros((self.size), dtype=np.float32))),
                                   dtype=np.float32)
            if self._state < self.duration:
                self._state += 1
                return ts.transition(observation, reward)
            else:
                return ts.termination(observation, reward)
        except Exception as e:
            logging.error(e)
            raise  # re-raise; raising a plain string is invalid in Python 3
Example #22
    def _get_ts_invalid_move(self, move_result: ResultOfAMove):
        reward = -20
        self._episode_ended = True
        return_ts = ts.termination(self._get_observation(self._player_id),
                                   reward=reward)
        return return_ts
Example #23
  def _step(self, action):
    # Automatically reset the environments on step if they need to be reset.
    if self._handle_auto_reset and self._done:
      return self.reset()

    # Some environments (e.g. FrozenLake) use the action as a key to the
    # transition probability so it has to be hashable. In the case of discrete
    # actions we have a numpy scalar (e.g array(2)) which is not hashable
    # in this case, we simply pull out the scalar value which will be hashable.
    try:
      action = action.item() if self._action_is_discrete else action
    except AttributeError:
      action = action[0]  # Remove ListWrapper for single-agent compatibility

    observation, reward, self._done, self._info = self._gym_env.step(action)

    if self._match_obs_space_dtype:
      observation = self._to_obs_space_dtype(observation)

    reward = np.asarray(reward, dtype=self.reward_spec().dtype)
    outer_dims = nest_utils.get_outer_array_shape(reward, self.reward_spec())

    if self._done:
      return ts_lib.termination(observation, reward, outer_dims=outer_dims)
    else:
      return ts_lib.transition(observation, reward, self._discount,
                               outer_dims=outer_dims)
Example #24
    def _step(self, action):
        # Automatically reset the environments on step if they need to be reset.
        if self._auto_reset and self._done:
            return self.reset_agent()

        action = action.item() if self._action_is_discrete else action

        observation, reward, self._done, self._info = self._gym_env.step(
            action)

        if self._match_obs_space_dtype:
            observation = self._to_obs_space_dtype(observation)

        reward = np.asarray(reward, dtype=self.reward_spec().dtype)
        outer_dims = nest_utils.get_outer_array_shape(reward,
                                                      self.reward_spec())

        if self._done:
            return ts_lib.termination(observation,
                                      reward,
                                      outer_dims=outer_dims)
        else:
            return ts_lib.transition(observation,
                                     reward,
                                     self._discount,
                                     outer_dims=outer_dims)
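Both gym-wrapper variants (Examples #23 and #24) compute outer_dims from the reward spec so that batched and unbatched environments share one code path. A minimal sketch of the idea, assuming a scalar float32 reward spec:

import numpy as np
from tf_agents.specs import array_spec
from tf_agents.trajectories import time_step as ts
from tf_agents.utils import nest_utils

reward_spec = array_spec.ArraySpec((), np.float32)
batched_reward = np.array([1.0, 2.0], dtype=np.float32)  # outer (batch) dim of 2
batched_obs = np.zeros((2, 4), dtype=np.float32)

outer_dims = nest_utils.get_outer_array_shape(batched_reward, reward_spec)
batched_last = ts.termination(batched_obs, batched_reward, outer_dims=outer_dims)

print(outer_dims)              # (2,)
print(batched_last.step_type)  # [2 2] -> StepType.LAST for each batch element
print(batched_last.discount)   # [0. 0.]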
Example #25
    def _step(self, action):
        try:
            if action.shape != self.productsCosts.shape:
                #action shape doesn't match action_spec. It matches observation_spec. Bug or usual for SAC agent?
                #print(action.shape)
                action = np.sum(action, axis=0)
            prices = self.productsCosts * action
            quantities = np.round(
                (self.placeSize * self.productsUsualBuyingRates) *
                (self.productsPriceFlexibility**(
                    (self.productsUsualPrices - prices) /
                    self.productsUsualPrices)))

            marginPerProduct = (prices - self.productsCosts) * quantities

            reward = marginPerProduct.sum()

            observation = np.stack(
                (self.productsCosts, self.productsUsualMarginRates,
                 self.productsUsualBuyingRates, self.productsUsualPrices,
                 self.productsPriceFlexibility, quantities))
            # convert to numpy array of float32, otherwise not accepted by specs
            observation = np.array(observation, dtype=np.float32)

            if self._state < self.duration:
                self._state += 1
                return ts.transition(observation, reward)
            else:
                return ts.termination(observation, reward)
        except Exception:
            # surface the original error instead of masking it with a ZeroDivisionError
            raise
Example #26
    def _step(self, action):
        if self.isEgoViolatedTraffic():
            # The last action ended the episode. Ignore the current action and start a new episode.
            return self._reset()

        self._send_command(action)
        # needed for ros msg to sync.
        self.rate.sleep()
        self.ego.step()

        if action[0] <= 0.5 or self.isEgoViolatedTraffic():
            reward = -1
            self.total_reward = self.total_reward + reward
        else:
            reward = action[0] / self._action_spec.maximum[0] - abs(
                self.last_steering - action[1])
            reward = reward / 1000.0
            self.last_steering = action[1]
            self.total_reward = self.total_reward + reward

        # return transition depending on game state
        # self.render()
        if self.isEgoViolatedTraffic():
            return time_step.termination(self.data_cam_front_rgb, reward)
        else:
            return time_step.transition(self.data_cam_front_rgb, reward)
Example #27
    def _step(self, action):

        if self._episode_ended:
            # The last action ended the episode. Ignore the current action and start
            # a new episode.
            return self.reset()

        #check if the move is valid and reward accordingly
        if 0 <= action <= 8:
            if self.isSpotEmpty(action):
                self._grid[action] = self.mark
                reward = self.calcReward()
                print("agent goes in spot {} for reward {}", action, reward)
            else:  #punish for picking a spot that has been picked
                reward = -10
            self._episode_ended = True
        else:
            raise ValueError('`action` should be 0 - 8.')

        if not self.isGridFull():
            self.takeOppTurn()
        else:
            self._episode_ended = True

        if self._episode_ended:
            return ts.termination(self._grid, reward)
        else:
            return ts.transition(self._grid, reward=2, discount=1.0)
Example #28
  def _step(self, action):

    if self._episode_ended:
      return self._reset()

    if self.sim_time < 4.05 and self.sim_time > 4.03:
      reward = -np.square(self._state[0])*0.001
      self._episode_ended = True
    else:
      self._state, self.sim_time, self.s_road = Server(self.tcp_port).server_step(action)
      if self._state is None:
        return self._reset()
      reward = (self.s_road - self.s_road_old)
      #reward = self._state[0] * np.cos(self._state[3])
      self.s_road_old = self.s_road



    if self.time_counter > self.max_EpSteps:
      self._episode_ended = True

    self.time_counter += 1

    if self._episode_ended:
      return ts.termination(self._state, reward)
    else:
      return ts.transition(
          self._state, reward, discount=self.gamma)
Example #29
    def _step(self, action):
        if self._action_spec:
            nest_utils.assert_same_structure(self._action_spec, action)

        self._num_steps += 1

        observation = self._get_observation()
        if self._num_steps < self._min_duration:
            self._done = False
        elif self._max_duration and self._num_steps >= self._max_duration:
            self._done = True
        else:
            self._done = self._rng.uniform() < self._episode_end_probability

        if self._done:
            reward = self._reward_fn(ts.StepType.LAST, action, observation)
            self._check_reward_shape(reward)
            time_step = ts.termination(observation, reward)
            self._num_steps = 0
        else:
            reward = self._reward_fn(ts.StepType.MID, action, observation)
            self._check_reward_shape(reward)
            time_step = ts.transition(observation, reward, self._discount)

        return time_step
Example #30
    def _step(self, action: types.NestedArray) -> ts.TimeStep:
        if self._current_time_step.is_last():
            return self.reset()

        state_obs, rewards, _ = self._env.step(action)
        done = self._is_done()
        self._cumulative_rewards += rewards

        self._obs_stacker.add_observation(
            state_obs / 255)  # Normalizing obs in range [0, 1]
        stacked_obs = self._obs_stacker.get_observation_stack()

        obs = {
            'state_obs': stacked_obs,
            'utility_representation': self._utility_func.agent_utility_repr,
        }
        if self._cumulative_rewards_flag:
            obs['cumulative_rewards'] = self._cumulative_rewards

        # The scalar reward on which to train is equal to the delta in the utility between the
        # previous time step and the current one.
        current_utility = self._utility_func(self._cumulative_rewards)
        reward = current_utility - self._prev_step_utility
        self._prev_step_utility = current_utility

        if done:
            return ts.termination(obs, reward)
        else:
            return ts.transition(obs, reward, self.gamma)