def extract_state_and_reward_from_client_dict(
            self,
            client_dict: Dict[Any, Any],
            t: int
    ) -> Tuple[MdpState, Reward]:
        """
        Extract the state and reward from a client dict.

        :param client_dict: Client dictionary.
        :param t: Current time step.
        :return: 2-tuple of the state and reward.
        """

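        # let the qt event loop process any pending gui events before we extract the state.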
        QApplication.processEvents()

        # pull out robocode game events
        event_type_events: Dict[str, List[Dict]] = client_dict['events']

        # calculate the number of turns that have passed since the previous call to this function. this will be
        # greater than 1 whenever the previous action took more than one turn to complete (e.g., moving long distances).
        if self.previous_state is None:
            turns_passed = 0
        else:
            turns_passed = client_dict['state']['time'] - self.previous_state.time

        # sum up bullet power that hit self
        bullet_power_hit_self = sum([
            bullet_event['bullet']['power']
            for bullet_event in event_type_events.get('HitByBulletEvent', [])
        ])

        # sum up bullet power that hit others
        bullet_hit_events = event_type_events.get('BulletHitEvent', [])
        bullet_power_hit_others = sum([
            bullet_event['bullet']['power']
            for bullet_event in bullet_hit_events
        ])

        # sum up bullet power that missed others
        bullet_missed_events = event_type_events.get('BulletMissedEvent', [])
        bullet_power_missed_others = sum([
            bullet_event['bullet']['power']
            for bullet_event in bullet_missed_events
        ])

        # keep track of how much bullet power has missed the opponent since we last recorded a hit
        if self.previous_state is None:
            bullet_power_missed_others_since_previous_hit = 0.0
        elif bullet_power_hit_others > 0.0:
            bullet_power_missed_others_since_previous_hit = 0.0
        else:
            bullet_power_missed_others_since_previous_hit = self.previous_state.bullet_power_missed_others_since_previous_hit + bullet_power_missed_others

        # cumulative bullet power that has hit self, decaying over time.
        bullet_power_hit_self_cumulative = bullet_power_hit_self
        if self.previous_state is not None:
            bullet_power_hit_self_cumulative += self.previous_state.bullet_power_hit_self_cumulative * (self.bullet_power_decay ** turns_passed)

        # cumulative bullet power that has hit others, decaying over time.
        bullet_power_hit_others_cumulative = bullet_power_hit_others
        if self.previous_state is not None:
            bullet_power_hit_others_cumulative += self.previous_state.bullet_power_hit_others_cumulative * (self.bullet_power_decay ** turns_passed)

        # cumulative bullet power that has missed others, decaying over time.
        bullet_power_missed_others_cumulative = bullet_power_missed_others
        if self.previous_state is not None:
            bullet_power_missed_others_cumulative += self.previous_state.bullet_power_missed_others_cumulative * (self.bullet_power_decay ** turns_passed)

        # get most recent prior state that was at a different location than the current state. if there is no previous
        # state, then there is no such state.
        if self.previous_state is None:
            prior_state_different_location = None
        # if the previous state's location differs from the current location, then the previous state is what we want.
        elif self.previous_state.x != client_dict['state']['x'] or self.previous_state.y != client_dict['state']['y']:
            prior_state_different_location = self.previous_state
        # otherwise (the previous and current states have the same location), use the previous state's prior state.
        # this will be the case when we did something other than move on our previous turn.
        else:
            prior_state_different_location = self.previous_state.prior_state_different_location

        # most recent scanned enemy and how many turns ago it was scanned
        most_recent_scanned_robot = event_type_events.get('ScannedRobotEvent', [None])[-1]
        most_recent_scanned_robot_age_turns = None
        if most_recent_scanned_robot is None:
            if self.previous_state is not None and self.previous_state.most_recent_scanned_robot is not None:
                most_recent_scanned_robot = self.previous_state.most_recent_scanned_robot
                most_recent_scanned_robot_age_turns = self.previous_state.most_recent_scanned_robot_age_turns + turns_passed
        else:
            most_recent_scanned_robot_age_turns = 0

        # the round terminates either with death or victory
        dead = len(event_type_events.get('DeathEvent', [])) > 0
        won = len(event_type_events.get('WinEvent', [])) > 0
        terminal = dead or won

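        # assemble the next state from the raw client state and the features derived above.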
        state = RobocodeState(
            **client_dict['state'],
            bullet_power_hit_self=bullet_power_hit_self,
            bullet_power_hit_self_cumulative=bullet_power_hit_self_cumulative,
            bullet_power_hit_others=bullet_power_hit_others,
            bullet_power_hit_others_cumulative=bullet_power_hit_others_cumulative,
            bullet_power_missed_others=bullet_power_missed_others,
            bullet_power_missed_others_cumulative=bullet_power_missed_others_cumulative,
            bullet_power_missed_others_since_previous_hit=bullet_power_missed_others_since_previous_hit,
            events=event_type_events,
            previous_state=self.previous_state,
            prior_state_different_location=prior_state_different_location,
            most_recent_scanned_robot=most_recent_scanned_robot,
            most_recent_scanned_robot_age_turns=most_recent_scanned_robot_age_turns,
            AA=self.robot_actions,
            terminal=terminal
        )

        # calculate reward

        # store bullet firing events so that we can pull out information related to them at a later time step (e.g.,
        # when they hit or miss). add the rlai time step (t) to each event. there is a 1:1 correspondence between rlai
        # time steps and actions, but an action can extend over many robocode turns (e.g., movement). the bullet
        # events have times associated with them that correspond to robocode turns.
        self.bullet_id_fired_event.update({
            bullet_fired_event['bullet']['bulletId']: {
                **bullet_fired_event,
                'step': t
            }
            for bullet_fired_event in event_type_events.get('BulletFiredEvent', [])
        })

        # gun_reward = bullet_power_hit_others - bullet_power_missed_others
        # movement_reward = 1.0 if bullet_power_hit_self == 0.0 else -bullet_power_hit_self
        # total_reward = gun_reward + movement_reward
        # reward = RobocodeReward(
        #     i=None,
        #     r=total_reward,
        #     gun_reward=gun_reward,
        #     movement_reward=movement_reward,
        #     bullet_id_fired_event=self.bullet_id_fired_event,
        #     bullet_hit_events=bullet_hit_events,
        #     bullet_missed_events=bullet_missed_events
        # )

        # energy-change reward. bullet firing will be penalized, since firing expends energy.
        reward = Reward(
            None,
            r=0.0 if self.previous_state is None else state.energy - self.previous_state.energy
        )

        # win/loss reward
        # reward = Reward(
        #     None,
        #     r=1.0 if won else -1.0 if dead else 0.0
        # )

        # hang on to the new state as the previous state, for the next call to extract.
        self.previous_state = state

        return state, reward
Example #2
    def runSimulation(self):
        self.statusBar().showMessage("Simulating...")
        # update the GUI so that the new status bar message is shown before the simulation runs
        QApplication.processEvents()
        self.simulate()
        self.statusBar().showMessage("Ready.")
Example #3
    def advance(
            self,
            state: MdpState,
            t: int,
            a: Action,
            agent: MdpAgent
    ) -> Tuple[MdpState, Reward]:
        """
        Advance the state.

        :param state: State to advance.
        :param t: Time step.
        :param a: Action.
        :param agent: Agent.
        :return: 2-tuple of next state and reward.
        """

        # map discretized actions back to continuous space
        if isinstance(a, DiscretizedAction):
            gym_action = a.continuous_value
        # use continuous action values (which are vectors) directly
        elif isinstance(a, ContinuousMultiDimensionalAction):
            gym_action = a.value
        # use discretized action indices
        else:
            gym_action = a.i

        # fuel-based modification for continuous environments. cap energy expenditure at remaining fuel levels.
        fuel_used = None
        if self.gym_id == Gym.LLC_V2:
            main_throttle, side_throttle = gym_action[:]
            required_main_fuel = Gym.LLC_V2_FUEL_CONSUMPTION_FULL_THROTTLE_MAIN * (0.5 + 0.5 * main_throttle if main_throttle >= 0.0 else 0.0)
            required_side_fuel = Gym.LLC_V2_FUEL_CONSUMPTION_FULL_THROTTLE_SIDE * abs(side_throttle) if abs(side_throttle) >= 0.5 else 0.0
            required_total_fuel = required_main_fuel + required_side_fuel
            fuel_level = state.observation[-1]
            if required_total_fuel > fuel_level:  # pragma no cover
                gym_action[:] *= fuel_level / required_total_fuel
                fuel_used = fuel_level
            else:
                fuel_used = required_total_fuel

        elif self.gym_id == Gym.MCC_V0:
            throttle = gym_action[0]
            required_fuel = Gym.MCC_V0_FUEL_CONSUMPTION_FULL_THROTTLE * abs(throttle)
            fuel_level = state.observation[-1]
            if required_fuel > fuel_level:  # pragma no cover
                gym_action[:] *= fuel_level / required_fuel
                fuel_used = fuel_level
            else:
                fuel_used = required_fuel

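        # step the native gym environment with the (possibly fuel-scaled) action, unpacking the resulting
        # observation, reward, and done flag (the info dict is unused).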
        observation, reward, done, _ = self.gym_native.step(action=gym_action)

        # update fuel remaining if needed
        fuel_remaining = None
        if fuel_used is not None:
            fuel_remaining = max(0.0, state.observation[-1] - fuel_used)
            observation = np.append(observation, fuel_remaining)

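        # override the native reward for the lunar lander: zero reward until the episode terminates, then reward
        # based on the final state and any remaining fuel.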
        if self.gym_id == Gym.LLC_V2:

            reward = 0.0

            if done:

                # the ideal state is zeros across position/movement
                state_reward = -np.abs(observation[0:6]).sum()

                # reward for remaining fuel, but only if the state is good. rewarding for remaining fuel unconditionally
                # can cause the agent to veer out of bounds immediately and thus sacrifice state reward for fuel reward.
                # the terminating state is considered good if the lander is within the goal posts (which are at
                # x = +/-0.2) and the other orientation variables (y position, x and y velocity, angle and angular
                # velocity) are near zero. permit a small amount of lenience in the latter, since it's common for a
                # couple of the variables to be slightly positive even when the lander is sitting stationary on a flat
                # surface.
                fuel_reward = 0.0
                if abs(observation[0]) <= 0.2 and np.abs(observation[1:6]).sum() < 0.01:  # pragma no cover
                    fuel_reward = state.observation[-1]

                reward = state_reward + fuel_reward

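        # override the native reward for the mountain car: advance an intermediate goal toward the final goal and
        # reward progress plus remaining fuel when the current goal is reached.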
        elif self.gym_id == Gym.MCC_V0:

            reward = 0.0

            # calculate fraction to goal state
            curr_distance = observation[0] - Gym.MCC_V0_TROUGH_X_POS
            goal_distance = self.mcc_curr_goal_x_pos - Gym.MCC_V0_TROUGH_X_POS
            fraction_to_goal = curr_distance / goal_distance
            if fraction_to_goal >= 1.0:

                # increment goal up to the final goal
                self.mcc_curr_goal_x_pos = min(Gym.MCC_V0_GOAL_X_POS, self.mcc_curr_goal_x_pos + 0.05)

                # mark state and stats recorder as done. must manually mark stats recorder to allow premature reset.
                done = True
                if hasattr(self.gym_native, 'stats_recorder'):
                    self.gym_native.stats_recorder.done = done

                reward = curr_distance + fuel_remaining

        # call render if rendering manually
        if self.check_render_current_episode(True):
            self.gym_native.render()

        if self.check_render_current_episode(None):

            # sleep if we're restricting steps per second
            if self.steps_per_second is not None:
                sleep(1.0 / self.steps_per_second)

            if self.plot_environment:
                self.state_reward_scatter_plot.update(np.append(observation, reward))

            # swimmer is a non-qt environment, so we need to process qt events manually.
            if self.gym_id == Gym.SWIMMER_V2:
                QApplication.processEvents()

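        # bundle the new observation into the next state.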
        self.state = GymState(
            environment=self,
            observation=observation,
            terminal=done,
            agent=agent
        )

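        # retain the current observation as the previous observation.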
        self.previous_observation = observation

        return self.state, Reward(i=None, r=reward)