def extract_state_and_reward_from_client_dict(
        self,
        client_dict: Dict[Any, Any],
        t: int
) -> Tuple[MdpState, Reward]:
    """
    Extract the state and reward from a client dict.

    :param client_dict: Client dictionary.
    :param t: Current time step.
    :return: 2-tuple of the state and reward.
    """

    QApplication.processEvents()

    # pull out robocode game events
    event_type_events: Dict[str, List[Dict]] = client_dict['events']

    # calculate the number of turns that have passed since the previous call to the current function. will be
    # greater than 1 any time the previous action took more than 1 turn to complete (e.g., moving long distances).
    if self.previous_state is None:
        turns_passed = 0
    else:
        turns_passed = client_dict['state']['time'] - self.previous_state.time

    # sum up bullet power that hit self
    bullet_power_hit_self = sum([
        bullet_event['bullet']['power']
        for bullet_event in event_type_events.get('HitByBulletEvent', [])
    ])

    # sum up bullet power that hit others
    bullet_hit_events = event_type_events.get('BulletHitEvent', [])
    bullet_power_hit_others = sum([
        bullet_event['bullet']['power']
        for bullet_event in bullet_hit_events
    ])

    # sum up bullet power that missed others
    bullet_missed_events = event_type_events.get('BulletMissedEvent', [])
    bullet_power_missed_others = sum([
        bullet_event['bullet']['power']
        for bullet_event in bullet_missed_events
    ])

    # keep track of how much bullet power has missed the opponent since we last recorded a hit
    if self.previous_state is None:
        bullet_power_missed_others_since_previous_hit = 0.0
    elif bullet_power_hit_others > 0.0:
        bullet_power_missed_others_since_previous_hit = 0.0
    else:
        bullet_power_missed_others_since_previous_hit = (
            self.previous_state.bullet_power_missed_others_since_previous_hit + bullet_power_missed_others
        )

    # cumulative bullet power that has hit self, decaying over time.
    bullet_power_hit_self_cumulative = bullet_power_hit_self
    if self.previous_state is not None:
        bullet_power_hit_self_cumulative += (
            self.previous_state.bullet_power_hit_self_cumulative * (self.bullet_power_decay ** turns_passed)
        )

    # cumulative bullet power that has hit others, decaying over time.
    bullet_power_hit_others_cumulative = bullet_power_hit_others
    if self.previous_state is not None:
        bullet_power_hit_others_cumulative += (
            self.previous_state.bullet_power_hit_others_cumulative * (self.bullet_power_decay ** turns_passed)
        )

    # cumulative bullet power that has missed others, decaying over time.
    bullet_power_missed_others_cumulative = bullet_power_missed_others
    if self.previous_state is not None:
        bullet_power_missed_others_cumulative += (
            self.previous_state.bullet_power_missed_others_cumulative * (self.bullet_power_decay ** turns_passed)
        )

    # get the most recent prior state that was at a different location than the current state. if there is no
    # previous state, then there is no such state.
    if self.previous_state is None:
        prior_state_different_location = None

    # if the previous state's location differs from the current location, then the previous state is what we want.
    elif self.previous_state.x != client_dict['state']['x'] or self.previous_state.y != client_dict['state']['y']:
        prior_state_different_location = self.previous_state

    # otherwise (the previous and current states have the same location), use the previous state's prior state.
    # this will be the case when we did something other than move on our previous turn.
    else:
        prior_state_different_location = self.previous_state.prior_state_different_location

    # most recently scanned enemy and how many turns ago it was scanned
    most_recent_scanned_robot = event_type_events.get('ScannedRobotEvent', [None])[-1]
    most_recent_scanned_robot_age_turns = None
    if most_recent_scanned_robot is None:
        if self.previous_state is not None and self.previous_state.most_recent_scanned_robot is not None:
            most_recent_scanned_robot = self.previous_state.most_recent_scanned_robot
            most_recent_scanned_robot_age_turns = (
                self.previous_state.most_recent_scanned_robot_age_turns + turns_passed
            )
    else:
        most_recent_scanned_robot_age_turns = 0

    # the round terminates either with death or victory
    dead = len(event_type_events.get('DeathEvent', [])) > 0
    won = len(event_type_events.get('WinEvent', [])) > 0
    terminal = dead or won

    state = RobocodeState(
        **client_dict['state'],
        bullet_power_hit_self=bullet_power_hit_self,
        bullet_power_hit_self_cumulative=bullet_power_hit_self_cumulative,
        bullet_power_hit_others=bullet_power_hit_others,
        bullet_power_hit_others_cumulative=bullet_power_hit_others_cumulative,
        bullet_power_missed_others=bullet_power_missed_others,
        bullet_power_missed_others_cumulative=bullet_power_missed_others_cumulative,
        bullet_power_missed_others_since_previous_hit=bullet_power_missed_others_since_previous_hit,
        events=event_type_events,
        previous_state=self.previous_state,
        prior_state_different_location=prior_state_different_location,
        most_recent_scanned_robot=most_recent_scanned_robot,
        most_recent_scanned_robot_age_turns=most_recent_scanned_robot_age_turns,
        AA=self.robot_actions,
        terminal=terminal
    )

    # calculate reward

    # store bullet firing events so that we can pull out information related to them at a later time step (e.g.,
    # when they hit or miss). add the rlai time step (t) to each event. there is a 1:1 correspondence between
    # rlai time steps and actions, but an action can extend over many robocode turns (e.g., movement). the bullet
    # events have times associated with them that correspond to robocode turns.
    self.bullet_id_fired_event.update({
        bullet_fired_event['bullet']['bulletId']: {
            **bullet_fired_event,
            'step': t
        }
        for bullet_fired_event in event_type_events.get('BulletFiredEvent', [])
    })

    # gun/movement reward
    # gun_reward = bullet_power_hit_others - bullet_power_missed_others
    # movement_reward = 1.0 if bullet_power_hit_self == 0.0 else -bullet_power_hit_self
    # total_reward = gun_reward + movement_reward
    # reward = RobocodeReward(
    #     i=None,
    #     r=total_reward,
    #     gun_reward=gun_reward,
    #     movement_reward=movement_reward,
    #     bullet_id_fired_event=self.bullet_id_fired_event,
    #     bullet_hit_events=bullet_hit_events,
    #     bullet_missed_events=bullet_missed_events
    # )

    # energy-change reward...bullet firing will be penalized.
    reward = Reward(
        None,
        r=0.0 if self.previous_state is None else state.energy - self.previous_state.energy
    )

    # win/loss reward
    # reward = Reward(
    #     None,
    #     r=1.0 if won else -1.0 if dead else 0.0
    # )

    # hang on to the new state as the previous state, for the next call to this function.
    self.previous_state = state

    return state, reward
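# The three cumulative bullet-power values above all follow the same update:  the new observation is added to the
# previous cumulative value after discounting the latter by bullet_power_decay raised to the number of turns that
# have passed. A minimal standalone sketch of this update (decayed_cumulative is a hypothetical helper, not part
# of the environment):

def decayed_cumulative(
        new_value: float,
        previous_cumulative: float,
        decay: float,
        turns_passed: int
) -> float:
    """
    Accumulate a value with per-turn exponential decay.

    :param new_value: Value observed on the current call.
    :param previous_cumulative: Cumulative value from the previous call.
    :param decay: Per-turn decay factor in [0.0, 1.0].
    :param turns_passed: Number of turns elapsed since the previous call.
    :return: New cumulative value.
    """

    return new_value + previous_cumulative * (decay ** turns_passed)

# e.g., with decay=0.9, a previous cumulative hit power of 4.0 contributes 4.0 * 0.9**2 = 3.24 after an action
# that took 2 turns, so decayed_cumulative(1.0, 4.0, 0.9, 2) == 4.24.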
def runSimulation(self):
    """
    Run the simulation, updating the status bar before and after.
    """

    self.statusBar().showMessage("Simulating...")

    # update GUI to show the changed status bar message
    QApplication.processEvents()

    self.simulate()

    self.statusBar().showMessage("Ready.")
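# The processEvents() call above is what makes the message visible:  showMessage() only schedules a repaint, and
# the repaint cannot run until control returns to the Qt event loop, so we flush the queue manually before the
# long-running simulate() call. A minimal standalone sketch of the same pattern (run_chunked_work and the chunks
# iterable are hypothetical, not part of this module; QApplication is assumed to be the Qt binding already
# imported here):

def run_chunked_work(chunks):
    """
    Run units of long-running work on the GUI thread, flushing Qt's event queue between units so that repaints
    and input events are not starved.

    :param chunks: Iterable of zero-argument callables, each performing one unit of work.
    """

    for do_chunk in chunks:
        do_chunk()
        QApplication.processEvents()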
def advance(
        self,
        state: MdpState,
        t: int,
        a: Action,
        agent: MdpAgent
) -> Tuple[MdpState, Reward]:
    """
    Advance the state.

    :param state: State to advance.
    :param t: Time step.
    :param a: Action.
    :param agent: Agent.
    :return: 2-tuple of next state and reward.
    """

    # map discretized actions back to continuous space
    if isinstance(a, DiscretizedAction):
        gym_action = a.continuous_value

    # use continuous action values (which are vectors) directly
    elif isinstance(a, ContinuousMultiDimensionalAction):
        gym_action = a.value

    # use discretized action indices
    else:
        gym_action = a.i

    # fuel-based modification for continuous environments. cap energy expenditure at the remaining fuel level.
    fuel_used = None
    if self.gym_id == Gym.LLC_V2:

        main_throttle, side_throttle = gym_action[:]

        required_main_fuel = Gym.LLC_V2_FUEL_CONSUMPTION_FULL_THROTTLE_MAIN * (
            0.5 + 0.5 * main_throttle if main_throttle >= 0.0 else 0.0
        )

        required_side_fuel = (
            Gym.LLC_V2_FUEL_CONSUMPTION_FULL_THROTTLE_SIDE * abs(side_throttle)
            if abs(side_throttle) >= 0.5
            else 0.0
        )

        required_total_fuel = required_main_fuel + required_side_fuel
        fuel_level = state.observation[-1]
        if required_total_fuel > fuel_level:  # pragma no cover
            gym_action[:] *= fuel_level / required_total_fuel
            fuel_used = fuel_level
        else:
            fuel_used = required_total_fuel

    elif self.gym_id == Gym.MCC_V0:

        throttle = gym_action[0]
        required_fuel = Gym.MCC_V0_FUEL_CONSUMPTION_FULL_THROTTLE * abs(throttle)
        fuel_level = state.observation[-1]
        if required_fuel > fuel_level:  # pragma no cover
            gym_action[:] *= fuel_level / required_fuel
            fuel_used = fuel_level
        else:
            fuel_used = required_fuel

    observation, reward, done, _ = self.gym_native.step(action=gym_action)

    # update fuel remaining if needed
    fuel_remaining = None
    if fuel_used is not None:
        fuel_remaining = max(0.0, state.observation[-1] - fuel_used)
        observation = np.append(observation, fuel_remaining)

    if self.gym_id == Gym.LLC_V2:

        reward = 0.0

        if done:

            # the ideal state is zeros across position/movement
            state_reward = -np.abs(observation[0:6]).sum()

            # reward for remaining fuel, but only if the state is good. rewarding for remaining fuel
            # unconditionally can cause the agent to veer out of bounds immediately and thus sacrifice state
            # reward for fuel reward. the terminating state is considered good if the lander is within the goal
            # posts (which are at x = +/-0.2) and the other orientation variables (y position, x and y velocity,
            # angle, and angular velocity) are near zero. permit a small amount of lenience in the latter, since
            # it's common for a couple of the variables to be slightly positive even when the lander is sitting
            # stationary on a flat surface.
            fuel_reward = 0.0
            if abs(observation[0]) <= 0.2 and np.abs(observation[1:6]).sum() < 0.01:  # pragma no cover
                fuel_reward = state.observation[-1]

            reward = state_reward + fuel_reward

    elif self.gym_id == Gym.MCC_V0:

        reward = 0.0

        # calculate fraction of the way to the goal state
        curr_distance = observation[0] - Gym.MCC_V0_TROUGH_X_POS
        goal_distance = self.mcc_curr_goal_x_pos - Gym.MCC_V0_TROUGH_X_POS
        fraction_to_goal = curr_distance / goal_distance
        if fraction_to_goal >= 1.0:

            # increment the goal, up to the final goal
            self.mcc_curr_goal_x_pos = min(Gym.MCC_V0_GOAL_X_POS, self.mcc_curr_goal_x_pos + 0.05)

            # mark state and stats recorder as done. must manually mark the stats recorder to allow premature
            # reset.
            done = True
            if hasattr(self.gym_native, 'stats_recorder'):
                self.gym_native.stats_recorder.done = done

            reward = curr_distance + fuel_remaining

    # call render if rendering manually
    if self.check_render_current_episode(True):
        self.gym_native.render()

    if self.check_render_current_episode(None):

        # sleep if we're restricting steps per second
        if self.steps_per_second is not None:
            sleep(1.0 / self.steps_per_second)

        if self.plot_environment:
            self.state_reward_scatter_plot.update(np.append(observation, reward))

        # swimmer is a non-Qt environment, so we need to process Qt events manually.
        if self.gym_id == Gym.SWIMMER_V2:
            QApplication.processEvents()

    self.state = GymState(
        environment=self,
        observation=observation,
        terminal=done,
        agent=agent
    )

    self.previous_observation = observation

    return self.state, Reward(i=None, r=reward)
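# The fuel logic above applies one rule in both environments:  compute the fuel the action would require and, if
# that exceeds what remains, scale the action vector down so it consumes exactly the remaining fuel. A minimal
# standalone sketch of the capping step (cap_action_to_fuel is a hypothetical helper, not part of the
# environment; it uses this module's existing numpy and typing imports):

def cap_action_to_fuel(
        gym_action: np.ndarray,
        required_fuel: float,
        fuel_level: float
) -> Tuple[np.ndarray, float]:
    """
    Cap an action's fuel consumption at the remaining fuel level.

    :param gym_action: Continuous action vector.
    :param required_fuel: Fuel the uncapped action would consume.
    :param fuel_level: Remaining fuel.
    :return: 2-tuple of the (possibly scaled) action and the fuel actually used.
    """

    if required_fuel > fuel_level:
        return gym_action * (fuel_level / required_fuel), fuel_level

    return gym_action, required_fuel

# e.g., an action requiring 0.4 units of fuel with only 0.1 remaining is scaled by 0.25 and uses the final 0.1
# units:  cap_action_to_fuel(np.array([1.0, -0.5]), 0.4, 0.1) returns (np.array([0.25, -0.125]), 0.1).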