class MatrixGameEnv(Env):
    def __init__(self,
                 matrices,
                 reward_perturbation=0,
                 rand: th.Generator = th.default_generator):
        super().__init__()

        matrices = th.as_tensor(matrices)
        reward_perturbation = th.as_tensor(reward_perturbation)

        # Check shape of payoff matrices
        n_agents = matrices.dim() - 1
        if matrices.shape[0] != n_agents:
            raise ValueError(
                "Number of matrices does not match dimensions of each matrix")
        # Check shape of reward perturbation
        if reward_perturbation.shape != () and \
                reward_perturbation.shape != (n_agents, ):
            raise ValueError(
                "Reward perturbation must be either a scalar shared by all "
                "agents or specified for each agent")
        # Check values of reward perturbation
        if (reward_perturbation < 0).any():
            raise ValueError(
                "Values of reward perturbation must be non-negative")

        ## State space (a single dummy state)
        self.observation_space = Discrete(1)
        ## Joint action space
        self.action_space = MultiDiscrete(matrices.shape[1:])
        ## Payoff matrices of the matrix game
        self.matrices = matrices
        ## Standard deviation of reward perturbation
        self.reward_perturbation = reward_perturbation
        ## Random number generator
        self.rand = rand

    def reset(self):
        return th.tensor(0)

    def step(self, actions):
        # Check validity of joint actions
        if not self.action_space.contains(np.array(actions)):
            raise ValueError("Joint actions {} is invalid".format(actions))

        # Rewards for each agent
        rewards = self.matrices[(slice(None), *actions)].clone()
        # Add random perturbation to rewards if any standard deviation is
        # non-zero (agents with zero standard deviation stay unperturbed)
        reward_perturbation = self.reward_perturbation
        if (reward_perturbation != 0).any():
            rewards += th.normal(0., reward_perturbation, generator=self.rand)

        # Step result; a matrix game ends after a single step
        return th.tensor(0), rewards, True, {}
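# Usage sketch (illustrative, not part of the original module): a two-agent
# prisoner's dilemma built from two 2x2 payoff matrices, played for one step.
# Assumes the module-level imports used above (`th` for torch, `np` for
# numpy); the payoff values are made up for the example.
def _demo_matrix_game():
    # matrices[i] is agent i's payoff matrix, indexed by the joint action
    matrices = [
        [[-1., -3.],
         [0., -2.]],  # agent 0
        [[-1., 0.],
         [-3., -2.]],  # agent 1
    ]
    env = MatrixGameEnv(matrices, reward_perturbation=0.1)
    env.reset()
    # Agent 0 cooperates (0), agent 1 defects (1)
    obs, rewards, done, info = env.step((0, 1))
    print(rewards, done)  # roughly tensor([-3., 0.]) plus noise, True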
class AlgorithmicEnv(Env):
    metadata = {'render.modes': ['human', 'ansi']}
    # Only 'promote' the length of generated input strings if the worst of the
    # last n episodes was no more than this far from the maximum reward
    MIN_REWARD_SHORTFALL_FOR_PROMOTION = -1.0

    def __init__(self, base=10, chars=False, starting_min_length=2):
        """
        base: Number of distinct characters.
        chars: If True, use uppercase alphabet. Otherwise, digits. Only
               affects rendering.
        starting_min_length: Minimum input string length. Ramps up as episodes
                             are consistently solved.
        """
        self.base = base
        # Keep track of this many past episodes
        self.last = 10
        # Cumulative reward earned this episode
        self.episode_total_reward = None
        # Running tally of reward shortfalls. e.g. if there were 10 points to
        # earn and we got 8, we'd append -2
        AlgorithmicEnv.reward_shortfalls = []
        if chars:
            self.charmap = [chr(ord('A') + i) for i in range(base)]
        else:
            self.charmap = [str(i) for i in range(base)]
        self.charmap.append(' ')
        # TODO: Not clear why this is a class variable rather than instance.
        # Could lead to some spooky action at a distance if someone is working
        # with multiple algorithmic envs at once. Also makes testing tricky.
        AlgorithmicEnv.min_length = starting_min_length
        # Three sub-actions:
        #   1. Move read head left or right (or up/down)
        #   2. Write or not
        #   3. Which character to write. (Ignored if should_write=0)
        self.action_space = MultiDiscrete([len(self.MOVEMENTS), 2, self.base])
        # Can see just what is on the input tape (one of n characters, or
        # nothing)
        self.observation_space = Discrete(self.base + 1)
        self.seed()
        self.reset()

    @classmethod
    def _movement_idx(kls, movement_name):
        return kls.MOVEMENTS.index(movement_name)

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def _get_obs(self, pos=None):
        """Return an observation corresponding to the given read head position
        (or the current read head position, if none is given)."""
        raise NotImplementedError

    def _get_str_obs(self, pos=None):
        ret = self._get_obs(pos)
        return self.charmap[ret]

    def _get_str_target(self, pos):
        """Return the ith character of the target string (or " " if the index
        is out of bounds)."""
        if pos < 0 or len(self.target) <= pos:
            return " "
        else:
            return self.charmap[self.target[pos]]

    def render_observation(self):
        """Return a string representation of the input tape/grid."""
        raise NotImplementedError

    def render(self, mode='human'):
        outfile = StringIO() if mode == 'ansi' else sys.stdout
        inp = "Total length of input instance: %d, step: %d\n" % (
            self.input_width, self.time)
        outfile.write(inp)
        x, y, action = (self.read_head_position, self.write_head_position,
                        self.last_action)
        if action is not None:
            inp_act, out_act, pred = action
        outfile.write("=" * (len(inp) - 1) + "\n")
        y_str = "Output Tape         : "
        target_str = "Targets             : "
        if action is not None:
            pred_str = self.charmap[pred]
        x_str = self.render_observation()
        for i in range(-2, len(self.target) + 2):
            target_str += self._get_str_target(i)
            if i < y - 1:
                y_str += self._get_str_target(i)
            elif i == (y - 1):
                if action is not None and out_act == 1:
                    color = 'green' if pred == self.target[i] else 'red'
                    y_str += colorize(pred_str, color, highlight=True)
                else:
                    y_str += self._get_str_target(i)
        outfile.write(x_str)
        outfile.write(y_str + "\n")
        outfile.write(target_str + "\n\n")

        if action is not None:
            outfile.write("Current reward      :   %.3f\n" % self.last_reward)
            outfile.write("Cumulative reward   :   %.3f\n" %
                          self.episode_total_reward)
            move = self.MOVEMENTS[inp_act]
            outfile.write("Action              :   Tuple(move over input: %s,\n"
                          % move)
            out_act = out_act == 1
            outfile.write("                              write to the output tape: %s,\n"
                          % out_act)
            outfile.write("                              prediction: %s)\n"
                          % pred_str)
        else:
            outfile.write("\n" * 5)

        if mode != 'human':
            with closing(outfile):
                return outfile.getvalue()

    @property
    def input_width(self):
        return len(self.input_data)

    def step(self, action):
        assert self.action_space.contains(action)
        self.last_action = action
        inp_act, out_act, pred = action
        done = False
        reward = 0.0
        self.time += 1
        assert 0 <= self.write_head_position
        if out_act == 1:
            try:
                correct = pred == self.target[self.write_head_position]
            except IndexError:
                logger.warn(
                    "It looks like you're calling step() even though this " +
                    "environment has already returned done=True. You should " +
                    "always call reset() once you receive done=True. Any " +
                    "further steps are undefined behaviour.")
                correct = False
            if correct:
                reward = 1.0
            else:
                # Bail as soon as a wrong character is written to the tape
                reward = -0.5
                done = True
            self.write_head_position += 1
            if self.write_head_position >= len(self.target):
                done = True
        self._move(inp_act)
        if self.time > self.time_limit:
            reward = -1.0
            done = True
        obs = self._get_obs()
        self.last_reward = reward
        self.episode_total_reward += reward
        return (obs, reward, done, {})

    @property
    def time_limit(self):
        """If an agent takes more than this many timesteps, end the episode
        immediately and return a negative reward."""
        # (Seemingly arbitrary)
        return self.input_width + len(self.target) + 4

    def _check_levelup(self):
        """Called between episodes. Update our running record of episode
        rewards and, if appropriate, 'level up' minimum input length."""
        if self.episode_total_reward is None:
            # This is before the first episode/call to reset(). Nothing to do.
            return
        AlgorithmicEnv.reward_shortfalls.append(self.episode_total_reward -
                                                len(self.target))
        AlgorithmicEnv.reward_shortfalls = \
            AlgorithmicEnv.reward_shortfalls[-self.last:]
        if len(AlgorithmicEnv.reward_shortfalls) == self.last and \
                min(AlgorithmicEnv.reward_shortfalls) >= \
                self.MIN_REWARD_SHORTFALL_FOR_PROMOTION and \
                AlgorithmicEnv.min_length < 30:
            AlgorithmicEnv.min_length += 1
            AlgorithmicEnv.reward_shortfalls = []

    def reset(self):
        self._check_levelup()
        self.last_action = None
        self.last_reward = 0
        self.read_head_position = self.READ_HEAD_START
        self.write_head_position = 0
        self.episode_total_reward = 0.0
        self.time = 0
        length = self.np_random.randint(3) + AlgorithmicEnv.min_length
        self.input_data = self.generate_input_data(length)
        self.target = self.target_from_input_data(self.input_data)
        return self._get_obs()

    def generate_input_data(self, size):
        raise NotImplementedError

    def target_from_input_data(self, input_data):
        raise NotImplementedError("Subclasses must implement")

    def _move(self, movement):
        raise NotImplementedError
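# Minimal concrete subclass sketch (illustrative only): a 1-D "copy" task in
# the spirit of gym's algorithmic suite. The members filled in here
# (MOVEMENTS, READ_HEAD_START, the tape hooks) are the abstract pieces the
# base class expects; this particular tape logic is an assumption, not the
# original gym implementation.
class _DemoCopyEnv(AlgorithmicEnv):
    MOVEMENTS = ['left', 'right']
    READ_HEAD_START = 0

    def generate_input_data(self, size):
        return [self.np_random.randint(self.base) for _ in range(size)]

    def target_from_input_data(self, input_data):
        # For a copy task, the target is the input itself
        return list(input_data)

    def _move(self, movement):
        self.read_head_position += \
            1 if self.MOVEMENTS[movement] == 'right' else -1

    def _get_obs(self, pos=None):
        pos = self.read_head_position if pos is None else pos
        if 0 <= pos < len(self.input_data):
            return self.input_data[pos]
        return self.base  # blank symbol (maps to ' ' in charmap)

    def render_observation(self):
        tape = ''.join(self._get_str_obs(i)
                       for i in range(-2, self.input_width + 2))
        return "Observation Tape    : " + tape + "\n"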
class FarmEnv(gym.Env):
    """
    Description:
        A wind farm is controlled by the yaw angles of its wind turbines.

    Observation:
        Type: Box(2n + 4), where n is the number of wind turbines
        Num      Observation                        Min    Max
        0..n-1   current yaw angle (normalized)     -1     1
        n..2n-1  turbulence intensity (rescaled)     0     1
        2n       power ratio                        -1     1
        2n+1     sine of wind angle                 -1     1
        2n+2     cosine of wind angle               -1     1
        2n+3     normalized wind speed               0     1

    Actions:
        Type: Box(n) if continuous_action_space, else MultiDiscrete(n)
        Num      Action                             Min        Max
        0..n-1   yaw angle (°)                      -max_yaw   max_yaw

    Reward:
        Reward is the farm power increase relative to the nominal (zero-yaw)
        power for every step taken, including the termination step.

    Starting State:
        All yaw angles are zero; the wind angle and wind speed are drawn at
        random from the configured ranges (unless passed to reset()).

    Episode Termination:
        Episode length is greater than 30 steps.
    """
    def __init__(self, config):
        self.farm = config['farm']
        self.numwt = config['num_wind_turbines']

        # initialize yaw boundaries
        self.allowed_yaw = config["max_yaw"]
        self.min_wind_speed = config["min_wind_speed"]
        self.max_wind_speed = config["max_wind_speed"]
        self.min_wind_angle = config["min_wind_angle"]
        self.max_wind_angle = config["max_wind_angle"]
        self.continuous_action_space = config["continuous_action_space"]

        self.best_explored_power = {}
        self.count_steps = 0
        self.initialized_yaw_angle = 0
        self.cur_yaws = np.full((self.numwt, ), 0, dtype=np.int32)
        self.turbine_powers = np.full((self.numwt, ), 0, dtype=np.float64)
        self.turbulent_intensities = np.full((self.numwt, ), 0,
                                             dtype=np.float64)
        self.thrust_coefs = np.full((self.numwt, ), 0, dtype=np.float64)
        self.wt_speed_u = np.full((self.numwt, ), 0, dtype=np.float64)
        self.wt_speed_v = np.full((self.numwt, ), 0, dtype=np.float64)
        self.cur_wind_speed = 8.  # in m/s
        self.cur_wind_angle = 270.  # in degrees
        self.initial_wind_angle = 0  # in degrees
        # max wind angle variation during an episode
        self.max_wind_direction_variation = (10, )
        self.cur_nominal_power = 0
        self.cur_power = 0
        self.cur_power_ratio = 0
        self.cur_nominal_ti_sum = 0

        if self.continuous_action_space:
            # action space is the yaw angle for each wind turbine,
            # to be multiplied by allowed_yaw degrees
            action_low = np.full((self.numwt, ), -1., dtype=np.float32)
            action_high = np.full((self.numwt, ), 1., dtype=np.float32)
            self.action_space = Box(low=action_low,
                                    high=action_high,
                                    shape=(self.numwt, ))
        else:
            # discrete action space: 2 * allowed_yaw + 1 yaw positions per
            # turbine, so both -allowed_yaw and +allowed_yaw are reachable
            self.action_space = MultiDiscrete(
                np.full((self.numwt, ),
                        2 * self.allowed_yaw + 1,
                        dtype=np.int64))
        print(f'action space : {self.action_space}')
        print(f'action space sample : {self.action_space.sample()}')

        # observation space
        observation_high = np.concatenate(
            (
                np.array([1.] * self.numwt),  # yaw max positions for all wind turbines
                # np.array([1] * self.numwt),  # x axis wind speed for all wind turbines
                # np.array([1] * self.numwt),  # y axis wind speed for all wind turbines
                np.array([1.] * self.numwt),  # max turbulence intensity for all wind turbines
                np.array([1.]),  # max power ratio
                # np.array([1] * self.numwt),  # max thrust coef for all wind turbines
                # np.array([1] * self.numwt),  # max power coef for all wind turbines
                np.array([1.]),  # max sine of wind angle
                np.array([1.]),  # max cosine of wind angle
                np.array([1.]),  # max normalized wind speed (range 2 to 25.5 m/s)
            ),
            axis=0)
        observation_low = np.concatenate(
            (
                np.array([-1.] * self.numwt),  # yaw min positions for all wind turbines
                # np.array([-1] * self.numwt),  # x axis wind speed for all wind turbines
                # np.array([-1] * self.numwt),  # y axis wind speed for all wind turbines
                np.array([0.] * self.numwt),  # min turbulence intensity for all wind turbines
                np.array([-1.]),  # min power ratio
                # np.array([0] * self.numwt),  # min thrust coef for all wind turbines
                # np.array([0] * self.numwt),  # min power coef for all wind turbines
                np.array([-1.]),  # min sine of wind angle
                np.array([-1.]),  # min cosine of wind angle
                np.array([0.]),  # min normalized wind speed (range 2 to 25.5 m/s)
            ),
            axis=0)
        print(f'observation low : {observation_low}')
        print(f'observation high : {observation_high}')
        self.observation_space = Box(low=observation_low,
                                     high=observation_high,
                                     shape=(self.numwt * 2 + 4, ),
                                     dtype=np.float64)
        print(f'observation space : {self.observation_space}')

    def reset(self, wd=None, ws=None):
        self.count_steps = 0
        self.cur_yaws = np.full((self.numwt, ), 0, dtype=np.int32)

        # Define wind conditions for this episode;
        # check wind speed is in range (2 to 25.5 m/s)
        assert self.max_wind_speed < 25.5, "max wind speed too high"
        assert self.min_wind_speed > 2., "min wind speed too low"
        if wd is not None:
            self.cur_wind_angle = wd
        else:
            self.cur_wind_angle = np.random.randint(self.min_wind_angle,
                                                    self.max_wind_angle)
        if ws is not None:
            self.cur_wind_speed = ws
        else:
            self.cur_wind_speed = np.random.uniform(self.min_wind_speed,
                                                    self.max_wind_speed)
        self.initial_wind_angle = self.cur_wind_angle

        # Update the flow in the model
        print(f'wind angle {self.cur_wind_angle}')
        print(f'wind speed {self.cur_wind_speed}')
        self.farm.reinitialize_flow_field(
            wind_direction=[self.cur_wind_angle],
            wind_speed=[self.cur_wind_speed])
        self.farm.calculate_wake()
        self.cur_nominal_power = self.farm.get_farm_power()
        self.best_explored_power[self.cur_wind_angle] = self.cur_nominal_power
        self.cur_nominal_ti_sum = np.sum(self.farm.get_turbine_ti())

        state = self.get_observation()
        # print(f'initial state is {state}')
        # print(f'observation space is {self.observation_space}')
        return state

    def get_observation(self):
        """Return the current state of the environment."""
        # rescale turbulence intensities
        self.turbulent_intensities = (np.array(self.farm.get_turbine_ti()) -
                                      0.055) / 0.07
        self.cur_power = self.farm.get_farm_power()
        # self.thrust_coefs = self.farm.get_turbine_ct()
        # turbine_powers = self.farm.get_turbine_power()
        # self.turbine_powers = turbine_powers / np.max(turbine_powers)
        # wind_speed_points_at_wt = pd.DataFrame(
        #     self.farm.get_set_of_points(self.farm_layout[0],
        #                                 self.farm_layout[1],
        #                                 [80.] * self.numwt).head(self.numwt))
        # u_wind_speed_points_at_wt = np.array(wind_speed_points_at_wt.u)
        # v_wind_speed_points_at_wt = np.array(wind_speed_points_at_wt.v)
        # self.wt_speed_u = u_wind_speed_points_at_wt / self.cur_wind_speed
        # self.wt_speed_v = v_wind_speed_points_at_wt / self.cur_wind_speed
        current_yaws = self.cur_yaws / self.allowed_yaw
        self.cur_power_ratio = (self.cur_power - self.cur_nominal_power) / \
            self.cur_nominal_power
        observation = np.concatenate(
            (
                # self.wt_speed_u,
                # self.wt_speed_v,
                current_yaws,
                self.turbulent_intensities,
                # self.thrust_coefs,
                # self.turbine_powers,
                np.array([self.cur_power_ratio]),
                np.array([sind(self.cur_wind_angle)]),
                np.array([cosd(self.cur_wind_angle)]),
                np.array([self.cur_wind_speed / 25.5]),
            ),
            axis=0)
        return observation

    def step(self, action, no_variation=False):
        # check action validity
        err_msg = "%r (%s) invalid" % (action, type(action))
        assert self.action_space.contains(action), err_msg

        # Execute the action
        if self.continuous_action_space:
            self.cur_yaws = action * self.allowed_yaw
        else:
            self.cur_yaws = action - self.allowed_yaw
        print(f'current yaws {self.cur_yaws}')

        if not no_variation:
            # Apply wind variation while the wind angle stays within the
            # allowed band around the initial wind angle
            lower = self.initial_wind_angle - \
                self.max_wind_direction_variation[0]
            upper = self.initial_wind_angle + \
                self.max_wind_direction_variation[0]
            if lower <= self.cur_wind_angle <= upper:
                self.cur_wind_angle = self.cur_wind_angle + \
                    np.random.randint(-1, 2)
                self.farm.reinitialize_flow_field(
                    wind_direction=[self.cur_wind_angle],
                    wind_speed=[self.cur_wind_speed])
                print(f'new wind angle {self.cur_wind_angle}')
                self.farm.calculate_wake()
                self.cur_nominal_power = self.farm.get_farm_power()

        # Get the observations from the simulation
        self.farm.calculate_wake(yaw_angles=self.cur_yaws)
        observation = self.get_observation()

        # check observation validity
        err_msg = "%r (%s) invalid" % (observation, type(observation))
        assert self.observation_space.contains(observation), err_msg

        # reward calculation
        # power_ratio = (self.cur_power
        #     - self.best_explored_power[self.cur_wind_angle]) \
        #     / self.best_explored_power[self.cur_wind_angle]
        reward = self.cur_power_ratio * 100
        print(f'power ratio {self.cur_power_ratio}')
        # if self.cur_power > self.best_explored_power[self.cur_wind_angle]:
        #     self.best_explored_power[self.cur_wind_angle] = self.cur_power

        self.count_steps += 1

        # Done evaluation: the episode ends after 30 steps
        done = self.count_steps >= 30

        return observation, reward, done, {}
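# Configuration sketch (illustrative): FarmEnv expects under config['farm'] a
# FLORIS-style interface exposing reinitialize_flow_field(),
# calculate_wake(), get_farm_power() and get_turbine_ti(), as used above.
# The numeric values below are assumptions for the example, not validated
# settings.
def _demo_farm_env(floris_interface):
    config = {
        'farm': floris_interface,  # e.g. a floris.tools FlorisInterface
        'num_wind_turbines': 3,
        'max_yaw': 25,  # degrees
        'min_wind_speed': 4.,  # m/s; reset() asserts the (2, 25.5) range
        'max_wind_speed': 12.,  # m/s
        'min_wind_angle': 250,  # degrees
        'max_wind_angle': 290,  # degrees
        'continuous_action_space': False,
    }
    env = FarmEnv(config)
    obs = env.reset()
    # A discrete action picks one of 2 * max_yaw + 1 yaw positions per turbine
    obs, reward, done, info = env.step(env.action_space.sample())
    return reward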
class RandomMDPEnv(Env):
    # Reward correlation scale
    _REWARD_CORRELATION_SCALE = 10

    def __init__(self,
                 n_states: int,
                 n_actions: Union[int, Tuple[int]],
                 n_agents: Optional[int] = None,
                 acyclic: bool = False,
                 reward_correlation=None,
                 reward_perturbation=0,
                 rand: th.Generator = th.default_generator):
        super().__init__()

        # All agents have the same number of actions
        if mu.isscalar(n_actions):
            # Number of agents must be given
            if n_agents is None:
                raise ValueError(
                    "Number of agents must be given when number of actions "
                    "is scalar")
            n_actions = (n_actions, ) * n_agents
        else:
            n_actions_size = len(n_actions)
            # Infer number of agents if not given
            if n_agents is None:
                n_agents = n_actions_size
            # Check size of number of actions array
            elif n_actions_size != n_agents:
                raise ValueError(
                    "Expect {} number of actions for each agent, got {}".
                    format(n_agents, n_actions_size))

        # Rewards of different agents have no correlation by default
        if reward_correlation is None:
            reward_correlation = self._REWARD_CORRELATION_SCALE * th.eye(
                n_agents)
        else:
            reward_correlation = th.as_tensor(reward_correlation)
            # Check shape of the correlation matrix
            if reward_correlation.shape != (n_agents, n_agents):
                raise ValueError(
                    "Rewards correlation matrix must be a {}*{} square "
                    "matrix".format(n_agents, n_agents))

        # Full shape and allowed dimensions of reward perturbation
        perturbation_full_shape = (n_states, *n_actions, n_states, n_agents)
        perturbation_allowed_dims = [
            0, 1, n_agents + 1, n_agents + 2, n_agents + 3
        ]
        # Check shape and dimensions of the reward perturbation
        reward_perturbation = th.as_tensor(reward_perturbation)
        perturbation_shape = reward_perturbation.shape
        perturbation_dims = len(perturbation_shape)
        if perturbation_dims not in perturbation_allowed_dims:
            raise ValueError(
                "Expect reward perturbation tensor of {} dimensions, got {}".
                format(perturbation_allowed_dims, perturbation_dims))
        if perturbation_full_shape[:perturbation_dims] != perturbation_shape:
            raise ValueError(
                "Expect reward perturbation tensor with shape {}, got {}".
                format(perturbation_full_shape[:perturbation_dims],
                       perturbation_shape))
        # Check values of reward perturbation
        if (reward_perturbation < 0).any():
            raise ValueError(
                "Values of reward perturbation must be non-negative")

        ## Number of states
        self.n_states = n_states
        ## Shape of the joint action space
        self.joint_action_shape = tuple(n_actions)
        ## State space
        self.observation_space = Discrete(n_states)
        ## Joint action space
        self.action_space = MultiDiscrete(n_actions)
        ## Reward correlation matrix
        self.reward_correlation = reward_correlation
        ## Reward perturbation
        self.reward_perturbation = reward_perturbation
        ## Acyclic MDP
        self.acyclic = acyclic
        ## Random number generator
        self.rand = rand

        # Make multi-agent MDP environment
        self._make_ma_mdp()
        # Initialize environment
        self.reset()

    @property
    def _done(self):
        # Game is done if MDP is acyclic and last state is reached
        return self.acyclic and self._state == self.n_states - 1

    def _make_ma_mdp(self):
        joint_action_shape = self.joint_action_shape
        n_states = self.n_states
        n_agents = len(joint_action_shape)
        rand = self.rand

        # Generate transition probability tensor
        trans_prob = th.rand(n_states,
                             *joint_action_shape,
                             n_states,
                             generator=rand)
        # Acyclic (episodic) MDP: forbid transitions to the current state
        # and to any earlier state
        if self.acyclic:
            states_idx, next_states_idx = th.tril_indices(n_states, n_states)
            trans_prob[states_idx, ..., next_states_idx] = 0
        # Normalize transition probability matrix
        trans_prob /= trans_prob.sum(dim=-1, keepdim=True)
        trans_prob[th.isnan(trans_prob)] = 0

        # Generate random rewards (the following method ensures enough
        # variance in rewards)
        # 1) Generate a rewards "core" for each state, joint action and agent
        rewards = th.randn(n_states,
                           *joint_action_shape,
                           1,
                           n_agents,
                           generator=rand)
        # 2) Multiply the "core" by random scales to generate different
        #    rewards for each next state
        scales_dist = Exponential(th.tensor(1.))
        with mu.use_rand(rand):
            rewards *= scales_dist.sample(
                (n_states, *joint_action_shape, n_states, n_agents))
        # 3) Correlate rewards across agents
        rewards = rewards @ self.reward_correlation

        ## Transition probability
        self._trans_prob = trans_prob
        ## Rewards for state-joint actions
        self._rewards = rewards

    def reset(self):
        ## Current state
        self._state = state = th.tensor(0)
        # Return current state
        return state

    def step(self, actions):
        reward_perturbation = self.reward_perturbation
        n_agents = len(self.action_space.nvec)
        rand = self.rand
        state = self._state

        # Check validity of joint actions
        if not self.action_space.contains(np.array(actions)):
            raise ValueError("Joint actions {} is invalid".format(actions))
        # Game already done
        if self._done:
            warnings.warn(
                "Attempting to step the environment after game is done")
            # Dummy step result
            return state, th.zeros(n_agents), True, {}

        # Find transition probability distribution for state-joint actions
        trans_prob_sa = self._trans_prob[(state, *actions)]
        # Draw next state from distribution
        next_state_dist = Categorical(probs=trans_prob_sa)
        with mu.use_rand(rand):
            self._state = next_state = next_state_dist.sample()

        # Get reward perturbation; broadcast a scalar or partially specified
        # perturbation to all agents
        perturbation_dims = reward_perturbation.dim()
        perturbation_idx = (state, *actions, next_state)
        if perturbation_dims <= len(perturbation_idx):
            perturbation = th.full(
                (n_agents, ),
                reward_perturbation[
                    perturbation_idx[:perturbation_dims]].item())
        else:
            perturbation = reward_perturbation[perturbation_idx]
        # Sample perturbation of rewards
        perturbation_rewards = th.normal(0., perturbation, generator=rand)

        # Compute total rewards
        rewards = self._rewards[(state, *actions, next_state)].clone()
        rewards += perturbation_rewards
        # Step result
        return next_state, rewards, self._done, {}
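# Usage sketch (illustrative): a two-agent acyclic random MDP with the
# default uncorrelated rewards, rolled out until termination. Assumes `th`
# (torch) is imported at module level as in the rest of this file.
def _demo_random_mdp():
    env = RandomMDPEnv(n_states=5, n_actions=3, n_agents=2, acyclic=True)
    state = env.reset()
    done = False
    while not done:
        # In an acyclic MDP every transition moves to a strictly later
        # state, so this loop terminates in at most n_states - 1 steps
        actions = tuple(env.action_space.sample())
        state, rewards, done, _ = env.step(actions)
        print(state.item(), rewards.tolist())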