from gym.spaces import Discrete, Tuple
from ray.rllib.agents.a3c import A3CTrainer

N_TOPICS = 15
TOPICS = ['T{}'.format(i) for i in range(N_TOPICS)]
CONTEXT_ATTRIBUTES = {
    'hour': ['0-7', '8-9', '10-12', '13-14', '15-18', '19-21', '22-23'],
    'week period': ['Weekday', 'Weekend'],
    'weather': ['Sunny', 'Cloudy', 'Raining'],
    'device': ['mobile ios', 'mac ios', 'mobile android', 'windows']
}
OBSERVATION_0 = len(CONTEXT_ATTRIBUTES) * [0] + N_TOPICS * [0]
# One Discrete entry per context attribute, then one binary flag per topic.
OBSERVATION_SPACE = Tuple(
    (Discrete(7), Discrete(2), Discrete(3), Discrete(4)) +
    tuple(Discrete(2) for _ in range(N_TOPICS)))
# One binary show/don't-show decision per topic.
ACTION_SPACE = Tuple(tuple(Discrete(2) for _ in range(N_TOPICS)))

# Probability of a user click based on the distance between article topics
PROBAB = N_TOPICS * [0]
PROBAB[1:8] = [0.2, 0.5, 0.7, 0.4, 0.3, 0.2, 0.1]


def distance(article1, article2):
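    # Hedged completion (an assumption, not from the source -- the snippet
    # breaks off here): a plausible body returns the click probability for
    # the circular distance between two topic indices, via PROBAB above.
    d = abs(TOPICS.index(article1) - TOPICS.index(article2))
    return PROBAB[min(d, N_TOPICS - d)]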
def change_space_fn(space):
    return Discrete(space.n + 1)
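
# A hedged usage sketch (assumption): this helper matches the
# `change_space_fn` argument of supersuit's `action_lambda_v0`, widening
# each Discrete action space by one extra action. `base_env` and the
# clamping `change_action_fn` lambda below are hypothetical.
from supersuit import action_lambda_v0

env = action_lambda_v0(
    base_env,
    lambda action, space: min(action, space.n - 1),  # map the extra action onto the last valid one
    change_space_fn)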
Dict({ "obs": MultiDiscrete([2, 2, 2, 3]), ENV_STATE: MultiDiscrete([2, 2, 2]) }), ]) act_space = Tuple([ TwoStepGame.action_space, TwoStepGame.action_space, ]) register_env( "grouped_twostep", lambda config: TwoStepGame(config). with_agent_groups(grouping, obs_space=obs_space, act_space=act_space)) if args.run == "contrib/MADDPG": obs_space_dict = { "agent_1": Discrete(6), "agent_2": Discrete(6), } act_space_dict = { "agent_1": TwoStepGame.action_space, "agent_2": TwoStepGame.action_space, } config = { "learning_starts": 100, "env_config": { "actions_are_logits": True, }, "multiagent": { "policies": { "pol1": (None, Discrete(6), TwoStepGame.action_space, { "agent_id": 0,
def __init__(self, rl_agent, fixed_agents, setting, max_steps=30):
    self.rl_agent = rl_agent
    self.action_space = Discrete(rl_agent.discretization)
    super().__init__([rl_agent], fixed_agents, setting, max_steps)
def __init__(self, num_speedup_steps=30, require_explicit_reset=True,
             is_render_enabled=False, early_termination_enabled=False,
             run_offscreen=False, save_screens=False, port=2000, gpu=0,
             discrete_control=True, kill_when_connection_lost=True,
             city_name="Town01", channel_last=True, action_num=2):
    EnvironmentWrapper.__init__(self, is_render_enabled, save_screens)

    print("port:", port)
    self.episode_max_time = 1000000
    self.allow_braking = True
    self.log_path = os.path.join(DEFAULT_CARLA_LOG_DIR, "CarlaLogs.txt")
    self.num_speedup_steps = num_speedup_steps
    self.is_game_ready_for_input = False
    self.run_offscreen = run_offscreen
    self.kill_when_connection_lost = kill_when_connection_lost

    # server configuration
    self.port = port
    self.gpu = gpu
    self.host = 'localhost'
    self.level = 'town1'
    self.map = CarlaLevel().get(self.level)

    # experiment = basic_experiment_suite.BasicExperimentSuite(city_name)
    experiment = CoRL2017(city_name)
    self.experiments = experiment.get_experiments()
    self.experiment_type = 0
    self.planner = Planner(city_name)
    self.car_speed = 0
    # Will be True only when setup_client_and_server() is called, either
    # explicitly, or by reset().
    self.is_game_setup = False

    # action space
    self.discrete_controls = discrete_control
    self.action_space_size = action_num
    self.action_space_high = np.array([1] * action_num)
    self.action_space_low = np.array([-1] * action_num)
    self.action_space_abs_range = np.maximum(
        np.abs(self.action_space_low), np.abs(self.action_space_high))
    self.steering_strength = 0.35
    self.gas_strength = 1.0
    self.brake_strength = 0.6
    self.actions = {
        0: [0., 0.],
        1: [0., -self.steering_strength],
        2: [0., self.steering_strength],
        3: [self.gas_strength - 0.15, 0.],
        4: [-self.brake_strength, 0],
        5: [self.gas_strength - 0.3, -self.steering_strength],
        6: [self.gas_strength - 0.3, self.steering_strength],
        7: [-self.brake_strength, -self.steering_strength],
        8: [-self.brake_strength, self.steering_strength]
    }
    self.actions_description = [
        'NO-OP', 'TURN_LEFT', 'TURN_RIGHT', 'GAS', 'BRAKE',
        'GAS_AND_TURN_LEFT', 'GAS_AND_TURN_RIGHT',
        'BRAKE_AND_TURN_LEFT', 'BRAKE_AND_TURN_RIGHT'
    ]
    if discrete_control:
        self.action_space = Discrete(len(self.actions))
    else:
        self.action_space = Box(low=self.action_space_low,
                                high=self.action_space_high)
    self.observation_space = Box(low=-np.inf, high=np.inf,
                                 shape=[88, 200, 3])

    # measurements
    self.measurements_size = (1, )
    self.pre_image = None
    self.first_debug = True
    self.channel_last = channel_last
def __init__(self, venv):
    """Init."""
    super().__init__(venv)
    self.observation_space = Tuple(
        [self.observation_space, self.observation_space, Discrete(3)])
def action_space(self):
    return Discrete(9)
from gym.spaces import Box, Dict, Discrete, Tuple, MultiDiscrete
import numpy as np
import unittest

import ray
from ray.rllib.agents.registry import get_trainer_class
from ray.rllib.examples.env.random_env import RandomEnv
from ray.rllib.models.tf.fcnet import FullyConnectedNetwork as FCNetV2
from ray.rllib.models.tf.visionnet import VisionNetwork as VisionNetV2
from ray.rllib.models.torch.visionnet import VisionNetwork as TorchVisionNetV2
from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFCNetV2
from ray.rllib.utils.error import UnsupportedSpaceException
from ray.rllib.utils.test_utils import framework_iterator

ACTION_SPACES_TO_TEST = {
    "discrete": Discrete(5),
    "vector": Box(-1.0, 1.0, (5, ), dtype=np.float32),
    "vector2": Box(-1.0, 1.0, (5, 5), dtype=np.float32),
    "multidiscrete": MultiDiscrete([1, 2, 3, 4]),
    "tuple": Tuple(
        [Discrete(2), Discrete(3), Box(-1.0, 1.0, (5, ), dtype=np.float32)]),
    "dict": Dict({
        "action_choice": Discrete(3),
        "parameters": Box(-1.0, 1.0, (1, ), dtype=np.float32),
        "yet_another_nested_dict": Dict({
            "a": Tuple([Discrete(2), Discrete(3)])
        })
    }),
}
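
# A hedged sketch of how a table like this is typically consumed; the
# loop below is an assumption, not the actual test body from this file.
def check_action_space_support(trainer_name, base_config):
    for name, action_space in ACTION_SPACES_TO_TEST.items():
        config = dict(
            base_config,
            env_config={
                "action_space": action_space,
                "observation_space": Box(-1.0, 1.0, (4, ), dtype=np.float32),
            })
        try:
            trainer = get_trainer_class(trainer_name)(
                config=config, env=RandomEnv)
            trainer.stop()
            print("{}: supported".format(name))
        except UnsupportedSpaceException:
            print("{}: not supported".format(name))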
from supersuit.utils.agent_indicator import (
    change_obs_space,
    change_observation,
    get_indicator_map,
)
from gym.spaces import Box, Discrete
import numpy as np
import pytest

obs_space_3d = Box(low=np.float32(0.0), high=np.float32(1.0), shape=(4, 4, 3))
obs_space_2d = Box(low=np.float32(0.0), high=np.float32(1.0), shape=(4, 3))
obs_space_1d = Box(low=np.float32(0.0), high=np.float32(1.0), shape=(3, ))
discrete_space = Discrete(3)
NUM_INDICATORS = 11


def test_obs_space():
    assert change_obs_space(obs_space_1d, NUM_INDICATORS).shape == (
        3 + NUM_INDICATORS, )
    assert change_obs_space(obs_space_2d, NUM_INDICATORS).shape == (
        4, 3, 1 + NUM_INDICATORS,
    )
    assert change_obs_space(obs_space_3d, NUM_INDICATORS).shape == (
        4, 4, 3 + NUM_INDICATORS,
    )
import numpy as np
from gym.spaces import Discrete, Box

action_space = Discrete(2)
observation_space = Box(-np.inf, np.inf, shape=(4, ), dtype=np.float32)
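
# A minimal sketch (an assumption, for illustration) of an env exposing
# these module-level spaces; the random observations and the 10-step
# horizon are made up, not from the source.
import gym


class RandomSpacesEnv(gym.Env):
    def __init__(self):
        self.action_space = action_space
        self.observation_space = observation_space
        self._t = 0

    def reset(self):
        self._t = 0
        return self.observation_space.sample()

    def step(self, action):
        assert self.action_space.contains(action)
        self._t += 1
        done = self._t >= 10
        return self.observation_space.sample(), 0.0, done, {}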
def __init__(self, config: dict):
    self.action_space = Discrete(config["n_actions"])
    self.observation_space = config["observation_space"]
    self.sim_model = config["sim_model"]
    self.sim_config = config["sim_config"]
def __init__(self, _):
    self.observation_space = Discrete(2)
    self.action_space = Tuple([Discrete(2), Discrete(2)])
    self.last_observation = None
def __init__(self, config=None):
    self.env = gym.make("CartPole-v0")
    self.action_space = Discrete(2)
    self.observation_space = self.env.observation_space
def __init__(self):
    self.max_steps = 20
    self.action_num = 4
    self.cur_step = -1
    self.observation_space = Box(0.0, 1.0, shape=(self.max_steps, ))
    self.action_space = Discrete(n=self.action_num)
def __init__(self):
    self.observation_space = Tuple(
        [Discrete(5), Box(0, 1, shape=(3, ), dtype=np.float32)])
def __init__(self, dim_action, dim_state, ep_length: int = 99):
    self.action_space = Discrete(dim_action)
    self.observation_space = Box(low=-np.inf, high=np.inf,
                                 shape=(dim_state, ), dtype=np.float32)
    self.ep_length = ep_length

    def zeros_per_exit():
        # One (900, 4) array per intersection exit ('出口' = "exit").
        return {'出口{}'.format(i): np.zeros((900, 4)) for i in range(1, 5)}

    self.n_m = zeros_per_exit()
    self.current_step = 0
    self.num_resets = -1  # Becomes 0 after __init__ exits.
    self.td_c = zeros_per_exit()  # private-car delay
    self.i_mc = zeros_per_exit()  # total social (non-bus) vehicles entering the intersection
    self.l_m = zeros_per_exit()  # left-turning vehicles at the four approaches
    self.s_m = zeros_per_exit()
    self.r_m = zeros_per_exit()
    self.trans_l_m = zeros_per_exit()  # left-turning vehicles at the four approaches
    self.trans_s_m = zeros_per_exit()
    self.trans_r_m = zeros_per_exit()
    self.remain_l_m = zeros_per_exit()  # left-turning vehicles at the four approaches
    self.remain_s_m = zeros_per_exit()
    self.remain_r_m = zeros_per_exit()
    self.actal_o_m = {'1-2': np.zeros((900, 4)), '2-1': np.zeros((900, 4))}
    self.ZhuanYi1to2 = []
    self.ZhuanYi2to1 = []
    self.o_m = zeros_per_exit()  # social vehicles plus buses leaving the intersection
    self.remain_m = zeros_per_exit()  # social vehicles plus buses leaving the intersection
    self.qc1 = 0.05
    self.qc2 = 0.05
    self.C = 120
    self.vc = {'区域1': np.ones(900), '区域2': np.ones(900)}  # '区域' = "area"
    self.sta_flow = 0.4  # saturation flow
from gym.spaces import Box, Dict, Discrete, MultiDiscrete, Tuple
import numpy as np
import unittest

import ray
from ray.rllib.agents.registry import get_agent_class
from ray.rllib.examples.env.random_env import RandomEnv
from ray.rllib.models.tf.fcnet import FullyConnectedNetwork as FCNetV2
from ray.rllib.models.tf.visionnet import VisionNetwork as VisionNetV2
from ray.rllib.models.torch.visionnet import VisionNetwork as TorchVisionNetV2
from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFCNetV2
from ray.rllib.utils.error import UnsupportedSpaceException
from ray.rllib.utils.test_utils import framework_iterator

ACTION_SPACES_TO_TEST = {
    "discrete": Discrete(5),
    "vector": Box(-1.0, 1.0, (5, ), dtype=np.float32),
    "vector2": Box(-1.0, 1.0, (5, 5), dtype=np.float32),
    "multidiscrete": MultiDiscrete([1, 2, 3, 4]),
    "tuple": Tuple(
        [Discrete(2), Discrete(3), Box(-1.0, 1.0, (5, ), dtype=np.float32)]),
    "dict": Dict({
        "action_choice": Discrete(3),
        "parameters":
def test_traj_view_simple_performance(self):
    """Test whether PPOTrainer runs faster w/ `_use_trajectory_view_api`."""
    config = copy.deepcopy(ppo.DEFAULT_CONFIG)
    action_space = Discrete(2)
    obs_space = Box(-1.0, 1.0, shape=(700, ))

    from ray.rllib.examples.env.random_env import RandomMultiAgentEnv
    from ray.tune import register_env
    register_env(
        "ma_env",
        lambda c: RandomMultiAgentEnv({
            "num_agents": 2,
            "p_done": 0.0,
            "max_episode_len": 104,
            "action_space": action_space,
            "observation_space": obs_space
        }))

    config["num_workers"] = 3
    config["num_envs_per_worker"] = 8
    config["num_sgd_iter"] = 1  # Put less weight on training.

    policies = {
        "pol0": (None, obs_space, action_space, {}),
    }

    def policy_fn(agent_id):
        return "pol0"

    config["multiagent"] = {
        "policies": policies,
        "policy_mapping_fn": policy_fn,
    }
    num_iterations = 2

    # Only works in torch so far.
    for _ in framework_iterator(config, frameworks="torch"):
        print("w/ traj. view API")
        config["_use_trajectory_view_api"] = True
        trainer = ppo.PPOTrainer(config=config, env="ma_env")
        learn_time_w = 0.0
        sampler_perf_w = {}
        start = time.time()
        for i in range(num_iterations):
            out = trainer.train()
            ts = out["timesteps_total"]
            sampler_perf_ = out["sampler_perf"]
            sampler_perf_w = {
                k:
                sampler_perf_w.get(k, 0.0) + (sampler_perf_[k] * 1000 / ts)
                for k, v in sampler_perf_.items()
            }
            delta = out["timers"]["learn_time_ms"] / ts
            learn_time_w += delta
            print("{}={}s".format(i, delta))
        sampler_perf_w = {
            k: sampler_perf_w[k] / (num_iterations if "mean_" in k else 1)
            for k, v in sampler_perf_w.items()
        }
        duration_w = time.time() - start
        print("Duration: {}s "
              "sampler-perf.={} learn-time/iter={}s".format(
                  duration_w, sampler_perf_w,
                  learn_time_w / num_iterations))
        trainer.stop()

        print("w/o traj. view API")
        config["_use_trajectory_view_api"] = False
        trainer = ppo.PPOTrainer(config=config, env="ma_env")
        learn_time_wo = 0.0
        sampler_perf_wo = {}
        start = time.time()
        for i in range(num_iterations):
            out = trainer.train()
            ts = out["timesteps_total"]
            sampler_perf_ = out["sampler_perf"]
            sampler_perf_wo = {
                k:
                sampler_perf_wo.get(k, 0.0) + (sampler_perf_[k] * 1000 / ts)
                for k, v in sampler_perf_.items()
            }
            delta = out["timers"]["learn_time_ms"] / ts
            learn_time_wo += delta
            print("{}={}s".format(i, delta))
        sampler_perf_wo = {
            k: sampler_perf_wo[k] / (num_iterations if "mean_" in k else 1)
            for k, v in sampler_perf_wo.items()
        }
        duration_wo = time.time() - start
        print("Duration: {}s "
              "sampler-perf.={} learn-time/iter={}s".format(
                  duration_wo, sampler_perf_wo,
                  learn_time_wo / num_iterations))
        trainer.stop()

        # Assert `_use_trajectory_view_api` is faster.
        self.assertLess(sampler_perf_w["mean_raw_obs_processing_ms"],
                        sampler_perf_wo["mean_raw_obs_processing_ms"])
        self.assertLess(sampler_perf_w["mean_action_processing_ms"],
                        sampler_perf_wo["mean_action_processing_ms"])
        self.assertLess(duration_w, duration_wo)
def action_space(self):
    physical = Discrete(8)
    # comm_disc = Tuple(list(Discrete(1) for i in range(HARVEST_COMM_BITS)))
    comm = Box(low=0.0, high=1.0, shape=(15, ), dtype=np.float32)
    total = Tuple((physical, comm))
    return total
class DiscTwoQuadrantConverter(DiscConverter):
    """
    Key:
        'Disc-2QC'

    Switching States / Actions:
        | 0: Both Transistors off.
        | 1: Upper Transistor on.
        | 2: Lower Transistor on.

    Action Space:
        Discrete(3)

    Output Voltages and Currents:
        | voltages: (0, 1)
        | currents: (-1, 1)

    Output Voltage Space:
        Box(0, 1, shape=(1,))
    """
    voltages = (0, 1)
    currents = (-1, 1)
    action_space = Discrete(3)

    def convert(self, i_out, t):
        # Docstring in base class
        # The converter switches slightly (tau / 1000 seconds) before the
        # interlocking time elapses, due to inaccuracy of the solvers.
        if t - self._tau / 1000 \
                > self._action_start_time + self._interlocking_time:
            self._switching_state = self._switching_pattern[-1]
        else:
            self._switching_state = self._switching_pattern[0]
        if self._switching_state == 0:
            if i_out[0] < 0:
                return [1]
            elif i_out[0] >= 0:
                return [0.0]
        elif self._switching_state == 1:
            return [1]
        elif self._switching_state == 2:
            return [0.0]
        else:
            raise Exception('Invalid switching state of the converter')

    def i_sup(self, i_out):
        # Docstring in base class
        if self._switching_state == 0:
            return i_out[0] if i_out[0] < 0 else 0
        elif self._switching_state == 1:
            return i_out[0]
        elif self._switching_state == 2:
            return 0
        else:
            raise Exception('Invalid switching state of the converter')

    def _set_switching_pattern(self, *_, **__):
        # Docstring in base class
        if (self._current_action == 0
                or self._switching_state == 0
                or self._current_action == self._switching_state
                or self._interlocking_time == 0):
            self._switching_pattern = [self._current_action]
            return [self._action_start_time + self._tau]
        else:
            self._switching_pattern = [0, self._current_action]
            return [
                self._action_start_time + self._interlocking_time,
                self._action_start_time + self._tau
            ]
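
# A hedged driver sketch (assumption): apply an action via the base-class
# set_action(), then query the converter output at the switching times it
# reports. The tau value and the load current of 0.5 p.u. are illustrative.
conv = DiscTwoQuadrantConverter(tau=1e-5)
conv.reset()
switching_times = conv.set_action(1, 0.0)  # upper transistor on at t = 0
for t_switch in switching_times:
    u_out = conv.convert([0.5], t_switch)  # output voltage for i_out = 0.5
    i_sup = conv.i_sup([0.5])              # resulting supply current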
def policy_mapping_fn(agent_id, episode, worker, **kwargs):
    if agent_id.startswith("low_level_"):
        return "low_level_policy"
    else:
        return "high_level_policy"

config = {
    "env": HierarchicalWindyMazeEnv,
    "num_workers": 0,
    "entropy_coeff": 0.01,
    "multiagent": {
        "policies": {
            "high_level_policy": (
                None,
                maze.observation_space,
                Discrete(4),
                {"gamma": 0.9},
            ),
            "low_level_policy": (
                None,
                Tuple([maze.observation_space, Discrete(4)]),
                maze.action_space,
                {"gamma": 0.0},
            ),
        },
        "policy_mapping_fn": function(policy_mapping_fn),
class DiscB6BridgeConverter(DiscConverter):
    """
    The discrete B6 bridge converter (B6C) is simulated with three
    discrete 2QCs.

    Key:
        'Disc-B6C'

    Actions:
        +-+-----+-----+-----+
        | |H_1  |H_2  |H_3  |
        +=+=====+=====+=====+
        |0|lower|lower|lower|
        +-+-----+-----+-----+
        |1|lower|lower|upper|
        +-+-----+-----+-----+
        |2|lower|upper|lower|
        +-+-----+-----+-----+
        |3|lower|upper|upper|
        +-+-----+-----+-----+
        |4|upper|lower|lower|
        +-+-----+-----+-----+
        |5|upper|lower|upper|
        +-+-----+-----+-----+
        |6|upper|upper|lower|
        +-+-----+-----+-----+
        |7|upper|upper|upper|
        +-+-----+-----+-----+

    Action Space:
        Discrete(8)

    Output Voltages and Currents:
        | voltages: (-1, 1)
        | currents: (-1, 1)

    Output Voltage Space:
        Box(-0.5, 0.5, shape=(3,))
    """
    action_space = Discrete(8)
    # Positive and negative voltages can be applied.
    voltages = (-1, 1)
    # Positive and negative currents are possible.
    currents = (-1, 1)
    _reset_action = 0
    _subactions = [
        [2, 2, 2], [2, 2, 1], [2, 1, 2], [2, 1, 1],
        [1, 2, 2], [1, 2, 1], [1, 1, 2], [1, 1, 1]
    ]

    def __init__(self, tau=1e-5, **kwargs):
        # Docstring in base class
        super().__init__(tau=tau, **kwargs)
        self._subconverters = [
            DiscTwoQuadrantConverter(tau=tau, **kwargs),
            DiscTwoQuadrantConverter(tau=tau, **kwargs),
            DiscTwoQuadrantConverter(tau=tau, **kwargs),
        ]

    def reset(self):
        # Docstring in base class
        return [
            self._subconverters[0].reset()[0] - 0.5,
            self._subconverters[1].reset()[0] - 0.5,
            self._subconverters[2].reset()[0] - 0.5,
        ]

    def convert(self, i_out, t):
        # Docstring in base class
        u_out = [
            self._subconverters[0].convert([i_out[0]], t)[0] - 0.5,
            self._subconverters[1].convert([i_out[1]], t)[0] - 0.5,
            self._subconverters[2].convert([i_out[2]], t)[0] - 0.5
        ]
        return u_out

    def set_action(self, action, t):
        # Docstring in base class
        subactions = self._subactions[action]
        times = []
        times += self._subconverters[0].set_action(subactions[0], t)
        times += self._subconverters[1].set_action(subactions[1], t)
        times += self._subconverters[2].set_action(subactions[2], t)
        return sorted(list(set(times)))

    def i_sup(self, i_out):
        # Docstring in base class
        return sum([
            subconverter.i_sup([i_out_])
            for subconverter, i_out_ in zip(self._subconverters, i_out)
        ])
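
# Hedged example (assumption, for illustration): action 5 fans out
# through _subactions[5] = [1, 2, 1], i.e. (upper, lower, upper) on the
# three half-bridges; the merged switching times come back sorted.
b6 = DiscB6BridgeConverter(tau=1e-5)
u_abc = b6.reset()            # three phase voltages, offset by -0.5 each
times = b6.set_action(5, 0.0)  # fans out to the three 2QC subconverters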
class NanoworldEnv(MultiAgentEnv):
    # Constants
    agents = ('passenger', 'driver')
    max_num_actions = 8
    destination = ["", "starbucks", "peets"]

    # Action spaces
    passenger_actions = [
        "wait for driver", "say starbucks", "say peets",
        "mental starbucks", "mental peets"
    ]
    # passenger_actions = ["wait for driver", "say starbucks", "say peets"]
    passenger_action_space = Discrete(len(passenger_actions))
    driver_actions = ["wait for passenger", "drive starbucks", "drive peets"]
    driver_action_space = Discrete(len(driver_actions))

    # Observation spaces:
    # wait / say starbucks / say peets -- can be repeated at most 4 times --
    # plus the mental state (none, starbucks, peets).
    passenger_observation_space = Dict({
        'dialog_history': Repeated(Discrete(3), max_len=max_num_actions),
        'destination': Discrete(3)
    })
    # wait / say starbucks / say peets -- can be repeated at most 4 times.
    driver_observation_space = Dict(
        {'dialog_history': Repeated(Discrete(3), max_len=max_num_actions)})

    def __init__(self, config):
        destination_id = random.randint(1, 2)
        self.state = DialogStateNano(
            NanoworldEnv.max_num_actions,
            desired_destination=NanoworldEnv.destination[destination_id])
        self.num_episodes = 0
        # self.is_supervised = config['is_supervised']

    def reset(self):
        """Called before each episode; returns the first observation."""
        if self.num_episodes % 1000 == 0:
            logger.warning("completed {} episodes.".format(self.num_episodes))
        if self.num_episodes >= 10000:
            logger.warning('episode ' + str(self.num_episodes))
            logger.warning('------------')
            _, _, history, _ = self.state.get_global_state()
            for h in history:
                logger.warning(h)
            logger.warning('-------------')
        self.num_episodes += 1
        destination_id = random.randint(1, 2)
        if self.num_episodes >= 10000:
            logger.warning('set destination: ' +
                           NanoworldEnv.destination[destination_id])
        self.state = DialogStateNano(
            NanoworldEnv.max_num_actions,
            desired_destination=NanoworldEnv.destination[destination_id])
        self.obs = {
            'driver': self.state.make_driver_observation(),
            'passenger': self.state.make_passenger_observation()
        }
        return self.obs

    def driver_step(self, action):
        self.state.update_state(NanoworldEnv.driver_actions[action])
        obs = self.state.make_driver_observation()
        return obs

    def passenger_step(self, action):
        self.state.update_state(NanoworldEnv.passenger_actions[action])
        obs = self.state.make_passenger_observation()
        return obs

    def compute_passenger_reward(self):
        # if self.is_supervised:
        #     return self.compute_episode_reward_supervised()
        # else:
        return self.compute_episode_reward()

    def compute_driver_reward(self):
        # return self.compute_episode_reward()
        driver_reward = 0
        desired_destination, verbal_history, all_actions, \
            driven_destination = self.state.get_global_state()
        if self.state.dialog_complete:  # computed at the very end
            if driven_destination:
                if len(verbal_history) == 0:
                    # The driver drives before the passenger says anything.
                    return -1
                else:
                    last_uttered_destination = \
                        verbal_history[-1].split(" ")[1]
                    if driven_destination == last_uttered_destination:
                        return 1
                    else:
                        return -1
            else:  # timeout
                return -10
        else:
            return 0

    def compute_episode_reward(self):
        desired_destination, verbal_history, all_actions, \
            driven_destination = self.state.get_global_state()
        if self.state.dialog_complete:  # computed at the very end
            if driven_destination:
                if desired_destination == driven_destination:
                    return 1
                else:
                    return -1
            else:  # timeout
                return -10
        else:
            return 0

    def step(self, action_dict):
        """Given an action_dict, compute the next observation, rewards, and dones."""
        if 'driver' in action_dict:
            driver_obs = self.driver_step(action_dict['driver'])
            if self.state.is_done():
                driver_reward = self.compute_driver_reward()
                return {'driver': driver_obs,
                        'passenger': self.state.make_passenger_observation()}, \
                       {'driver': driver_reward,
                        'passenger': self.compute_passenger_reward()}, \
                       {'__all__': self.state.is_done()}, {}
        if 'passenger' in action_dict:
            passenger_obs = self.passenger_step(action_dict['passenger'])
        self.obs = {'driver': driver_obs, 'passenger': passenger_obs}
        self.rewards = {
            'driver': self.compute_driver_reward(),
            'passenger': self.compute_passenger_reward()
        }
        self.dones = {'__all__': self.state.is_done()}
        self.infos = {}
        return self.obs, self.rewards, self.dones, self.infos
def __init__(self, env_config):
    # Static Parameters
    self.size = 10
    self.reward_density = .1
    self.penalty_density = .02
    self.obs_size = 5
    self.max_global_steps = (self.size * 2) ** 2
    self.log_frequency = 10
    self.action_dict = {
        0: 'move 1',      # Move one block forward
        1: 'turn 1',      # Turn 90 degrees to the right
        2: 'turn -1',     # Turn 90 degrees to the left
        3: 'attack 1',    # Destroy block
        4: 'jumpmove 1'   # Jump up and move forward 1 block
    }
    self.blocks_dict = {
        "redstone_ore": 1,
        "coal_ore": 2,
        "emerald_ore": 3,
        "iron_ore": 4,
        "gold_ore": 5,
        "diamond_ore": 6,
        "lava": -1,
        "flowing_lava": -1
    }

    # RLlib Parameters
    self.action_space = Discrete(len(self.action_dict))
    self.observation_space = Box(
        -1, 6,
        shape=(np.prod([2, self.obs_size, self.obs_size]), ),
        dtype=np.int32)

    # Malmo Parameters
    self.agent_host = MalmoPython.AgentHost()
    try:
        self.agent_host.parse(sys.argv)
    except RuntimeError as e:
        print('ERROR:', e)
        print(self.agent_host.getUsage())
        exit(1)

    # ResourceCollector Parameters
    self.obs = None
    self.obsdict = None  # Stores the last JSON-loaded observation
    self.episode_step = 0
    self.episode_return = 0
    self.returns = []
    self.resources_collected = {
        "diamond": [0],
        "redstone": [0],
        "coal": [0],
        "emerald": [0],
        "iron_ore": [0],
        "gold_ore": [0]
    }
    self.deaths = []
    self.death_occurred = False
    self.steps = []
    self.episode_start = time.time()
    self.episode_end = time.time()
from .dummy_gym_env import DummyEnv
from gym.spaces import Box, Discrete
import numpy as np
from supersuit import (frame_stack_v0, reshape_v0, observation_lambda_v0,
                       action_lambda_v0, dtype_v0)
import supersuit
import pytest

base_obs = (np.zeros([8, 8, 3]) + np.arange(3)).astype(np.float32)
base_obs_space = Box(low=np.float32(0.), high=np.float32(10.), shape=[8, 8, 3])
base_act_spaces = Discrete(5)


def test_reshape():
    base_env = DummyEnv(base_obs, base_obs_space, base_act_spaces)
    env = reshape_v0(base_env, (64, 3))
    obs = env.reset()
    assert obs.shape == (64, 3)
    first_obs, _, _, _ = env.step(5)
    assert np.all(np.equal(first_obs, base_obs.reshape([64, 3])))


def new_continuous_dummy():
    base_act_spaces = Box(low=np.float32(0.), high=np.float32(10.), shape=[3])
    return DummyEnv(base_obs, base_obs_space, base_act_spaces)


def new_dummy():
    return DummyEnv(base_obs, base_obs_space, base_act_spaces)


wrappers = [
def __init__(self,
             grid_size: tuple,
             n_players: int = 2,
             max_turns=100,
             final_reward=100,
             piece_types=None,
             policies_other=None,
             observe_all=False,
             multi_discrete_actions=False,
             flat_observations=False,
             render=False,
             cell_size=50,
             padding=5,
             ui_font_size=14,
             seed=None):
    """
    :param grid_size: tuple specifying the dimensions of the game's board.
    :param n_players: number of players participating in the game.
    :param max_turns: maximum number of turns per episode.
    :param final_reward: amount of final reward given to the winner and taken from the losers.
    :param piece_types: list of dict configs describing possible pieces.
    :param policies_other: list of policies to use for opponent players.
    :param observe_all: whether to return observations on `step()` for all players in the info dict or not.
    :param multi_discrete_actions: whether to use a multi-discrete action space.
    :param flat_observations: whether to flatten the observations or return them as a tensor.
    :param render: enables rendering when calling `render()`.
    :param cell_size: width/height of a cell when rendering.
    :param padding: padding between cells when rendering.
    :param ui_font_size: size of the UI font when rendering.
    :param seed: random seed.
    """
    grid_size = tuple(grid_size)
    if policies_other is not None:
        assert n_players - 1 == len(policies_other), \
            'please provide a policy for each opponent.'
    self.n_players = n_players
    self.policies_other = policies_other
    self.observe_all = observe_all

    if piece_types is None:
        self.piece_types = self._get_default_piece_types()
    else:
        self.piece_types = piece_types
    n_piece_types = len(self.piece_types)

    # actions: (cursor move direction, piece_type),
    # where (cursor move direction) encodes +1 or -1 movement along an
    # axis and 0 for no movement.
    n_move_directions = 1 + 2 * len(grid_size)
    if multi_discrete_actions:
        self.action_space = MultiDiscrete([n_move_directions, n_piece_types])
    else:
        self.action_space = Discrete(n_move_directions * n_piece_types)

    # observation space:
    # (d_0 * ... * d_n * piece_type * player
    #  + cursor_d_0 + ... + cursor_d_n + population + room)
    k_cursor_features = len(grid_size) if flat_observations else 1
    obs_dims = grid_size + (1 + (n_piece_types - 1) * n_players, )
    self.observation_space = OneHotBox(
        OneHot(obs_dims),
        Box(0.0, 1.0, shape=(2 + k_cursor_features, )),
        flatten=flat_observations)

    self.game = ExpandoGame(grid_size, n_players, max_turns,
                            final_reward=final_reward,
                            piece_types=self.piece_types, seed=seed)
    self.observation_format = 'flat' if flat_observations else 'grid'

    self.do_render = render
    if self.do_render:
        self.renderer = GameRenderer(self.game, cell_size, padding,
                                     ui_font_size)
    self.seed(seed)
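
# When multi_discrete_actions is False, the two sub-actions are packed
# into a single Discrete index. A hedged sketch of the corresponding
# decode; the row-major packing order (direction-major) is an assumption
# about the game's convention, and the helper name is hypothetical.
def decode_flat_action(flat_action, n_piece_types):
    direction, piece_type = divmod(flat_action, n_piece_types)
    return direction, piece_type

# e.g. on a 2-D grid (5 move directions) with 3 piece types:
assert decode_flat_action(7, n_piece_types=3) == (2, 1)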
def __init__(self):
    space = Discrete(5)
    super().__init__(space)
def action_space(self):
    return Discrete(len(Actions))
class TwoStepGame(MultiAgentEnv):
    action_space = Discrete(2)

    def __init__(self, env_config):
        self.state = None
        self.agent_1 = 0
        self.agent_2 = 1
        # MADDPG emits action logits instead of actual discrete actions.
        self.actions_are_logits = env_config.get("actions_are_logits", False)
        self.one_hot_state_encoding = env_config.get(
            "one_hot_state_encoding", False)
        self.with_state = env_config.get("separate_state_space", False)

        if not self.one_hot_state_encoding:
            self.observation_space = Discrete(6)
            self.with_state = False
        else:
            # Each agent gets the full state (one-hot encoding of which of
            # the three states is active) as input, with the receiving
            # agent's ID (1 or 2) concatenated onto the end.
            if self.with_state:
                self.observation_space = Dict({
                    "obs": MultiDiscrete([2, 2, 2, 3]),
                    ENV_STATE: MultiDiscrete([2, 2, 2])
                })
            else:
                self.observation_space = MultiDiscrete([2, 2, 2, 3])

    def reset(self):
        self.state = np.array([1, 0, 0])
        return self._obs()

    def step(self, action_dict):
        if self.actions_are_logits:
            action_dict = {
                k: np.random.choice([0, 1], p=v)
                for k, v in action_dict.items()
            }

        state_index = np.flatnonzero(self.state)
        if state_index == 0:
            action = action_dict[self.agent_1]
            assert action in [0, 1], action
            if action == 0:
                self.state = np.array([0, 1, 0])
            else:
                self.state = np.array([0, 0, 1])
            global_rew = 0
            done = False
        elif state_index == 1:
            global_rew = 7
            done = True
        else:
            if action_dict[self.agent_1] == 0 and \
                    action_dict[self.agent_2] == 0:
                global_rew = 0
            elif action_dict[self.agent_1] == 1 and \
                    action_dict[self.agent_2] == 1:
                global_rew = 8
            else:
                global_rew = 1
            done = True

        rewards = {
            self.agent_1: global_rew / 2.0,
            self.agent_2: global_rew / 2.0
        }
        obs = self._obs()
        dones = {"__all__": done}
        infos = {}
        return obs, rewards, dones, infos

    def _obs(self):
        if self.with_state:
            return {
                self.agent_1: {
                    "obs": self.agent_1_obs(),
                    ENV_STATE: self.state
                },
                self.agent_2: {
                    "obs": self.agent_2_obs(),
                    ENV_STATE: self.state
                }
            }
        else:
            return {
                self.agent_1: self.agent_1_obs(),
                self.agent_2: self.agent_2_obs()
            }

    def agent_1_obs(self):
        if self.one_hot_state_encoding:
            return np.concatenate([self.state, [1]])
        else:
            return np.flatnonzero(self.state)[0]

    def agent_2_obs(self):
        if self.one_hot_state_encoding:
            return np.concatenate([self.state, [2]])
        else:
            return np.flatnonzero(self.state)[0] + 3
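
# A minimal rollout sketch for this env (assumption, for illustration:
# both agents act randomly until the episode ends).
env = TwoStepGame({"one_hot_state_encoding": False})
obs = env.reset()
done = {"__all__": False}
while not done["__all__"]:
    actions = {agent: TwoStepGame.action_space.sample()
               for agent in (env.agent_1, env.agent_2)}
    obs, rewards, done, infos = env.step(actions)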
if __name__ == "__main__":
    args = parser.parse_args()
    ray.init(num_cpus=args.num_cpus or None, local_mode=args.local_mode)

    register_env("NestedSpaceRepeatAfterMeEnv",
                 lambda c: NestedSpaceRepeatAfterMeEnv(c))

    config = {
        "env": "NestedSpaceRepeatAfterMeEnv",
        "env_config": {
            "space": Dict({
                "a": Tuple(
                    [Dict({
                        "d": Box(-10.0, 10.0, ()),
                        "e": Discrete(2)
                    })]),
                "b": Box(-10.0, 10.0, (2, )),
                "c": Discrete(4)
            }),
        },
        "entropy_coeff": 0.00005,  # We don't want high entropy in this Env.
        "gamma": 0.0,  # No history in Env (bandit problem).
        "lr": 0.0005,
        "num_envs_per_worker": 20,
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        "num_sgd_iter": 4,
        "num_workers": 0,