Example #1
from gym.spaces import Discrete, Tuple
from ray.rllib.agents.a3c import A3CTrainer

N_TOPICS = 15
TOPICS = ['T{}'.format(i) for i in range(N_TOPICS)]

CONTEXT_ATTRIBUTES = {
    'hour': ['0-7', '8-9', '10-12', '13-14', '15-18', '19-21', '22-23'],
    'week period': ['Weekday', 'Weekend'],
    'weather': ['Sunny', 'Cloudy', 'Raining'],
    'device': ['mobile ios', 'mac ios', 'mobile android', 'windows']
}

OBSERVATION_0 = len(CONTEXT_ATTRIBUTES) * [0] + N_TOPICS * [0]

OBSERVATION_SPACE = Tuple(
    (Discrete(7), Discrete(2), Discrete(3), Discrete(4), Discrete(2),
     Discrete(2), Discrete(2), Discrete(2), Discrete(2), Discrete(2),
     Discrete(2), Discrete(2), Discrete(2), Discrete(2), Discrete(2),
     Discrete(2), Discrete(2), Discrete(2), Discrete(2)))

ACTION_SPACE = Tuple(
    (Discrete(2), Discrete(2), Discrete(2), Discrete(2), Discrete(2),
     Discrete(2), Discrete(2), Discrete(2), Discrete(2), Discrete(2),
     Discrete(2), Discrete(2), Discrete(2), Discrete(2), Discrete(2)))

# Probability of a user click based on the distance between article topics
PROBAB = N_TOPICS * [0]
PROBAB[1:8] = [0.2, 0.5, 0.7, 0.4, 0.3, 0.2, 0.1]


def distance(article1, article2):
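A hypothetical continuation (not the original code): the snippet cuts off at distance(); one plausible reading is that a topic-index distance is used to look up a click probability in PROBAB, as in this sketch.

def distance_sketch(topic_a, topic_b):
    # Hypothetical helper: absolute topic-index distance, clipped to the table size.
    return min(abs(topic_a - topic_b), N_TOPICS - 1)

def click_probability_sketch(topic_a, topic_b):
    # Click probability for that distance; PROBAB is zero outside indices 1-7.
    return PROBAB[distance_sketch(topic_a, topic_b)]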
Example #2
 def change_space_fn(space):
     return Discrete(space.n + 1)
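Assuming change_space_fn is available as a plain function, its effect is simply to add one extra slot to a Discrete space (a minimal sketch):

from gym.spaces import Discrete
assert change_space_fn(Discrete(4)) == Discrete(5)  # one additional action/observation value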
Example #3
        Dict({
            "obs": MultiDiscrete([2, 2, 2, 3]),
            ENV_STATE: MultiDiscrete([2, 2, 2])
        }),
    ])
    act_space = Tuple([
        TwoStepGame.action_space,
        TwoStepGame.action_space,
    ])
    register_env(
        "grouped_twostep", lambda config: TwoStepGame(config).
        with_agent_groups(grouping, obs_space=obs_space, act_space=act_space))

    if args.run == "contrib/MADDPG":
        obs_space_dict = {
            "agent_1": Discrete(6),
            "agent_2": Discrete(6),
        }
        act_space_dict = {
            "agent_1": TwoStepGame.action_space,
            "agent_2": TwoStepGame.action_space,
        }
        config = {
            "learning_starts": 100,
            "env_config": {
                "actions_are_logits": True,
            },
            "multiagent": {
                "policies": {
                    "pol1": (None, Discrete(6), TwoStepGame.action_space, {
                        "agent_id": 0,
Example #4
 def __init__(self, rl_agent, fixed_agents, setting, max_steps=30):
     self.rl_agent = rl_agent
     self.action_space = Discrete(rl_agent.discretization)
     super().__init__([rl_agent], fixed_agents, setting, max_steps)
Example #5
    def __init__(self, num_speedup_steps=30, require_explicit_reset=True, is_render_enabled=False,
                 early_termination_enabled=False, run_offscreen=False, save_screens=False,
                 port=2000, gpu=0, discrete_control=True, kill_when_connection_lost=True, city_name="Town01",
                 channel_last=True, action_num=2):
        EnvironmentWrapper.__init__(self, is_render_enabled, save_screens)

        print("port:", port)

        self.episode_max_time = 1000000
        self.allow_braking = True
        self.log_path = os.path.join(DEFAULT_CARLA_LOG_DIR, "CarlaLogs.txt")
        self.num_speedup_steps = num_speedup_steps
        self.is_game_ready_for_input = False
        self.run_offscreen = run_offscreen
        self.kill_when_connection_lost = kill_when_connection_lost
        # server configuration
        self.port = port
        self.gpu = gpu
        self.host = 'localhost'
        self.level = 'town1'
        self.map = CarlaLevel().get(self.level)

        # experiment = basic_experiment_suite.BasicExperimentSuite(city_name)
        experiment = CoRL2017(city_name)
        self.experiments = experiment.get_experiments()
        self.experiment_type = 0
        self.planner = Planner(city_name)

        self.car_speed = 0
        self.is_game_setup = False  # Will be true only when setup_client_and_server() is called, either explicitly, or by reset()

        # action space
        self.discrete_controls = discrete_control
        self.action_space_size = action_num
        self.action_space_high = np.array([1]*action_num)
        self.action_space_low = np.array([-1]*action_num)
        self.action_space_abs_range = np.maximum(np.abs(self.action_space_low), np.abs(self.action_space_high))
        self.steering_strength = 0.35
        self.gas_strength = 1.0
        self.brake_strength = 0.6
        self.actions = {0: [0., 0.],
                        1: [0., -self.steering_strength],
                        2: [0., self.steering_strength],
                        3: [self.gas_strength - 0.15, 0.],
                        4: [-self.brake_strength, 0],
                        5: [self.gas_strength - 0.3, -self.steering_strength],
                        6: [self.gas_strength - 0.3, self.steering_strength],
                        7: [-self.brake_strength, -self.steering_strength],
                        8: [-self.brake_strength, self.steering_strength]}
        self.actions_description = ['NO-OP', 'TURN_LEFT', 'TURN_RIGHT', 'GAS', 'BRAKE',
                                    'GAS_AND_TURN_LEFT', 'GAS_AND_TURN_RIGHT',
                                    'BRAKE_AND_TURN_LEFT', 'BRAKE_AND_TURN_RIGHT']
        if discrete_control:
            self.action_space = Discrete(len(self.actions))
        else:
            self.action_space = Box(low=self.action_space_low, high=self.action_space_high)
        self.observation_space = Box(low=-np.inf, high=np.inf, shape=[88, 200, 3])

        # measurements
        self.measurements_size = (1,)

        self.pre_image = None
        self.first_debug = True
        self.channel_last = channel_last
Example #6
 def __init__(self, venv):
     """Init."""
     super().__init__(venv)
     self.observation_space = Tuple(
         [self.observation_space, self.observation_space,
          Discrete(3)])
Example #7
 def action_space(self):
     return Discrete(9)
Example #8
from gym.spaces import Box, Dict, Discrete, Tuple, MultiDiscrete
import numpy as np
import unittest

import ray
from ray.rllib.agents.registry import get_trainer_class
from ray.rllib.examples.env.random_env import RandomEnv
from ray.rllib.models.tf.fcnet import FullyConnectedNetwork as FCNetV2
from ray.rllib.models.tf.visionnet import VisionNetwork as VisionNetV2
from ray.rllib.models.torch.visionnet import VisionNetwork as TorchVisionNetV2
from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFCNetV2
from ray.rllib.utils.error import UnsupportedSpaceException
from ray.rllib.utils.test_utils import framework_iterator

ACTION_SPACES_TO_TEST = {
    "discrete": Discrete(5),
    "vector": Box(-1.0, 1.0, (5, ), dtype=np.float32),
    "vector2": Box(-1.0, 1.0, (5, 5), dtype=np.float32),
    "multidiscrete": MultiDiscrete([1, 2, 3, 4]),
    "tuple": Tuple(
        [Discrete(2),
         Discrete(3),
         Box(-1.0, 1.0, (5, ), dtype=np.float32)]),
    "dict": Dict({
        "action_choice": Discrete(3),
        "parameters": Box(-1.0, 1.0, (1, ), dtype=np.float32),
        "yet_another_nested_dict": Dict({
            "a": Tuple([Discrete(2), Discrete(3)])
        })
    }),
}
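A quick sanity check over a space table like this one (a minimal sketch, not part of the original test): every gym space should accept its own samples.

for name, space in ACTION_SPACES_TO_TEST.items():
    sample = space.sample()
    # Discrete, Box, MultiDiscrete, Tuple and Dict all implement sample() and contains().
    assert space.contains(sample), "{} rejected its own sample".format(name)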
Example #9
from supersuit.utils.agent_indicator import (
    change_obs_space,
    change_observation,
    get_indicator_map,
)
from gym.spaces import Box, Discrete
import numpy as np
import pytest

obs_space_3d = Box(low=np.float32(0.0), high=np.float32(1.0), shape=(4, 4, 3))
obs_space_2d = Box(low=np.float32(0.0), high=np.float32(1.0), shape=(4, 3))
obs_space_1d = Box(low=np.float32(0.0), high=np.float32(1.0), shape=(3, ))

discrete_space = Discrete(3)

NUM_INDICATORS = 11


def test_obs_space():
    assert change_obs_space(obs_space_1d,
                            NUM_INDICATORS).shape == (3 + NUM_INDICATORS, )
    assert change_obs_space(obs_space_2d, NUM_INDICATORS).shape == (
        4,
        3,
        1 + NUM_INDICATORS,
    )
    assert change_obs_space(obs_space_3d, NUM_INDICATORS).shape == (
        4,
        4,
        3 + NUM_INDICATORS,
    )
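Conceptually, the agent indicator appends a one-hot agent ID to each observation, which is what the shape changes above reflect. A pure-NumPy sketch of the 1-D case (illustrative only, not the supersuit implementation):

def append_indicator_1d(obs, agent_index, num_indicators=NUM_INDICATORS):
    # One-hot encode the agent index and concatenate it onto the flat observation.
    indicator = np.zeros(num_indicators, dtype=obs.dtype)
    indicator[agent_index] = 1.0
    return np.concatenate([obs, indicator])

assert append_indicator_1d(np.zeros(3, dtype=np.float32), 2).shape == (3 + NUM_INDICATORS,)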
Example #10
import numpy as np
from gym.spaces import Discrete, Box

action_space = Discrete(2)
observation_space = Box(-np.inf, np.inf, shape=(4, ), dtype=np.float32)
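These two spaces match the classic CartPole interface: two discrete actions and a 4-dimensional float32 observation. A minimal usage sketch:

action = action_space.sample()       # 0 or 1
obs = observation_space.sample()     # float32 array of shape (4,)
assert action_space.contains(action) and observation_space.contains(obs)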
Example #11
 def __init__(self, config: dict):
     self.action_space = Discrete(config["n_actions"])
     self.observation_space = config["observation_space"]
     self.sim_model = config["sim_model"]
     self.sim_config = config["sim_config"]
Example #12
 def __init__(self, _):
     self.observation_space = Discrete(2)
     self.action_space = Tuple([Discrete(2), Discrete(2)])
     self.last_observation = None
Example #13
 def __init__(self, config=None):
     self.env = gym.make("CartPole-v0")
     self.action_space = Discrete(2)
     self.observation_space = self.env.observation_space
Example #14
 def __init__(self):
     self.max_steps = 20
     self.action_num = 4
     self.cur_step = -1
     self.observation_space = Box(0.0, 1.0, shape=(self.max_steps, ))
     self.action_space = Discrete(n=self.action_num)
Example #15
File: test_catalog.py, Project: zdpau/ray-1
 def __init__(self):
     self.observation_space = Tuple(
         [Discrete(5),
          Box(0, 1, shape=(3, ), dtype=np.float32)])
Example #16
File: test3.py, Project: wangyinyin1028/SCI
    def __init__(self, dim_action, dim_state, ep_length: int = 99):
        self.action_space = Discrete(dim_action)
        self.observation_space = Box(low=-np.inf,
                                     high=np.inf,
                                     shape=(dim_state, ),
                                     dtype=np.float32)
        self.ep_length = ep_length
        self.n_m = {
            '出口1': np.zeros((900, 4)),
            '出口2': np.zeros((900, 4)),
            '出口3': np.zeros((900, 4)),
            '出口4': np.zeros((900, 4))
        }
        self.current_step = 0
        self.num_resets = -1  # Becomes 0 after __init__ exits.
        self.td_c = {
            '出口1': np.zeros((900, 4)),
            '出口2': np.zeros((900, 4)),
            '出口3': np.zeros((900, 4)),
            '出口4': np.zeros((900, 4))
        }  # private-car delay
        self.i_mc = {
            '出口1': np.zeros((900, 4)),
            '出口2': np.zeros((900, 4)),
            '出口3': np.zeros((900, 4)),
            '出口4': np.zeros((900, 4))
        }  # total private (non-transit) vehicles entering the intersection
        self.l_m = {
            '出口1': np.zeros((900, 4)),
            '出口2': np.zeros((900, 4)),
            '出口3': np.zeros((900, 4)),
            '出口4': np.zeros((900, 4))
        }  # left-turning vehicles on the four approaches
        self.s_m = {
            '出口1': np.zeros((900, 4)),
            '出口2': np.zeros((900, 4)),
            '出口3': np.zeros((900, 4)),
            '出口4': np.zeros((900, 4))
        }
        self.r_m = {
            '出口1': np.zeros((900, 4)),
            '出口2': np.zeros((900, 4)),
            '出口3': np.zeros((900, 4)),
            '出口4': np.zeros((900, 4))
        }
        self.trans_l_m = {
            '出口1': np.zeros((900, 4)),
            '出口2': np.zeros((900, 4)),
            '出口3': np.zeros((900, 4)),
            '出口4': np.zeros((900, 4))
        }  # left-turning vehicles on the four approaches
        self.trans_s_m = {
            '出口1': np.zeros((900, 4)),
            '出口2': np.zeros((900, 4)),
            '出口3': np.zeros((900, 4)),
            '出口4': np.zeros((900, 4))
        }
        self.trans_r_m = {
            '出口1': np.zeros((900, 4)),
            '出口2': np.zeros((900, 4)),
            '出口3': np.zeros((900, 4)),
            '出口4': np.zeros((900, 4))
        }

        self.remain_l_m = {
            '出口1': np.zeros((900, 4)),
            '出口2': np.zeros((900, 4)),
            '出口3': np.zeros((900, 4)),
            '出口4': np.zeros((900, 4))
        }  # left-turning vehicles on the four approaches
        self.remain_s_m = {
            '出口1': np.zeros((900, 4)),
            '出口2': np.zeros((900, 4)),
            '出口3': np.zeros((900, 4)),
            '出口4': np.zeros((900, 4))
        }
        self.remain_r_m = {
            '出口1': np.zeros((900, 4)),
            '出口2': np.zeros((900, 4)),
            '出口3': np.zeros((900, 4)),
            '出口4': np.zeros((900, 4))
        }
        self.actal_o_m = {'1-2': np.zeros((900, 4)), '2-1': np.zeros((900, 4))}
        self.ZhuanYi1to2 = []
        self.ZhuanYi2to1 = []

        self.o_m = {
            '出口1': np.zeros((900, 4)),
            '出口2': np.zeros((900, 4)),
            '出口3': np.zeros((900, 4)),
            '出口4': np.zeros((900, 4))
        }  # total private vehicles and buses leaving the intersection
        self.remain_m = {
            '出口1': np.zeros((900, 4)),
            '出口2': np.zeros((900, 4)),
            '出口3': np.zeros((900, 4)),
            '出口4': np.zeros((900, 4))
        }  # total private vehicles and buses leaving the intersection
        self.qc1 = 0.05
        self.qc2 = 0.05
        self.C = 120
        self.vc = {'区域1': np.ones(900), '区域2': np.ones(900)}
        self.sta_flow = 0.4  # saturation flow
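The per-exit zero matrices above are all built the same way; a small helper (a sketch reusing the '出口1'..'出口4' keys from the code) would remove the repetition:

def zeros_per_exit(shape=(900, 4)):
    # One zero matrix per exit approach, keyed exactly like the attributes above.
    return {'出口{}'.format(i): np.zeros(shape) for i in range(1, 5)}

# e.g. self.n_m = zeros_per_exit(); self.td_c = zeros_per_exit(); ...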
Example #17
from gym.spaces import Box, Dict, Discrete, MultiDiscrete, Tuple
import numpy as np
import unittest

import ray
from ray.rllib.agents.registry import get_agent_class
from ray.rllib.examples.env.random_env import RandomEnv
from ray.rllib.models.tf.fcnet import FullyConnectedNetwork as FCNetV2
from ray.rllib.models.tf.visionnet import VisionNetwork as VisionNetV2
from ray.rllib.models.torch.visionnet import VisionNetwork as TorchVisionNetV2
from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFCNetV2
from ray.rllib.utils.error import UnsupportedSpaceException
from ray.rllib.utils.test_utils import framework_iterator

ACTION_SPACES_TO_TEST = {
    "discrete":
    Discrete(5),
    "vector":
    Box(-1.0, 1.0, (5, ), dtype=np.float32),
    "vector2":
    Box(-1.0, 1.0, (5, 5), dtype=np.float32),
    "multidiscrete":
    MultiDiscrete([1, 2, 3, 4]),
    "tuple":
    Tuple([Discrete(2),
           Discrete(3),
           Box(-1.0, 1.0, (5, ), dtype=np.float32)]),
    "dict":
    Dict({
        "action_choice":
        Discrete(3),
        "parameters":
Example #18
    def test_traj_view_simple_performance(self):
        """Test whether PPOTrainer runs faster w/ `_use_trajectory_view_api`.
        """
        config = copy.deepcopy(ppo.DEFAULT_CONFIG)
        action_space = Discrete(2)
        obs_space = Box(-1.0, 1.0, shape=(700, ))

        from ray.rllib.examples.env.random_env import RandomMultiAgentEnv

        from ray.tune import register_env
        register_env(
            "ma_env",
            lambda c: RandomMultiAgentEnv({
                "num_agents": 2,
                "p_done": 0.0,
                "max_episode_len": 104,
                "action_space": action_space,
                "observation_space": obs_space
            }))

        config["num_workers"] = 3
        config["num_envs_per_worker"] = 8
        config["num_sgd_iter"] = 1  # Put less weight on training.

        policies = {
            "pol0": (None, obs_space, action_space, {}),
        }

        def policy_fn(agent_id):
            return "pol0"

        config["multiagent"] = {
            "policies": policies,
            "policy_mapping_fn": policy_fn,
        }
        num_iterations = 2
        # Only works in torch so far.
        for _ in framework_iterator(config, frameworks="torch"):
            print("w/ traj. view API")
            config["_use_trajectory_view_api"] = True
            trainer = ppo.PPOTrainer(config=config, env="ma_env")
            learn_time_w = 0.0
            sampler_perf_w = {}
            start = time.time()
            for i in range(num_iterations):
                out = trainer.train()
                ts = out["timesteps_total"]
                sampler_perf_ = out["sampler_perf"]
                sampler_perf_w = {
                    k:
                    sampler_perf_w.get(k, 0.0) + (sampler_perf_[k] * 1000 / ts)
                    for k, v in sampler_perf_.items()
                }
                delta = out["timers"]["learn_time_ms"] / ts
                learn_time_w += delta
                print("{}={}s".format(i, delta))
            sampler_perf_w = {
                k: sampler_perf_w[k] / (num_iterations if "mean_" in k else 1)
                for k, v in sampler_perf_w.items()
            }
            duration_w = time.time() - start
            print("Duration: {}s "
                  "sampler-perf.={} learn-time/iter={}s".format(
                      duration_w, sampler_perf_w,
                      learn_time_w / num_iterations))
            trainer.stop()

            print("w/o traj. view API")
            config["_use_trajectory_view_api"] = False
            trainer = ppo.PPOTrainer(config=config, env="ma_env")
            learn_time_wo = 0.0
            sampler_perf_wo = {}
            start = time.time()
            for i in range(num_iterations):
                out = trainer.train()
                ts = out["timesteps_total"]
                sampler_perf_ = out["sampler_perf"]
                sampler_perf_wo = {
                    k: sampler_perf_wo.get(k, 0.0) +
                    (sampler_perf_[k] * 1000 / ts)
                    for k, v in sampler_perf_.items()
                }
                delta = out["timers"]["learn_time_ms"] / ts
                learn_time_wo += delta
                print("{}={}s".format(i, delta))
            sampler_perf_wo = {
                k: sampler_perf_wo[k] / (num_iterations if "mean_" in k else 1)
                for k, v in sampler_perf_wo.items()
            }
            duration_wo = time.time() - start
            print("Duration: {}s "
                  "sampler-perf.={} learn-time/iter={}s".format(
                      duration_wo, sampler_perf_wo,
                      learn_time_wo / num_iterations))
            trainer.stop()

            # Assert `_use_trajectory_view_api` is faster.
            self.assertLess(sampler_perf_w["mean_raw_obs_processing_ms"],
                            sampler_perf_wo["mean_raw_obs_processing_ms"])
            self.assertLess(sampler_perf_w["mean_action_processing_ms"],
                            sampler_perf_wo["mean_action_processing_ms"])
            self.assertLess(duration_w, duration_wo)
Example #19
 def action_space(self):
     physical = Discrete(8)
     # comm_disc = Tuple(list(Discrete(1) for i in range(HARVEST_COMM_BITS)))
     comm = Box(low=0.0, high=1.0, shape=(15, ), dtype=np.float32)
     total = Tuple((physical, comm))
     return total
Example #20
class DiscTwoQuadrantConverter(DiscConverter):
    """
    Key:
        'Disc-2QC'

    Switching States / Actions:
        | 0: Both Transistors off.
        | 1: Upper Transistor on.
        | 2: Lower Transistor on.

    Action Space:
        Discrete(3)

    Output Voltages and Currents:
        | voltages: (0, 1)
        | currents: (-1, 1)

    Output Voltage Space:
        Box(0, 1, shape=(1,))
    """

    voltages = (0, 1)
    currents = (-1, 1)
    action_space = Discrete(3)

    def convert(self, i_out, t):
        # Docstring in base class
        # Converter switches slightly (tau / 1000 seconds) before interlocking time due to inaccuracy of the solvers.
        if t - self._tau / 1000 > self._action_start_time + self._interlocking_time:
            self._switching_state = self._switching_pattern[-1]
        else:
            self._switching_state = self._switching_pattern[0]
        if self._switching_state == 0:
            if i_out[0] < 0:
                return [1]
            elif i_out[0] >= 0:
                return [0.0]
        elif self._switching_state == 1:
            return [1]
        elif self._switching_state == 2:
            return [0.0]
        else:
            raise Exception('Invalid switching state of the converter')

    def i_sup(self, i_out):
        # Docstring in base class
        if self._switching_state == 0:
            return i_out[0] if i_out[0] < 0 else 0
        elif self._switching_state == 1:
            return i_out[0]
        elif self._switching_state == 2:
            return 0
        else:
            raise Exception('Invalid switching state of the converter')

    def _set_switching_pattern(self, *_, **__):
        # Docstring in base class
        if (self._current_action == 0 or self._switching_state == 0
                or self._current_action == self._switching_state
                or self._interlocking_time == 0):
            self._switching_pattern = [self._current_action]
            return [self._action_start_time + self._tau]
        else:
            self._switching_pattern = [0, self._current_action]
            return [
                self._action_start_time + self._interlocking_time,
                self._action_start_time + self._tau
            ]
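Since i_sup() depends only on _switching_state, its mapping can be checked in isolation (a sketch that deliberately bypasses the base-class constructor; not part of the library):

conv = DiscTwoQuadrantConverter.__new__(DiscTwoQuadrantConverter)  # skip __init__ for this sketch
conv._switching_state = 1
assert conv.i_sup([0.7]) == 0.7    # upper transistor on: supply current equals output current
conv._switching_state = 0
assert conv.i_sup([-0.3]) == -0.3  # both off: negative output current returns to the supply
assert conv.i_sup([0.3]) == 0      # both off: positive output current is blocked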
Example #21
        def policy_mapping_fn(agent_id, episode, worker, **kwargs):
            if agent_id.startswith("low_level_"):
                return "low_level_policy"
            else:
                return "high_level_policy"

        config = {
            "env": HierarchicalWindyMazeEnv,
            "num_workers": 0,
            "entropy_coeff": 0.01,
            "multiagent": {
                "policies": {
                    "high_level_policy": (
                        None,
                        maze.observation_space,
                        Discrete(4),
                        {
                            "gamma": 0.9
                        },
                    ),
                    "low_level_policy": (
                        None,
                        Tuple([maze.observation_space,
                               Discrete(4)]),
                        maze.action_space,
                        {
                            "gamma": 0.0
                        },
                    ),
                },
                "policy_mapping_fn": function(policy_mapping_fn),
Example #22
class DiscB6BridgeConverter(DiscConverter):
    """
    The discrete B6 bridge converter (B6C) is simulated with three discrete 2QCs.

    Key:
        'Disc-B6C'

    Actions:
        +-+-----+-----+-----+
        | |H_1  |H_2  |H_3  |
        +=+=====+=====+=====+
        |0|lower|lower|lower|
        +-+-----+-----+-----+
        |1|lower|lower|upper|
        +-+-----+-----+-----+
        |2|lower|upper|lower|
        +-+-----+-----+-----+
        |3|lower|upper|upper|
        +-+-----+-----+-----+
        |4|upper|lower|lower|
        +-+-----+-----+-----+
        |5|upper|lower|upper|
        +-+-----+-----+-----+
        |6|upper|upper|lower|
        +-+-----+-----+-----+
        |7|upper|upper|upper|
        +-+-----+-----+-----+

    Action Space:
        Discrete(8)

    Output Voltages and Currents:
        | voltages: (-1,1)
        | currents: (-1,1)

    Output Voltage Space:
        Box(-0.5, 0.5, shape=(3,))
    """

    action_space = Discrete(8)
    # positive and negative voltages are possible
    voltages = (-1, 1)
    # positive and negative currents are possible
    currents = (-1, 1)
    _reset_action = 0
    _subactions = [[2, 2, 2], [2, 2, 1], [2, 1, 2], [2, 1, 1], [1, 2, 2],
                   [1, 2, 1], [1, 1, 2], [1, 1, 1]]

    def __init__(self, tau=1e-5, **kwargs):
        # Docstring in base class
        super().__init__(tau=tau, **kwargs)
        self._subconverters = [
            DiscTwoQuadrantConverter(tau=tau, **kwargs),
            DiscTwoQuadrantConverter(tau=tau, **kwargs),
            DiscTwoQuadrantConverter(tau=tau, **kwargs),
        ]

    def reset(self):
        # Docstring in base class
        return [
            self._subconverters[0].reset()[0] - 0.5,
            self._subconverters[1].reset()[0] - 0.5,
            self._subconverters[2].reset()[0] - 0.5,
        ]

    def convert(self, i_out, t):
        # Docstring in base class
        u_out = [
            self._subconverters[0].convert([i_out[0]], t)[0] - 0.5,
            self._subconverters[1].convert([i_out[1]], t)[0] - 0.5,
            self._subconverters[2].convert([i_out[2]], t)[0] - 0.5
        ]
        return u_out

    def set_action(self, action, t):
        # Docstring in base class
        subactions = self._subactions[action]
        times = []
        times += self._subconverters[0].set_action(subactions[0], t)
        times += self._subconverters[1].set_action(subactions[1], t)
        times += self._subconverters[2].set_action(subactions[2], t)
        return sorted(list(set(times)))

    def i_sup(self, i_out):
        # Docstring in base class
        return sum([
            subconverter.i_sup([i_out_])
            for subconverter, i_out_ in zip(self._subconverters, i_out)
        ])
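The _subactions table mirrors the action table in the docstring: entry 1 switches the upper transistor and entry 2 the lower transistor of the corresponding 2QC sub-converter. A minimal check of that correspondence (a sketch, not part of the library):

for action in range(8):
    # The most significant bit drives H_1, the least significant H_3: 1 = upper on, 2 = lower on.
    expected = [1 if (action >> shift) & 1 else 2 for shift in (2, 1, 0)]
    assert DiscB6BridgeConverter._subactions[action] == expected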
Example #23
File: env.py, Project: zhaodan2000/MEEP
class NanoworldEnv(MultiAgentEnv):
    # Constants
    agents = ('passenger', 'driver')

    max_num_actions = 8
    destination = ["", "starbucks", "peets"]

    # Action spaces
    passenger_actions = [
        "wait for driver", "say starbucks", "say peets", "mental starbucks",
        "mental peets"
    ]
    # passenger_actions = ["wait for driver", "say starbucks", "say peets"]
    passenger_action_space = Discrete(len(passenger_actions))

    driver_actions = ["wait for passenger", "drive starbucks", "drive peets"]
    driver_action_space = Discrete(len(driver_actions))

    # observation spaces
    # wait, say starbucks, say peets -- can be repeated at most 4 times +
    # mental state (none, starbucks, peets)
    passenger_observation_space = Dict({
        'dialog_history':
        Repeated(Discrete(3), max_len=max_num_actions),
        'destination':
        Discrete(3)
    })
    # wait, say starbucks, say peets -- can be repeated at most 4 times
    driver_observation_space = Dict(
        {'dialog_history': Repeated(Discrete(3), max_len=max_num_actions)})

    def __init__(self, config):
        destination_id = random.randint(1, 2)
        self.state = DialogStateNano(
            NanoworldEnv.max_num_actions,
            desired_destination=NanoworldEnv.destination[destination_id])
        self.num_episodes = 0
        # self.is_supervised = config['is_supervised']

    def reset(self):
        '''
        Called before each episode, returns the first observation
        '''
        if self.num_episodes % 1000 == 0:
            logger.warning("completed {} episodes.".format(self.num_episodes))

        if self.num_episodes >= 10000:
            logger.warning('episode ' + str(self.num_episodes))
            logger.warning('------------')
            _, _, history, _ = self.state.get_global_state()
            for h in history:
                logger.warning(h)
            logger.warning('-------------')
        self.num_episodes += 1

        destination_id = random.randint(1, 2)
        if self.num_episodes >= 10000:
            logger.warning('set destination: ' +
                           NanoworldEnv.destination[destination_id])
        self.state = DialogStateNano(
            NanoworldEnv.max_num_actions,
            desired_destination=NanoworldEnv.destination[destination_id])
        self.obs = {
            'driver': self.state.make_driver_observation(),
            'passenger': self.state.make_passenger_observation()
        }
        return self.obs

    def driver_step(self, action):
        self.state.update_state(NanoworldEnv.driver_actions[action])
        obs = self.state.make_driver_observation()
        return obs

    def passenger_step(self, action):
        self.state.update_state(NanoworldEnv.passenger_actions[action])
        obs = self.state.make_passenger_observation()
        return obs

    def compute_passenger_reward(self):
        # if self.is_supervised:
        #     return self.compute_episode_reward_supervised()
        # else:
        return self.compute_episode_reward()

    def compute_driver_reward(self):
        # return self.compute_episode_reward()

        driver_reward = 0
        desired_destination, verbal_history, all_actions, driven_destination = self.state.get_global_state(
        )
        if self.state.dialog_complete:  # to compute at the very end
            if driven_destination:
                if len(verbal_history
                       ) == 0:  # driver drives before user says anything
                    return -1
                else:
                    last_uttered_destination = verbal_history[-1].split(" ")[1]
                    if driven_destination == last_uttered_destination:
                        return 1
                    else:
                        return -1
            else:  # timeout
                return -10
        else:
            return 0

    def compute_episode_reward(self):
        desired_destination, verbal_history, all_actions, driven_destination = self.state.get_global_state(
        )
        if self.state.dialog_complete:  # to compute at the very end
            if driven_destination:
                if desired_destination == driven_destination:
                    return 1
                else:
                    return -1
            else:  # timeout
                return -10
        else:
            return 0

    def step(self, action_dict):
        '''
        Given an action_dict, compute the next observation, rewards, and dones
        '''

        if 'driver' in action_dict:
            driver_obs = self.driver_step(action_dict['driver'])
            if self.state.is_done():
                driver_reward = self.compute_driver_reward()
                return {'driver': driver_obs, 'passenger': self.state.make_passenger_observation()}, \
                       {'driver': driver_reward, 'passenger': self.compute_passenger_reward()}, \
                       {'__all__': self.state.is_done()}, {}

        if 'passenger' in action_dict:
            passenger_obs = self.passenger_step(action_dict['passenger'])
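        # NOTE: the dict built below assumes both 'driver' and 'passenger' appear in
        # action_dict on every step; otherwise driver_obs/passenger_obs would be undefined.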

        self.obs = {'driver': driver_obs, 'passenger': passenger_obs}
        self.rewards = {
            'driver': self.compute_driver_reward(),
            'passenger': self.compute_passenger_reward()
        }
        self.dones = {'__all__': self.state.is_done()}
        self.infos = {}
        return self.obs, self.rewards, self.dones, self.infos
Example #24
    def __init__(self, env_config):
        # Static Parameters
        self.size = 10
        self.reward_density = .1
        self.penalty_density = .02
        self.obs_size = 5
        self.max_global_steps = (self.size * 2)**2
        self.log_frequency = 10
        self.action_dict = {
            0: 'move 1',  # Move one block forward
            1: 'turn 1',  # Turn 90 degrees to the right
            2: 'turn -1',  # Turn 90 degrees to the left
            3: 'attack 1',  # Destroy block
            4: 'jumpmove 1'  # Jump up and move forward 1 block
        }
        self.blocks_dict = {
            "redstone_ore": 1,
            "coal_ore": 2,
            "emerald_ore": 3,
            "iron_ore": 4,
            "gold_ore": 5,
            "diamond_ore": 6,
            "lava": -1,
            "flowing_lava": -1
        }

        # Rllib Parameters
        self.action_space = Discrete(len(self.action_dict))
        self.observation_space = Box(-1,
                                     6,
                                     shape=(np.prod(
                                         [2, self.obs_size, self.obs_size]), ),
                                     dtype=np.int32)

        # Malmo Parameters
        self.agent_host = MalmoPython.AgentHost()
        try:
            self.agent_host.parse(sys.argv)
        except RuntimeError as e:
            print('ERROR:', e)
            print(self.agent_host.getUsage())
            exit(1)

        # ResourceCollector Parameters
        self.obs = None
        self.obsdict = None  # Stores last json loaded observation
        self.episode_step = 0
        self.episode_return = 0
        self.returns = []
        self.resources_collected = {
            "diamond": [0],
            "redstone": [0],
            "coal": [0],
            "emerald": [0],
            "iron_ore": [0],
            "gold_ore": [0]
        }
        self.deaths = []
        self.death_occurred = False
        self.steps = []
        self.episode_start = time.time()
        self.episode_end = time.time()
Example #25
from .dummy_gym_env import DummyEnv
from gym.spaces import Box, Discrete
import numpy as np
from supersuit import frame_stack_v0, reshape_v0, observation_lambda_v0, action_lambda_v0, dtype_v0
import supersuit
import pytest

base_obs = (np.zeros([8, 8, 3]) + np.arange(3)).astype(np.float32)
base_obs_space = Box(low=np.float32(0.), high=np.float32(10.), shape=[8, 8, 3])
base_act_spaces = Discrete(5)


def test_reshape():
    base_env = DummyEnv(base_obs, base_obs_space, base_act_spaces)
    env = reshape_v0(base_env, (64, 3))
    obs = env.reset()
    assert obs.shape == (64, 3)
    first_obs, _, _, _ = env.step(5)
    assert np.all(np.equal(first_obs, base_obs.reshape([64, 3])))


def new_continuous_dummy():
    base_act_spaces = Box(low=np.float32(0.), high=np.float32(10.), shape=[3])
    return DummyEnv(base_obs, base_obs_space, base_act_spaces)


def new_dummy():
    return DummyEnv(base_obs, base_obs_space, base_act_spaces)


wrappers = [
Example #26
    def __init__(self,
                 grid_size: tuple,
                 n_players: int = 2,
                 max_turns=100,
                 final_reward=100,
                 piece_types=None,
                 policies_other=None,
                 observe_all=False,
                 multi_discrete_actions=False,
                 flat_observations=False,
                 render=False,
                 cell_size=50,
                 padding=5,
                 ui_font_size=14,
                 seed=None):
        """

        :param grid_size: tuple specifying the dimensions of the game's board.
        :param n_players: number of players participating in the game.
        :param max_turns: maximum number of turns per episode.
        :param final_reward: amount of final reward given to the winner and taken from the losers.
        :param piece_types: list of dict configs describing the possible pieces.
        :param policies_other: list of policies to use for the opponent players.
        :param observe_all: whether to return observations on `step()` for all players in the info dict or not.
        :param multi_discrete_actions: whether to use a multi-discrete action space.
        :param flat_observations: whether to flatten the observations or return as tensor.
        :param render: enables rendering when calling `render()`.
        :param cell_size: width/height of a cell when rendering.
        :param padding: padding between cells when rendering.
        :param ui_font_size: size of the ui font when rendering.
        :param seed: random seed.
        """
        grid_size = tuple(grid_size)
        if policies_other is not None:
            assert n_players - 1 == len(
                policies_other), 'please provide a policy for each opponent.'

        self.n_players = n_players
        self.policies_other = policies_other
        self.observe_all = observe_all

        if piece_types is None:
            self.piece_types = self._get_default_piece_types()
        else:
            self.piece_types = piece_types
        n_piece_types = len(self.piece_types)

        # actions: (cursor move direction, piece_type)
        # where (cursor move direction) encodes +1 or -1 movement along an axis and 0 for no movement.
        n_move_directions = 1 + 2 * len(grid_size)
        if multi_discrete_actions:
            self.action_space = MultiDiscrete(
                [n_move_directions, n_piece_types])
        else:
            self.action_space = Discrete(n_move_directions * n_piece_types)

        # observation space:
        # (d_0 * ... * d_n * piece_type * player
        # + cursor_d_0 + ... + cursor_d_n + population + room)
        k_cursor_features = len(grid_size) if flat_observations else 1
        obs_dims = grid_size + (1 + (n_piece_types - 1) * n_players, )
        self.observation_space = OneHotBox(OneHot(obs_dims),
                                           Box(0.0,
                                               1.0,
                                               shape=(2 +
                                                      k_cursor_features, )),
                                           flatten=flat_observations)

        self.game = ExpandoGame(grid_size,
                                n_players,
                                max_turns,
                                final_reward=final_reward,
                                piece_types=self.piece_types,
                                seed=seed)
        self.observation_format = 'flat' if flat_observations else 'grid'
        self.do_render = render
        if self.do_render:
            self.renderer = GameRenderer(self.game, cell_size, padding,
                                         ui_font_size)

        self.seed(seed)
Example #27
 def __init__(self):
     space = Discrete(5)
     super().__init__(space)
Example #28
 def action_space(self):
     return Discrete(len(Actions))
Example #29
class TwoStepGame(MultiAgentEnv):
    action_space = Discrete(2)

    def __init__(self, env_config):
        self.state = None
        self.agent_1 = 0
        self.agent_2 = 1
        # MADDPG emits action logits instead of actual discrete actions
        self.actions_are_logits = env_config.get("actions_are_logits", False)
        self.one_hot_state_encoding = env_config.get("one_hot_state_encoding",
                                                     False)
        self.with_state = env_config.get("separate_state_space", False)

        if not self.one_hot_state_encoding:
            self.observation_space = Discrete(6)
            self.with_state = False
        else:
            # Each agent gets the full state (one-hot encoding of which of the
            # three states are active) as input with the receiving agent's
            # ID (1 or 2) concatenated onto the end.
            if self.with_state:
                self.observation_space = Dict({
                    "obs":
                    MultiDiscrete([2, 2, 2, 3]),
                    ENV_STATE:
                    MultiDiscrete([2, 2, 2])
                })
            else:
                self.observation_space = MultiDiscrete([2, 2, 2, 3])

    def reset(self):
        self.state = np.array([1, 0, 0])
        return self._obs()

    def step(self, action_dict):
        if self.actions_are_logits:
            action_dict = {
                k: np.random.choice([0, 1], p=v)
                for k, v in action_dict.items()
            }

        state_index = np.flatnonzero(self.state)
        if state_index == 0:
            action = action_dict[self.agent_1]
            assert action in [0, 1], action
            if action == 0:
                self.state = np.array([0, 1, 0])
            else:
                self.state = np.array([0, 0, 1])
            global_rew = 0
            done = False
        elif state_index == 1:
            global_rew = 7
            done = True
        else:
            if action_dict[self.agent_1] == 0 and action_dict[
                    self.agent_2] == 0:
                global_rew = 0
            elif action_dict[self.agent_1] == 1 and action_dict[
                    self.agent_2] == 1:
                global_rew = 8
            else:
                global_rew = 1
            done = True

        rewards = {
            self.agent_1: global_rew / 2.0,
            self.agent_2: global_rew / 2.0
        }
        obs = self._obs()
        dones = {"__all__": done}
        infos = {}
        return obs, rewards, dones, infos

    def _obs(self):
        if self.with_state:
            return {
                self.agent_1: {
                    "obs": self.agent_1_obs(),
                    ENV_STATE: self.state
                },
                self.agent_2: {
                    "obs": self.agent_2_obs(),
                    ENV_STATE: self.state
                }
            }
        else:
            return {
                self.agent_1: self.agent_1_obs(),
                self.agent_2: self.agent_2_obs()
            }

    def agent_1_obs(self):
        if self.one_hot_state_encoding:
            return np.concatenate([self.state, [1]])
        else:
            return np.flatnonzero(self.state)[0]

    def agent_2_obs(self):
        if self.one_hot_state_encoding:
            return np.concatenate([self.state, [2]])
        else:
            return np.flatnonzero(self.state)[0] + 3
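A minimal rollout sketch for the class above (assumes numpy is imported as np and an empty config, i.e. Discrete(6) observations and no separate state space):

env = TwoStepGame({})
obs = env.reset()                              # {0: 0, 1: 3}: both agents observe the first state
obs, rew, dones, _ = env.step({0: 1, 1: 1})    # agent_1's action moves the game to the third state
obs, rew, dones, _ = env.step({0: 1, 1: 1})    # both choose 1 -> global reward 8, split evenly
assert rew == {0: 4.0, 1: 4.0} and dones["__all__"]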
Example #30
if __name__ == "__main__":
    args = parser.parse_args()
    ray.init(num_cpus=args.num_cpus or None, local_mode=args.local_mode)
    register_env("NestedSpaceRepeatAfterMeEnv",
                 lambda c: NestedSpaceRepeatAfterMeEnv(c))

    config = {
        "env": "NestedSpaceRepeatAfterMeEnv",
        "env_config": {
            "space":
            Dict({
                "a":
                Tuple([Dict({
                    "d": Box(-10.0, 10.0, ()),
                    "e": Discrete(2)
                })]),
                "b":
                Box(-10.0, 10.0, (2, )),
                "c":
                Discrete(4)
            }),
        },
        "entropy_coeff": 0.00005,  # We don't want high entropy in this Env.
        "gamma": 0.0,  # No history in Env (bandit problem).
        "lr": 0.0005,
        "num_envs_per_worker": 20,
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        "num_sgd_iter": 4,
        "num_workers": 0,