def __init__(self, environment_filename, no_graphics):
    engine_configuration_channel = EngineConfigurationChannel()
    self.env = UnityEnvironment(file_name=environment_filename,
                                side_channels=[engine_configuration_channel],
                                no_graphics=no_graphics)
    self.env.reset()
    self.brain_name = self.env.get_agent_groups()
    self.group_spec = self.env.get_agent_group_spec(self.brain_name[0])
    engine_configuration_channel.set_configuration_parameters(width=640,
                                                              height=480,
                                                              time_scale=3.0)
    self.group_name = self.brain_name

    # Set observation and action spaces
    if self.group_spec.is_action_discrete():
        self._action_space = []
        branches = self.group_spec.discrete_action_branches
        # if self.group_spec.action_shape == 1:
        for _ in range(2):
            self._action_space.append(
                [spaces.Discrete(branches[i]) for i in range(len(branches))])
    else:
        high = np.array([1] * self.group_spec.action_shape)
        self._action_space = spaces.Box(-high, high, dtype=np.float32)
    high = np.array([np.inf] * self._get_vec_obs_size())
    self._observation_space = spaces.Box(-high, high, dtype=np.float32)
def initialize_env(self, config, env_file) -> Environment:
    """
    Initialize the environment.

    Args:
        config: the configuration parameters.
        env_file: the environment file.

    Returns:
        env: Environment
    """
    # [3] Environment configuration
    base_port = int(input("Enter base port: "))
    time_scale = int(config.get("time_scale"))
    width = int(config.get("width"))
    height = int(config.get("height"))

    channel_config = EngineConfigurationChannel()
    channel_param = EnvironmentParametersChannel()
    env = Environment(
        file_name=env_file,
        base_port=base_port,
        side_channels=[channel_config, channel_param],
    )
    channel_config.set_configuration_parameters(time_scale=time_scale,
                                                quality_level=1,
                                                width=width,
                                                height=height)
    env.set_float_parameters(config)
    return env
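# A minimal usage sketch for initialize_env above. The `trainer` instance and
# the config keys beyond time_scale/width/height are hypothetical; note the
# call prompts for the base port interactively via input().
config = {"time_scale": 20, "width": 640, "height": 480}
env = trainer.initialize_env(config, env_file="builds/MyEnv")
env.reset()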
def train():
    engine_configuration_channel = EngineConfigurationChannel()
    # Set the time scale to 20x
    engine_configuration_channel.set_configuration_parameters(time_scale=20.0)
    unity_env = UnityEnvironment("./ml-agents/Project/PushBlock",
                                 side_channels=[engine_configuration_channel])
    env = UnityToGymWrapper(unity_env, 0, flatten_branched=True)
    logger.configure('./logs')

    # Train with DQN
    model = deepq.learn(
        env,
        "mlp",
        seed=0,
        lr=2.5e-4,
        total_timesteps=400000,
        buffer_size=50000,
        exploration_fraction=0.05,
        exploration_final_eps=0.1,
        print_freq=20,
        train_freq=5,
        learning_starts=20000,
        target_network_update_freq=50,
        gamma=0.99,
        prioritized_replay=False,
        checkpoint_freq=1000,
        dueling=True,
        checkpoint_path=None,
        load_path="./model")

    # Save the model
    save_path = "./model"
    ckpt = tf.train.Checkpoint(model=model)
    manager = tf.train.CheckpointManager(ckpt, save_path, max_to_keep=1)
    manager.save()
def __init__(self, unity_env, time_scale=1.0, width=720, height=480,
             target_frame_rate=60, quality_level=5):
    """
    Initializes the game

    :param unity_env: (UnityEnvironment) Environment where the game will be played
    :param time_scale: (float) Speed of the game
    :param width: (int) Window's width
    :param height: (int) Window's height
    :param target_frame_rate: (int) Frame rate
    :param quality_level: (int) Visual quality

    Todo: Comment a little, reorganise
    """
    self.unity_env = unity_env
    self.unity_env.reset()

    engine_configuration_channel = EngineConfigurationChannel()
    engine_configuration_channel.set_configuration_parameters(
        time_scale=time_scale,
        width=width,
        height=height,
        target_frame_rate=target_frame_rate,
        quality_level=quality_level)
    self.unity_env.side_channels[2] = engine_configuration_channel

    self.group_name = unity_env.get_agent_groups()[0]
    self.group_spec = unity_env.get_agent_group_spec(self.group_name)
    self.n_agents = self.unity_env.get_step_result(self.group_name).n_agents()
    self.action_size = self.group_spec.action_size
def make_unity_env(self, env_name, float_params=dict(), time_scale=1,
                   seed=time.time(), worker_id=None, **kwargs):
    """
    Creates a gym environment from a Unity game.

    env_name: str
        the path to the game
    float_params: dict or None
        a dict of argument settings for the Unity environment;
        keys vary by environment
    time_scale: float
        argument to set Unity's time scale. This applies less to gym-wrapped
        versions of Unity environments, I believe... but I'm not sure
    seed: int
        the seed for randomness
    worker_id: int
        must specify a unique worker id for each Unity process on this machine
    """
    if float_params is None:
        float_params = dict()
    path = os.path.expanduser(env_name)
    channel = EngineConfigurationChannel()
    env_channel = EnvironmentParametersChannel()
    # was hard-coded to time_scale=1, ignoring the argument
    channel.set_configuration_parameters(time_scale=time_scale)
    for k, v in float_params.items():
        if k == "validation" and v >= 1:
            print("Game in validation mode")
        env_channel.set_float_parameter(k, float(v))
    seed = int(seed)
    if worker_id is None:
        worker_id = seed % 500 + 1
    env_made = False
    n_loops = 0
    # (a stray `worker_id = 0` here in the original defeated the id logic above)
    while not env_made and n_loops < 50:
        try:
            env = UnityEnvironment(file_name=path,
                                   side_channels=[channel, env_channel],
                                   worker_id=worker_id,
                                   seed=seed)
            env_made = True
        except Exception:
            s = "Error encountered making environment, "
            s += "trying new worker_id"
            print(s)
            worker_id = (worker_id + 1 + int(np.random.random() * 100)) % 500
            try:
                env.close()
            except Exception:
                pass
            n_loops += 1
    env = UnityToGymWrapper(env, allow_multiple_obs=True)
    return env
class FQ_Env(object):

    def __init__(self):
        self.engine_configuration_channel = EngineConfigurationChannel()
        self.env = UnityEnvironment(
            side_channels=[self.engine_configuration_channel])
        self.engine_configuration_channel.set_configuration_parameters(
            # width = 84,
            # height = 84,
            # quality_level = 5,  # 1-5
            time_scale=1  # 1-100
            # target_frame_rate = 60,  # 1-60
            # capture_frame_rate = 60  # default 60
        )
        self.reset()
        self.n = self.agent_num()
        self.state_shapes = [
            self.env.get_behavior_spec(behavior_name).observation_shapes[0][0]
            for behavior_name in self.env.get_behavior_names()
        ]
        self.action_dims = [
            self.env.get_behavior_spec(behavior_name).action_shape
            for behavior_name in self.env.get_behavior_names()
        ]

    def agent_num(self):
        behavior_names = self.env.get_behavior_names()
        agent_num = len(behavior_names)
        return agent_num

    def reset(self):
        self.env.reset()
        cur_state = []
        for behavior_name in self.env.get_behavior_names():
            DecisionSteps, TerminalSteps = self.env.get_steps(behavior_name)
            cur_state.append(DecisionSteps.obs[0][0])
        return cur_state

    def step(self, actions):
        next_state = []
        reward = []
        done = []
        # Queue one action per behavior, then advance the simulation once
        for behavior_name_index, behavior_name in enumerate(
                self.env.get_behavior_names()):
            self.env.set_actions(
                behavior_name=behavior_name,
                action=np.asarray([actions[behavior_name_index]]))
        self.env.step()
        for behavior_name in self.env.get_behavior_names():
            DecisionSteps, TerminalSteps = self.env.get_steps(behavior_name)
            if len(TerminalSteps.reward) == 0:
                next_state.append(DecisionSteps.obs[0][0])
                reward.append(DecisionSteps.reward[0])
                done.append(False)
            else:
                next_state.append(TerminalSteps.obs[0][0])
                reward.append(TerminalSteps.reward[0])
                done.append(True)
        return next_state, reward, done

    def close(self):
        self.env.close()
def main():
    """
    file_name: is the name of the environment binary (located in the root
        directory of the python project)
    worker_id: indicates which port to use for communication with the
        environment. For use in parallel training regimes such as A3C.
    seed: indicates the seed to use when generating random numbers during the
        training process. In environments which are deterministic, setting the
        seed enables reproducible experimentation by ensuring that the
        environment and trainers utilize the same random seed.
    side_channels: provides a way to exchange data with the Unity simulation
        that is not related to the reinforcement learning loop. For example:
        configurations or properties. More on them in the "Modifying the
        environment from Python"
        (https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Python-API.md#modifying-the-environment-from-python)
        section.
    ---
    env.reset()
    env.step()
    env.close()
    """
    channel = EngineConfigurationChannel()
    filename = "Mummy"
    env = UnityEnvironment(file_name=filename, seed=1, side_channels=[channel])
    channel.set_configuration_parameters(time_scale=2.0)
    env.reset()

    behavior_names = env.behavior_specs.keys()
    for name in behavior_names:
        print('behavior_name:', name)  # Mummy?team=0

    decision_steps, terminal_steps = env.get_steps(behavior_name="Mummy?team=0")
    """
    print('DecisionSteps')
    print('- observation:', decision_steps.obs)
    print('- reward:', decision_steps.reward)
    print('- agent_id:', decision_steps.agent_id)
    print('- action_mask:', decision_steps.action_mask)
    print('TerminalSteps')
    print('- observation:', terminal_steps.obs)
    print('- reward:', terminal_steps.reward)
    print('- agent_id:', terminal_steps.agent_id)
    print('- interrupted:', terminal_steps.interrupted)
    """
    while True:
        for i in decision_steps.agent_id:
            if i in terminal_steps.agent_id:
                continue
            env.set_action_for_agent(behavior_name="Mummy?team=0",
                                     agent_id=i,
                                     action=np.random.uniform(-1.0, 1.0,
                                                              size=(2,)))
        env.step()
        decision_steps, terminal_steps = env.get_steps(
            behavior_name="Mummy?team=0")
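# Note: the raw-ndarray form of set_action_for_agent above matches older
# mlagents_envs releases. On recent releases the call expects an ActionTuple
# instead; a minimal sketch of the equivalent call (an assumption about the
# installed version, with a batch dimension of 1 for the single agent):
from mlagents_envs.base_env import ActionTuple

action = ActionTuple(continuous=np.random.uniform(-1.0, 1.0, size=(1, 2)))
env.set_action_for_agent("Mummy?team=0", agent_id=i, action=action)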
class UnityWrapper(object):

    def __init__(self, env_args):
        self.engine_configuration_channel = EngineConfigurationChannel()
        if env_args['train_mode']:
            self.engine_configuration_channel.set_configuration_parameters(
                time_scale=env_args['train_time_scale'])
        else:
            self.engine_configuration_channel.set_configuration_parameters(
                width=env_args['width'],
                height=env_args['height'],
                quality_level=env_args['quality_level'],
                time_scale=env_args['inference_time_scale'],
                target_frame_rate=env_args['target_frame_rate'])
        self.float_properties_channel = EnvironmentParametersChannel()
        if env_args['file_path'] is None:
            self._env = UnityEnvironment(base_port=5004,
                                         seed=env_args['env_seed'],
                                         side_channels=[
                                             self.engine_configuration_channel,
                                             self.float_properties_channel
                                         ])
        else:
            unity_env_dict = load_yaml('/'.join(
                [os.getcwd(), 'rls', 'envs', 'unity_env_dict.yaml']))
            self._env = UnityEnvironment(
                file_name=env_args['file_path'],
                base_port=env_args['port'],
                no_graphics=not env_args['render'],
                seed=env_args['env_seed'],
                side_channels=[
                    self.engine_configuration_channel,
                    self.float_properties_channel
                ],
                additional_args=[
                    '--scene',
                    str(unity_env_dict.get(env_args.get('env_name', 'Roller'),
                                           'None')),
                    '--n_agents',
                    str(env_args.get('env_num', 1))
                ])
        self.reset_config = env_args['reset_config']

    def reset(self, **kwargs):
        reset_config = kwargs.get('reset_config', None) or self.reset_config
        for k, v in reset_config.items():
            self.float_properties_channel.set_float_parameter(k, v)
        self._env.reset()

    def __getattr__(self, name):
        if name.startswith('_'):
            raise AttributeError(
                "attempted to get missing private attribute '{}'".format(name))
        return getattr(self._env, name)
def _create_env(self, env_file, time_scale, no_graphics):
    channel = EngineConfigurationChannel()
    env = UnityEnvironment(
        file_name=env_file,
        no_graphics=no_graphics,
        side_channels=[channel],
        # See if setting a worker id allows me to spin up more agents
        worker_id=proc_id(),
    )
    channel.set_configuration_parameters(time_scale=time_scale)
    return env
def initialize_all_side_channels(self, initialize_config, engine_config):
    """
    Initialize all side channels.
    """
    engine_configuration_channel = EngineConfigurationChannel()
    engine_configuration_channel.set_configuration_parameters(**engine_config)
    float_properties_channel = EnvironmentParametersChannel()
    float_properties_channel.set_float_parameter('env_copies', self._n_copies)
    for k, v in initialize_config.items():
        float_properties_channel.set_float_parameter(k, v)
    return dict(engine_configuration_channel=engine_configuration_channel,
                float_properties_channel=float_properties_channel)
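# A usage sketch for initialize_all_side_channels. The `wrapper` instance and
# the engine_config/initialize_config keys shown are hypothetical; the
# returned channels are assumed to be handed to UnityEnvironment verbatim.
channels = wrapper.initialize_all_side_channels(
    initialize_config={'difficulty': 1.0},
    engine_config={'time_scale': 20, 'width': 200, 'height': 200})
env = UnityEnvironment(file_name=None, side_channels=list(channels.values()))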
def __init__(self):
    # Hyperparameters
    self.learning_rate = 0.0003
    self.betas = (0.9, 0.999)
    self.gamma = 0.99
    self.eps_clip = 0.2
    self.buffer_size = 2048
    self.batch_size = 256
    self.K_epochs = 3
    self.max_steps = 100000
    self.tau = 0.95
    self.entropy_coef = 0.001
    self.value_loss_coef = 0.5
    self.summary_freq = 1000

    # Environment
    self.env_name = "Environments/env1/Unity Environment"
    channel = EngineConfigurationChannel()
    self.env = UnityEnv(self.env_name,
                        worker_id=0,
                        use_visual=False,
                        side_channels=[channel],
                        no_graphics=False,
                        multiagent=True)
    channel.set_configuration_parameters(time_scale=100)
    self.action_size, self.state_size = Utils.getActionStateSize(self.env)
    self.n_agents = self.env.number_agents
    print("Nº of Agents: ", self.n_agents)

    # Model
    self.model = ActorCritic(self.state_size, self.action_size,
                             seed=0).to(device)
    self.optimizer = optim.Adam(self.model.parameters(),
                                lr=self.learning_rate,
                                betas=self.betas)
    self.MseLoss = nn.MSELoss()

    # Buffer memory
    self.memory = []
    for _ in range(self.n_agents):
        self.memory.append(Buffer())

    # Initialize time step (for updating when buffer_size is full)
    self.t_step = 1
def __init__(self):
    # Hyperparameters
    self.learning_rate = 0.0003
    self.buffer_size = 10240
    self.batch_size = 1024
    self.gamma = 0.99
    self.update_every = 64
    self.max_steps = 100000
    self.epsilon = 1.0
    self.epsilon_end = 0.01
    self.epsilon_decay = 0.995
    self.tau = 0.01
    self.summary_freq = 1000

    # Environment
    self.env_name = "Environments/env1/Unity Environment"
    channel = EngineConfigurationChannel()
    self.env = UnityEnv(self.env_name,
                        worker_id=0,
                        use_visual=False,
                        side_channels=[channel],
                        no_graphics=False,
                        multiagent=False)
    channel.set_configuration_parameters(time_scale=100)
    self.action_size, self.state_size = Utils.getActionStateSize(self.env)
    self.n_agents = self.env.number_agents

    # Models
    self.local_model = QNetwork(self.state_size, self.action_size,
                                seed=0).to(device)
    self.target_model = QNetwork(self.state_size, self.action_size,
                                 seed=0).to(device)
    self.optimizer = optim.Adam(self.local_model.parameters(),
                                lr=self.learning_rate)

    # Buffer memory
    self.memory = Buffer(self.buffer_size, self.batch_size, seed=0,
                         device=device)

    # Initialize time step (for updating every "update_every" time steps)
    self.t_step = 0
def initialize_all_side_channels(self, kwargs):
    '''
    Initialize all side channels.
    '''
    engine_configuration_channel = EngineConfigurationChannel()
    engine_configuration_channel.set_configuration_parameters(
        width=kwargs['width'],
        height=kwargs['height'],
        quality_level=kwargs['quality_level'],
        time_scale=1 if bool(kwargs.get('inference', False))
        else kwargs['time_scale'],
        target_frame_rate=kwargs['target_frame_rate'],
        capture_frame_rate=kwargs['capture_frame_rate'])
    float_properties_channel = EnvironmentParametersChannel()
    for k, v in kwargs.get('initialize_config', {}).items():
        float_properties_channel.set_float_parameter(k, v)
    return dict(engine_configuration_channel=engine_configuration_channel,
                float_properties_channel=float_properties_channel)
def play():
    engine_configuration_channel = EngineConfigurationChannel()
    # Set the time scale to 10x
    engine_configuration_channel.set_configuration_parameters(time_scale=10.0)
    unity_env = UnityEnvironment("./ml-agents/Project/PushBlock",
                                 side_channels=[engine_configuration_channel])
    env = UnityToGymWrapper(unity_env, 0, flatten_branched=True)

    # Load the model
    model = deepq.learn(env, "mlp", total_timesteps=0, load_path="./model")

    obs = env.reset()
    obs = np.expand_dims(np.array(obs), axis=0)
    while True:
        action, _, _, _ = model.step(tf.constant(obs))
        action = action[0].numpy()
        obs, rew, done, _ = env.step(action)
        if done:
            obs = env.reset()
        obs = np.expand_dims(np.array(obs), axis=0)
def test_set_action_multi_agent():
    engine_config_channel = EngineConfigurationChannel()
    env = default_registry[BALL_ID].make(
        base_port=6001,
        worker_id=0,
        no_graphics=True,
        side_channels=[engine_config_channel],
    )
    engine_config_channel.set_configuration_parameters(time_scale=100)
    for _ in range(3):
        env.reset()
        behavior_name = list(env.behavior_specs.keys())[0]
        d, t = env.get_steps(behavior_name)
        for _ in range(50):
            action = np.ones((len(d), 2))
            action_tuple = ActionTuple()
            action_tuple.add_continuous(action)
            env.set_actions(behavior_name, action_tuple)
            env.step()
            d, t = env.get_steps(behavior_name)
    env.close()
def initialise_environment(self):
    """Initialise and reset unity environment"""
    engine_configuration_channel = EngineConfigurationChannel()
    self.float_properties_channel = FloatPropertiesChannel()
    self.env = UnityEnvironment(file_name=self.env_path,
                                base_port=5004,
                                side_channels=[
                                    engine_configuration_channel,
                                    self.float_properties_channel
                                ])

    # Reset the environment
    self.env.reset()

    # Set the default brain to work with
    self.group_name = self.env.get_agent_groups()[0]
    self.group_spec = self.env.get_agent_group_spec(self.group_name)

    # Set the time scale of the engine
    engine_configuration_channel.set_configuration_parameters(
        time_scale=self.time_scale)
def test_engine_configuration():
    sender = EngineConfigurationChannel()
    # We use a raw bytes channel to interpret the data
    receiver = RawBytesChannel(sender.channel_id)

    config = EngineConfig.default_config()
    sender.set_configuration(config)
    data = SideChannelManager([sender]).generate_side_channel_messages()
    SideChannelManager([receiver]).process_side_channel_message(data)

    received_data = receiver.get_and_clear_received_messages()
    assert len(received_data) == 5  # 5 different messages, one for each setting

    sent_time_scale = 4.5
    sender.set_configuration_parameters(time_scale=sent_time_scale)
    data = SideChannelManager([sender]).generate_side_channel_messages()
    SideChannelManager([receiver]).process_side_channel_message(data)

    message = IncomingMessage(receiver.get_and_clear_received_messages()[0])
    message.read_int32()
    time_scale = message.read_float32()
    assert time_scale == sent_time_scale

    with pytest.raises(UnitySideChannelException):
        sender.set_configuration_parameters(width=None, height=42)

    with pytest.raises(UnityCommunicationException):
        # try to send data to the EngineConfigurationChannel
        sender.set_configuration_parameters(time_scale=sent_time_scale)
        data = SideChannelManager([sender]).generate_side_channel_messages()
        SideChannelManager([sender]).process_side_channel_message(data)
def __init__(self):
    # Hyperparameters
    self.learning_rate = 0.0003
    self.gamma = 0.99
    self.batch_size = 256
    self.max_steps = 100000
    self.tau = 0.95
    self.entropy_coef = 0.001
    self.value_loss_coef = 0.5
    self.summary_freq = 1000

    # Environment
    self.env_name = "Environments/env1/Unity Environment"
    channel = EngineConfigurationChannel()
    self.env = UnityEnv(self.env_name,
                        worker_id=0,
                        use_visual=False,
                        side_channels=[channel],
                        no_graphics=False,
                        multiagent=True)
    channel.set_configuration_parameters(time_scale=100)
    self.action_size, self.state_size = Utils.getActionStateSize(self.env)
    self.n_agents = self.env.number_agents
    print("Nº of Agents: ", self.n_agents)

    # Shared model
    self.shared_model = ActorCritic(self.state_size, self.action_size,
                                    seed=0).to(device)

    # Agents' models
    self.agent_model = []
    self.optimizer = []
    for i in range(self.n_agents):
        self.agent_model.append(
            ActorCritic(self.state_size, self.action_size, seed=0).to(device))
        self.optimizer.append(
            optim.Adam(self.agent_model[i].parameters(),
                       lr=self.learning_rate))

    # Buffer memory
    self.memory = []
    for _ in range(self.n_agents):
        self.memory.append(Buffer())

    # Initialize time step (for updating every "batch_size" time steps)
    self.t_step = 1
def main(params):
    config = vars(parser.parse_args())
    channel = EngineConfigurationChannel()
    unity_env = UnityEnvironment(file_name=None, side_channels=[channel])
    channel.set_configuration_parameters(time_scale=20.0)
    env = UnityToGymWrapper(unity_env)
    agent = DDQN(env, cfg['agent'])
    tag = 'DDQN'

    # Initiate the tracker for stats
    tracker = Tracker("TurtleBot3", tag, seed, cfg['agent'],
                      ['Epoch', 'Ep_Reward'])

    # Train the agent
    agent.train(tracker,
                n_episodes=config['epochs'],
                verbose=config['verbose'],
                params=cfg['agent'],
                hyperp=config)
def unity_env_fn(agent_file, time_scale, no_graphics, worker_id):
    """Wrapper function for making a Unity environment with custom speed and
    graphics options.

    Args:
        agent_file (str): path to the environment binary
        time_scale (float): speed at which to run the simulation
        no_graphics (bool): whether or not to show the simulation
        worker_id (int): port offset, so multiple instances can run at once

    Returns:
        Gym environment.
    """
    channel = EngineConfigurationChannel()
    unity_env = UnityEnvironment(
        file_name=agent_file,
        no_graphics=no_graphics,
        side_channels=[channel],
        worker_id=worker_id,
    )
    channel.set_configuration_parameters(time_scale=time_scale)
    env = UnityToGymWrapper(unity_env)
    return env
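# Usage sketch for unity_env_fn, assuming a local 3DBall build at the given
# path; worker_id must be unique for each concurrently running Unity process.
env = unity_env_fn("builds/3DBall", time_scale=20.0, no_graphics=True,
                   worker_id=0)
obs = env.reset()
obs, reward, done, info = env.step(env.action_space.sample())
env.close()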
def main():
    agent_file = "3DBall_single/3DBall_single.x86_64"
    no_graphics = True
    channel = EngineConfigurationChannel()
    unity_env = UnityEnvironment(file_name=agent_file,
                                 seed=1,
                                 no_graphics=no_graphics,
                                 side_channels=[channel])
    channel.set_configuration_parameters(time_scale=50.)
    env = UnityToGymWrapper(unity_env)

    l1, l2 = 64, 64
    activation = nn.ReLU
    output_activation = nn.Tanh
    ac = TD3ActorCritic(env.observation_space,
                        env.action_space,
                        l1,
                        l2,
                        activation=activation)
    params = dict(
        gamma=0.99,
        polyak=0.995,
        act_noise=0.1,
        target_noise=0.2,
        epochs=100,
        steps_per_epoch=4000,
        start_steps=10000,
        batch_size=256,
        update_after=10000,
        update_every=50,
        policy_delay=2,
        lr=1e-3,
    )
    model = TD3(ac=ac, env=env, **params)
    model.train()
class MyEnv(gym.Env):

    def __init__(self, worker_id, realtime_mode=False):
        self.reset_parameters = EnvironmentParametersChannel()
        self.engine_config = EngineConfigurationChannel()
        env_path = "C:/myDesktop/source/gridworld_imitation/food_collector_4"
        self._env = UnityEnvironment(
            env_path,
            worker_id,
            side_channels=[self.reset_parameters, self.engine_config])
        self._env.reset()
        self.behavior_name = list(self._env.behavior_specs)[0]
        behavior_spec = self._env.behavior_specs[self.behavior_name]
        print(behavior_spec)

        if realtime_mode:
            self.engine_config.set_configuration_parameters(time_scale=1.0)
            self.reset_parameters.set_float_parameter("train-mode", 0.0)
        else:
            self.engine_config.set_configuration_parameters(time_scale=20.0)
            self.reset_parameters.set_float_parameter("train-mode", 1.0)

        self._flattener = ActionFlattener(
            behavior_spec.action_spec.discrete_branches)

    def reset(self):
        # for key, value in reset_params.items():
        #     self.reset_parameters.set_float_parameter(key, value)
        self._env.reset()
        info, terminal_info = self._env.get_steps(self.behavior_name)
        self.game_over = False
        obs, reward, done, info = self._single_step(info, terminal_info)
        return obs

    def step(self, action):
        # Use random actions for all other agents in environment.
        if self._flattener is not None and type(action) == int:
            # Translate action into list
            action = np.array(self._flattener.lookup_action(action))
        # Wrap the discrete action in an ActionTuple (the original passed a
        # bare `Action(action)`, which does not exist in mlagents_envs)
        action_tuple = ActionTuple()
        action_tuple.add_discrete(np.asarray(action).reshape(1, -1))
        self._env.set_actions(self.behavior_name, action_tuple)
        self._env.step()
        running_info, terminal_info = self._env.get_steps(self.behavior_name)
        obs, reward, done, info = self._single_step(running_info,
                                                    terminal_info)
        self.game_over = done
        return obs, reward, done, info

    def _single_step(self, info, terminal_info):
        if len(terminal_info) == 0:
            done = False
            use_info = info
        else:
            done = True
            use_info = terminal_info
        # Observations arrive in order: camera first, then sensors
        output_info = {}
        output_info["visual_obs"] = use_info.obs[0][0]
        # obs = np.concatenate([use_info.obs[1][0], use_info.obs[2][0]])
        return use_info.obs[1][0], use_info.reward[0], done, output_info

    def close(self):
        self._env.close()

    def render(self):
        pass
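# Minimal random rollout for MyEnv. ActionFlattener is assumed to expose an
# `action_space` attribute (a gym Discrete over all branch combinations), as
# in the gym_unity wrapper this snippet appears to be based on.
env = MyEnv(worker_id=0, realtime_mode=False)
obs = env.reset()
done = False
while not done:
    action = env._flattener.action_space.sample()
    obs, reward, done, info = env.step(action)
env.close()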
class UnityWrapper(Env):
    """This class wraps Unity environments.

    This wrapper has notable constraints:
        - Only one agent (no multi-agent environments)
        - Only one visual observation
        - Only discrete and multi-discrete action spaces (no continuous action space)"""

    def __init__(self, env_path, reset_params, worker_id=1, no_graphics=False,
                 realtime_mode=False, record_trajectory=False):
        """Instantiates the Unity Environment from a specified executable.

        Arguments:
            env_path {string} -- Path to the executable of the environment
            reset_params {dict} -- Reset parameters of the environment such as the seed

        Keyword Arguments:
            worker_id {int} -- Port of the environment's instance (default: {1})
            no_graphics {bool} -- Whether to allow the executable to render or not (default: {False})
            realtime_mode {bool} -- Whether to run the environment in real time or as fast as possible (default: {False})
            record_trajectory {bool} -- Whether to record the trajectory of an entire episode. This can be used for video recording. (default: {False})
        """
        # Initialize channels
        self.reset_parameters = EnvironmentParametersChannel()
        self.engine_config = EngineConfigurationChannel()

        # Prepare default reset parameters
        self._default_reset_parameters = {}
        for key, value in reset_params.items():
            self._default_reset_parameters[key] = value
            if key != "start-seed" and key != "num-seeds":
                self.reset_parameters.set_float_parameter(key, value)

        self._realtime_mode = realtime_mode
        if realtime_mode:
            self.engine_config.set_configuration_parameters(time_scale=1.0, width=1280, height=720)
        else:
            self.engine_config.set_configuration_parameters(time_scale=30.0, width=256, height=256)

        # Whether to record the trajectory of an entire episode
        self._record = record_trajectory

        # Launch the environment's executable
        self._env = UnityEnvironment(file_name=env_path, worker_id=worker_id, no_graphics=no_graphics,
                                     side_channels=[self.reset_parameters, self.engine_config])
        # If the Unity Editor should be used instead of a build
        # self._env = UnityEnvironment(file_name=None, worker_id=0, no_graphics=no_graphics,
        #                              side_channels=[self.reset_parameters, self.engine_config])

        # Reset the environment
        self._env.reset()
        # Retrieve behavior configuration
        self._behavior_name = list(self._env.behavior_specs)[0]
        self._behavior_spec = self._env.behavior_specs[self._behavior_name]

        # Check whether this Unity environment is supported
        self._verify_environment()

        # Set action space properties
        if self._behavior_spec.action_spec.is_discrete():
            num_action_branches = self._behavior_spec.action_spec.discrete_size
            action_branch_dimensions = self._behavior_spec.action_spec.discrete_branches
            if num_action_branches == 1:
                self._action_space = spaces.Discrete(action_branch_dimensions[0])
            else:
                self._action_space = spaces.MultiDiscrete(action_branch_dimensions)

        # Count visual and vector observations
        self._num_vis_obs, self._num_vec_obs = 0, 0
        self._vec_obs_indices = []
        for index, obs in enumerate(self._behavior_spec.observation_specs):
            if len(obs.shape) > 1:
                self._num_vis_obs = self._num_vis_obs + 1
                self._vis_obs_index = index
            else:
                self._num_vec_obs = self._num_vec_obs + 1
                self._vec_obs_indices.append(index)

        # Set visual observation space property
        if self._num_vis_obs == 1:
            vis_obs_shape = self._behavior_spec.observation_specs[self._vis_obs_index].shape
            self._visual_observation_space = spaces.Box(
                low=0, high=1.0, shape=vis_obs_shape, dtype=np.float32)
        else:
            self._visual_observation_space = None

        # Set vector observation space property
        if self._num_vec_obs > 0:
            # Determine the length of vec obs by summing the length of each distinct one
            vec_obs_length = sum(
                [self._behavior_spec.observation_specs[i].shape[0] for i in self._vec_obs_indices])
            self._vector_observation_space = (vec_obs_length,)
        else:
            self._vector_observation_space = None

        # Videos can only be recorded if the environment provides visual observations
        if self._record and self._visual_observation_space is None:
            raise UnityEnvironmentException("Videos cannot be rendered for a Unity environment "
                                            "that does not provide visual observations.")

    @property
    def unwrapped(self):
        """
        Returns:
            {UnityWrapper} -- Environment in its vanilla (i.e. unwrapped) state
        """
        return self

    @property
    def action_space(self):
        """Returns the shape of the action space of the agent."""
        return self._action_space

    @property
    def action_names(self):
        return None

    @property
    def get_episode_trajectory(self):
        """Returns the trajectory of an entire episode as a dictionary
        (vis_obs, vec_obs, rewards, actions)."""
        self._trajectory["action_names"] = self.action_names
        return self._trajectory if self._trajectory else None

    @property
    def visual_observation_space(self):
        return self._visual_observation_space

    @property
    def vector_observation_space(self):
        return self._vector_observation_space

    def reset(self, reset_params=None):
        """Resets the environment based on a global or just specified config.

        Keyword Arguments:
            reset_params {dict} -- Reset parameters to configure the environment (default: {None})

        Returns:
            {numpy.ndarray} -- Visual observation
            {numpy.ndarray} -- Vector observation
        """
        # Track rewards of an entire episode
        self._rewards = []

        # Use initial or new reset parameters
        if reset_params is None:
            reset_params = self._default_reset_parameters

        # Apply reset parameters
        for key, value in reset_params.items():
            # Skip reset parameters that are not used by the Unity environment
            if key != "start-seed" and key != "num-seeds":
                self.reset_parameters.set_float_parameter(key, value)

        # Sample the seed to be used
        if reset_params["start-seed"] > -1:
            seed = randint(reset_params["start-seed"],
                           reset_params["start-seed"] + reset_params["num-seeds"] - 1)
        else:
            # Use unlimited seeds
            seed = -1
        self.reset_parameters.set_float_parameter("seed", seed)

        # Reset and verify the environment
        self._env.reset()
        info, terminal_info = self._env.get_steps(self._behavior_name)
        self._verify_environment()

        # Retrieve initial observations
        vis_obs, vec_obs, _, _ = self._process_agent_info(info, terminal_info)

        # Prepare trajectory recording
        self._trajectory = {
            "vis_obs": [vis_obs * 255], "vec_obs": [vec_obs],
            "rewards": [0.0], "actions": []
        }
        return vis_obs, vec_obs

    def step(self, action):
        """Runs one timestep of the environment's dynamics.
        Once an episode is done, reset() has to be called manually.

        Arguments:
            action {List} -- A list of at least one discrete action to be executed by the agent

        Returns:
            {numpy.ndarray} -- Visual observation
            {numpy.ndarray} -- Vector observation
            {float} -- (Total) Scalar reward signaled by the environment
            {bool} -- Whether the episode of the environment terminated
            {dict} -- Further episode information (e.g. cumulated reward) retrieved from the environment once an episode completed
        """
        # Carry out the agent's action
        action_tuple = ActionTuple()
        action_tuple.add_discrete(np.asarray(action).reshape([1, -1]))
        self._env.set_actions(self._behavior_name, action_tuple)
        self._env.step()
        info, terminal_info = self._env.get_steps(self._behavior_name)

        # Process step results
        vis_obs, vec_obs, reward, done = self._process_agent_info(info, terminal_info)
        self._rewards.append(reward)

        # Record trajectory data
        if self._record:
            self._trajectory["vis_obs"].append(vis_obs * 255)
            self._trajectory["vec_obs"].append(vec_obs)
            self._trajectory["rewards"].append(reward)
            self._trajectory["actions"].append(action)

        # Episode information
        if done:
            info = {"reward": sum(self._rewards), "length": len(self._rewards)}
        else:
            info = None
        return vis_obs, vec_obs, reward, done, info

    def close(self):
        """Shut down the environment."""
        self._env.close()

    def _process_agent_info(self, info, terminal_info):
        """Extracts the observations, rewards, dones, and episode infos.

        Args:
            info {DecisionSteps}: Current state
            terminal_info {TerminalSteps}: Terminal state

        Returns:
            vis_obs {ndarray} -- Visual observation if available, else None
            vec_obs {ndarray} -- Vector observation if available, else None
            reward {float} -- Reward signal from the environment
            done {bool} -- Whether the episode terminated or not
        """
        # Determine if the episode terminated or not
        if len(terminal_info) == 0:
            done = False
            use_info = info
        else:
            done = True
            use_info = terminal_info

        # Process visual observations
        if self.visual_observation_space is not None:
            vis_obs = use_info.obs[self._vis_obs_index][0]
        else:
            vis_obs = None

        # Process vector observations
        if self.vector_observation_space is not None:
            for i, dim in enumerate(self._vec_obs_indices):
                if i == 0:
                    vec_obs = use_info.obs[dim][0]
                else:
                    vec_obs = np.concatenate((vec_obs, use_info.obs[dim][0]))
        else:
            vec_obs = None

        return vis_obs, vec_obs, use_info.reward[0], done

    def _verify_environment(self):
        # Verify number of agent behavior types
        if len(self._env.behavior_specs) != 1:
            raise UnityEnvironmentException("The unity environment contains more than one agent type.")
        # Verify number of agents
        decision_steps, _ = self._env.get_steps(self._behavior_name)
        if len(decision_steps) > 1:
            raise UnityEnvironmentException("The unity environment contains more than one agent, "
                                            "which is not supported.")
        # Verify action space type
        if not self._behavior_spec.action_spec.is_discrete() or self._behavior_spec.action_spec.is_continuous():
            raise UnityEnvironmentException("Continuous action spaces are not supported. "
                                            "Only discrete and MultiDiscrete spaces are supported.")
        # Verify that at least one observation is provided
        num_vis_obs = 0
        num_vec_obs = 0
        for obs_spec in self._behavior_spec.observation_specs:
            if len(obs_spec.shape) == 3:
                num_vis_obs += 1
            elif len(obs_spec.shape) == 1:
                num_vec_obs += 1
        if num_vis_obs == 0 and num_vec_obs == 0:
            raise UnityEnvironmentException("The unity environment does not contain any observations.")
        # Verify number of visual observations
        if num_vis_obs > 1:
            raise UnityEnvironmentException("The unity environment contains more than one visual observation.")
class UnityWrapper(Env):
    """This class wraps Unity environments.

    This wrapper has notable constraints:
        - Only one agent (no multi-agent environments)
        - Only one visual observation
        - Only discrete and multi-discrete action spaces (no continuous action space)"""

    def __init__(self, env_path, worker_id=1, no_graphics=False,
                 realtime_mode=False, config=None):
        """Instantiates the Unity Environment from a specified executable.

        Arguments:
            env_path {string} -- Path to the executable of the environment

        Keyword Arguments:
            worker_id {int} -- Port of the environment's instance (default: {1})
            no_graphics {bool} -- Whether to allow the executable to render or not (default: {False})
            realtime_mode {bool} -- Whether to run the environment in real time or as fast as possible (default: {False})
            config {dict} -- Specifies the reset parameters of the environment (default: {None})
        """
        # Disable logging
        logging.disable(logging.INFO)

        # Initialize channels
        self.reset_parameters = EnvironmentParametersChannel()
        self.engine_config = EngineConfigurationChannel()

        self._config = config
        self._realtime_mode = realtime_mode
        if realtime_mode:
            self.engine_config.set_configuration_parameters(time_scale=1.0, width=1280, height=720)
        else:
            self.engine_config.set_configuration_parameters(time_scale=20.0, width=128, height=128)

        # Launch the environment's executable
        self._env = UnityEnvironment(file_name=env_path, worker_id=worker_id, no_graphics=no_graphics,
                                     side_channels=[self.reset_parameters, self.engine_config])

        # Reset the environment
        self._env.reset()
        # Retrieve behavior configuration
        self._behavior_name = list(self._env.behavior_specs)[0]
        self._behavior_spec = self._env.behavior_specs[self._behavior_name]

        # Set action space properties
        if len(self._behavior_spec.action_shape) == 1:
            self._action_space = spaces.Discrete(self._behavior_spec.action_shape[0])
        else:
            self._action_space = spaces.MultiDiscrete(self._behavior_spec.action_shape)
        self._action_names = ["Not available"]

        # Count visual and vector observations
        self._num_vis_obs, self._num_vec_obs = 0, 0
        self._vec_obs_indices = []
        for index, obs in enumerate(self._behavior_spec.observation_shapes):
            if len(obs) > 1:
                self._num_vis_obs = self._num_vis_obs + 1
                self._vis_obs_index = index
            else:
                self._num_vec_obs = self._num_vec_obs + 1
                self._vec_obs_indices.append(index)

        # Verify the environment
        self._verify_environment()

        # Set visual observation space property
        if self._num_vis_obs == 1:
            height = self._behavior_spec.observation_shapes[self._vis_obs_index][0]
            width = self._behavior_spec.observation_shapes[self._vis_obs_index][1]
            depth = self._behavior_spec.observation_shapes[self._vis_obs_index][2]
            self._visual_observation_space = spaces.Box(
                low=0, high=1.0, shape=(height, width, depth), dtype=np.float32)
        else:
            self._visual_observation_space = None

        # Set vector observation space property
        if self._num_vec_obs > 0:
            # Determine the length of vec obs by summing the length of each distinct one
            vec_obs_length = sum(
                [self._behavior_spec.observation_shapes[i][0] for i in self._vec_obs_indices])
            self._vector_observation_space = (vec_obs_length,)
        else:
            self._vector_observation_space = None

    @property
    def unwrapped(self):
        """
        Returns:
            {UnityWrapper} -- Environment in its vanilla (i.e. unwrapped) state
        """
        return self

    @property
    def action_space(self):
        """Returns the shape of the action space of the agent."""
        return self._action_space

    @property
    def action_names(self):
        return self._action_names

    @property
    def visual_observation_space(self):
        return self._visual_observation_space

    @property
    def vector_observation_space(self):
        return self._vector_observation_space

    def reset(self, reset_params=None):
        """Resets the environment based on a global or just specified config.

        Keyword Arguments:
            reset_params {dict} -- Reset parameters to configure the environment (default: {None})

        Returns:
            {numpy.ndarray} -- Visual observation
            {numpy.ndarray} -- Vector observation
        """
        # Track rewards of an entire episode
        self._rewards = []

        # Process config: Either load global or new config (if specified)
        if reset_params is None:
            reset_params = self._config if self._config is not None else {}

        # Apply reset parameters
        for key, value in reset_params.items():
            self.reset_parameters.set_float_parameter(key, value)

        # Reset and verify the environment
        self._env.reset()
        info, terminal_info = self._env.get_steps(self._behavior_name)
        self._verify_environment(len(info))

        # Retrieve initial observations
        vis_obs, vec_obs, _, _ = self._process_agent_info(info, terminal_info)
        return vis_obs, vec_obs

    def step(self, action):
        """Runs one timestep of the environment's dynamics.
        Once an episode is done, reset() has to be called manually.

        Arguments:
            action {List} -- A list of at least one discrete action to be executed by the agent

        Returns:
            {numpy.ndarray} -- Visual observation
            {numpy.ndarray} -- Vector observation
            {float} -- (Total) Scalar reward signaled by the environment
            {bool} -- Whether the episode of the environment terminated
            {dict} -- Further episode information (e.g. cumulated reward) retrieved from the environment once an episode completed
        """
        # Carry out the agent's action
        self._env.set_actions(self._behavior_name, action.reshape([1, -1]))
        self._env.step()
        info, terminal_info = self._env.get_steps(self._behavior_name)

        # Process step results
        vis_obs, vec_obs, reward, done = self._process_agent_info(info, terminal_info)
        self._rewards.append(reward)

        # Episode information
        if done:
            info = {"reward": sum(self._rewards), "length": len(self._rewards)}
        else:
            info = None
        return vis_obs, vec_obs, reward, done, info

    def close(self):
        """Shut down the environment."""
        self._env.close()

    def _process_agent_info(self, info, terminal_info):
        """Extracts the observations, rewards, dones, and episode infos.

        Args:
            info {DecisionSteps}: Current state
            terminal_info {TerminalSteps}: Terminal state

        Returns:
            vis_obs {ndarray} -- Visual observation if available, else None
            vec_obs {ndarray} -- Vector observation if available, else None
            reward {float} -- Reward signal from the environment
            done {bool} -- Whether the episode terminated or not
        """
        # Determine if the episode terminated or not
        if len(terminal_info) == 0:
            done = False
            use_info = info
        else:
            done = True
            use_info = terminal_info

        # Process visual observations
        if self.visual_observation_space is not None:
            vis_obs = use_info.obs[self._vis_obs_index][0]
        else:
            vis_obs = None

        # Process vector observations
        if self.vector_observation_space is not None:
            for i, dim in enumerate(self._vec_obs_indices):
                if i == 0:
                    vec_obs = use_info.obs[dim][0]
                else:
                    vec_obs = np.concatenate((vec_obs, use_info.obs[dim][0]))
        else:
            vec_obs = None

        return vis_obs, vec_obs, use_info.reward[0], done

    def _verify_environment(self, num_agents=None):
        """Checks if the environment meets the requirements of this wrapper.
        Only one agent and at maximum one visual observation is allowed.
        Only Discrete and MultiDiscrete action spaces are supported.

        Arguments:
            num_agents {int} -- Number of agents (default: {None})
        """
        # Verify number of agent types
        if len(self._env.behavior_specs) != 1:
            raise UnityEnvironmentException("The unity environment contains more than one agent type.")
        # Verify action space type
        if int(self._behavior_spec.action_type.value) == 1:
            raise UnityEnvironmentException("Continuous action spaces are not supported. "
                                            "Only discrete and MultiDiscrete spaces are supported.")
        # Verify number of visual observations
        if self._num_vis_obs > 1:
            raise UnityEnvironmentException("The unity environment contains more than one visual observation.")
        # Verify agent count
        if num_agents is not None and num_agents > 1:
            raise UnityEnvironmentException("The unity environment contains more than one agent.")
if __name__ == "__main__":
    # Parse the parameter values at program start
    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda", default=False, action='store_true',
                        help='Enable CUDA')
    parser.add_argument("-n", "--name", required=True, help="Name of the run")
    args = parser.parse_args()
    device = torch.device("cuda" if args.cuda else "cpu")

    save_path = os.path.join("saves", "d4pg-" + args.name)
    os.makedirs(save_path, exist_ok=True)

    # Wrap the Unity environment in a Gym environment
    channel = EngineConfigurationChannel()
    unity_env = UnityEnvironment(ENV_ID, seed=1, side_channels=[channel])
    channel.set_configuration_parameters(time_scale=20.0)
    env = UnityToGymWrapper(unity_env)

    # Build the model following the D4PG architecture
    act_net = model.D4PGActor(env.observation_space.shape[0],
                              env.action_space.shape[0]).to(device)
    crt_net = model.D4PGCritic(env.observation_space.shape[0],
                               env.action_space.shape[0],
                               N_ATOMS, Vmin, Vmax).to(device)
    print(act_net)
    print(crt_net)
    tgt_act_net = ptan.agent.TargetNet(act_net)
    tgt_crt_net = ptan.agent.TargetNet(crt_net)

    # Create the agent and the replay buffer with the PTAN library
    writer = SummaryWriter(comment="-d4pg_" + args.name)
    agent = model.AgentD4PG(act_net, device=device)
    exp_source = ptan.experience.ExperienceSourceFirstLast(
        env, agent, gamma=GAMMA, steps_count=REWARD_STEPS)
    buffer = ptan.experience.ExperienceReplayBuffer(
        exp_source, buffer_size=REPLAY_SIZE)
class UnityWrapperProcess:
    def __init__(self,
                 conn: multiprocessing.connection.Connection = None,
                 train_mode=True,
                 file_name=None,
                 worker_id=0,
                 base_port=5005,
                 no_graphics=True,
                 seed=None,
                 scene=None,
                 additional_args=None,
                 n_agents=1):
        """
        Args:
            train_mode: If in train mode, Unity will speed up
            file_name: The executable path. The UnityEnvironment will run in editor if None
            worker_id: Offset from base_port
            base_port: The port used to communicate with Unity. It will be set to 5004 automatically if in editor.
            no_graphics: If Unity runs in no-graphics mode. It must be set to False if Unity has a camera sensor.
            seed: Random seed
            scene: The scene name
            n_agents: The agent count
        """
        self.scene = scene
        self.n_agents = n_agents

        seed = seed if seed is not None else np.random.randint(0, 65536)
        additional_args = [] if additional_args is None else additional_args.split(' ')

        self.engine_configuration_channel = EngineConfigurationChannel()
        self.environment_parameters_channel = EnvironmentParametersChannel()
        self.environment_parameters_channel.set_float_parameter('env_copys', float(n_agents))

        if conn:
            try:
                from algorithm import config_helper
                config_helper.set_logger()
            except Exception:
                pass
            self._logger = logging.getLogger(f'UnityWrapper.Process_{os.getpid()}')
        else:
            self._logger = logging.getLogger('UnityWrapper.Process')

        self._env = UnityEnvironment(file_name=file_name,
                                     worker_id=worker_id,
                                     base_port=base_port if file_name else None,
                                     no_graphics=no_graphics and train_mode,
                                     seed=seed,
                                     additional_args=['--scene', scene] + additional_args,
                                     side_channels=[
                                         self.engine_configuration_channel,
                                         self.environment_parameters_channel
                                     ])

        self.engine_configuration_channel.set_configuration_parameters(
            width=200 if train_mode else 1280,
            height=200 if train_mode else 720,
            quality_level=5,
            time_scale=20 if train_mode else 1)

        self._env.reset()
        self.behavior_name = list(self._env.behavior_specs)[0]

        if conn:
            try:
                while True:
                    cmd, data = conn.recv()
                    if cmd == INIT:
                        conn.send(self.init())
                    elif cmd == RESET:
                        conn.send(self.reset(data))
                    elif cmd == STEP:
                        conn.send(self.step(*data))
                    elif cmd == CLOSE:
                        self.close()
            except Exception:
                self._logger.error(traceback.format_exc())

    def init(self):
        """
        Returns:
            observation shapes: tuple[(o1, ), (o2, ), (o3_1, o3_2, o3_3), ...]
            discrete action size: int, product of all action branches
            continuous action size: int
        """
        behavior_spec = self._env.behavior_specs[self.behavior_name]
        obs_names = [o.name for o in behavior_spec.observation_specs]
        self._logger.info(f'Observation names: {obs_names}')
        obs_shapes = [o.shape for o in behavior_spec.observation_specs]
        self._logger.info(f'Observation shapes: {obs_shapes}')

        self._empty_action = behavior_spec.action_spec.empty_action

        discrete_action_size = 0
        if behavior_spec.action_spec.discrete_size > 0:
            discrete_action_size = 1
            action_product_list = []
            for action, branch_size in enumerate(behavior_spec.action_spec.discrete_branches):
                discrete_action_size *= branch_size
                action_product_list.append(range(branch_size))
                self._logger.info(
                    f"Discrete action branch {action} has {branch_size} different actions")
            self.action_product = np.array(list(itertools.product(*action_product_list)))

        continuous_action_size = behavior_spec.action_spec.continuous_size
        self._logger.info(f'Continuous action size: {continuous_action_size}')

        self.d_action_size = discrete_action_size
        self.c_action_size = continuous_action_size

        for o in behavior_spec.observation_specs:
            if len(o.shape) >= 3:
                self.engine_configuration_channel.set_configuration_parameters(quality_level=5)
                break

        return obs_shapes, discrete_action_size, continuous_action_size

    def reset(self, reset_config=None):
        """
        Returns:
            observations: list[(NAgents, o1), (NAgents, o2), (NAgents, o3_1, o3_2, o3_3)]
        """
        reset_config = {} if reset_config is None else reset_config
        for k, v in reset_config.items():
            self.environment_parameters_channel.set_float_parameter(k, float(v))

        self._env.reset()
        decision_steps, terminal_steps = self._env.get_steps(self.behavior_name)

        return [obs.astype(np.float32) for obs in decision_steps.obs]

    def step(self, d_action, c_action):
        """
        Args:
            d_action: (NAgents, discrete_action_size), one-hot-like action
            c_action: (NAgents, continuous_action_size)

        Returns:
            observations: list[(NAgents, o1), (NAgents, o2), (NAgents, o3_1, o3_2, o3_3)]
            rewards: (NAgents, )
            done: (NAgents, ), bool
            max_step: (NAgents, ), bool
        """
        if self.d_action_size:
            d_action = np.argmax(d_action, axis=1)
            d_action = self.action_product[d_action]

        self._env.set_actions(self.behavior_name,
                              ActionTuple(continuous=c_action, discrete=d_action))
        self._env.step()
        decision_steps, terminal_steps = self._env.get_steps(self.behavior_name)

        tmp_terminal_steps = terminal_steps
        while len(decision_steps) == 0:
            self._env.set_actions(self.behavior_name, self._empty_action(0))
            self._env.step()
            decision_steps, terminal_steps = self._env.get_steps(self.behavior_name)
            tmp_terminal_steps.agent_id = np.concatenate(
                [tmp_terminal_steps.agent_id, terminal_steps.agent_id])
            tmp_terminal_steps.reward = np.concatenate(
                [tmp_terminal_steps.reward, terminal_steps.reward])
            tmp_terminal_steps.interrupted = np.concatenate(
                [tmp_terminal_steps.interrupted, terminal_steps.interrupted])

        reward = decision_steps.reward
        reward[tmp_terminal_steps.agent_id] = tmp_terminal_steps.reward

        done = np.full([len(decision_steps), ], False, dtype=bool)
        done[tmp_terminal_steps.agent_id] = True

        max_step = np.full([len(decision_steps), ], False, dtype=bool)
        max_step[tmp_terminal_steps.agent_id] = tmp_terminal_steps.interrupted

        # Return the terminal-corrected rewards (the original returned the raw
        # decision_steps.reward, discarding the fix-up above)
        return ([obs.astype(np.float32) for obs in decision_steps.obs],
                reward.astype(np.float32),
                done,
                max_step)

    def close(self):
        self._env.close()
        self._logger.warning(f'Process {os.getpid()} exits')
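# Direct, in-process usage sketch of UnityWrapperProcess (conn=None skips the
# multiprocessing command loop). The build path and scene name are
# hypothetical; action array shapes follow the sizes reported by init().
env = UnityWrapperProcess(conn=None, train_mode=True,
                          file_name='builds/MyEnv', scene='MyScene')
obs_shapes, d_size, c_size = env.init()
obs_list = env.reset()
n = len(obs_list[0])  # number of agents currently requesting decisions
# one-hot "first action" for every agent if the action space has a discrete part
d_action = np.eye(d_size, dtype=np.float32)[np.zeros(n, dtype=int)] if d_size else None
c_action = np.zeros((n, c_size), dtype=np.float32) if c_size else None
obs_list, reward, done, max_step = env.step(d_action, c_action)
env.close()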
import time
import random
import numpy as np
from collections import deque
import torch.optim as optim
from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.side_channel.engine_configuration_channel import EngineConfigurationChannel
import os

your_name = 'Veronica'
np.set_printoptions(precision=3)

channel = EngineConfigurationChannel()
env = UnityEnvironment(file_name='./env/football', side_channels=[channel])
channel.set_configuration_parameters(time_scale=0.25)
env.reset()

load_tensor = False
two_striker = False
num_episodes = 500
num_game_each_episode = 1000
epsilon = 1.0
epsilon_min = 0.05
epsilon_decay = 0.99
scores = [[], []]
best_avg_score = 0
scores_average_window = 5
striker_solved_score = 1
defender_solved_score = -1
l1_size = 16   # number of neurons in 1st layer
l2_size = 32   # number of neurons in 2nd layer
POP_SIZE = 12  # population size

if __name__ == "__main__":
    try:
        # This is a non-blocking call that only loads the environment.
        print("Script started. Please start Unity environment to start training process.")
        engine_channel = EngineConfigurationChannel()
        # env = UnityEnvironment(side_channels=[engine_channel])
        env = default_registry["3DBall"].make(side_channels=[engine_channel])
        # control time scale: 0.5 - half speed, 10. - 10x speed
        engine_channel.set_configuration_parameters(
            time_scale=1, width=1920, height=1080)

        # Start interacting with the environment.
        env.reset()

        # Info about our environment ---------------------
        print(f"number of behaviours: {len(list(env.behavior_specs))}")
        behavior_name = list(env.behavior_specs)[0]
        spec = env.behavior_specs[behavior_name]
        action_spec = spec.action_spec
        decision_steps, terminal_steps = env.get_steps(behavior_name)

        # Examine the number of observations per Agent
        print("Number of observations : ", len(spec.observation_shapes))
        print(" observations : ", spec.observation_shapes)

        # Is the Action continuous or multi-discrete ?
        if action_spec.is_continuous():
            print("The action is continuous")
def run_unity(self):
    from mlagents_envs.environment import UnityEnvironment
    from mlagents_envs.side_channel.engine_configuration_channel import EngineConfig, EngineConfigurationChannel

    if self.random_risk:
        path = ('runs/dqpg_td3_' + env_set['env_name'] + '_' +
                "{:1.1f}".format(self.risk_factor) + "_noisy")
    else:
        path = ('runs/dqpg_td3_' + env_set['env_name'] + '_' +
                "{:1.1f}".format(self.risk_factor))
    writer = SummaryWriter(path)

    epsilon = 1
    self.ep = 0
    self.dynamic_risk = self.risk_factor

    engine_configuration_channel = EngineConfigurationChannel()
    env = UnityEnvironment(file_name="env/" + env_set['env_name'],
                           no_graphics=True,
                           side_channels=[engine_configuration_channel])
    env.reset()
    group_name = list(env.behavior_specs.keys())[0]
    group_spec = env.behavior_specs[group_name]
    engine_configuration_channel.set_configuration_parameters(time_scale=12.0)
    dec, term = env.get_steps(group_name)

    step = 0
    scores = np.zeros([self.worker_size])
    score = deque(maxlen=10)
    end_ep = env_set['ep_len']
    train_start_ep = 5
    vs = []
    ps = []
    # for i in range(1000*end_ep)
    while True:
        self.ep += 1
        if epsilon > 0.05:
            epsilon = -self.ep * 2.0 / end_ep + 1.0
        for i in range(1000):
            step += 1
            if self.ep > train_start_ep:
                actions = self.get_action(dec.obs[0], epsilon, len(dec.agent_id))
            else:
                actions = 2.0 * np.random.rand(len(dec.agent_id),
                                               self.output_size) - 1.0
            env.set_actions(group_name, actions)
            if len(dec.agent_id) > 0:
                old_dec = dec
                old_action = actions
                action_idx = old_dec.agent_id
            env.step()
            dec, term = env.get_steps(group_name)
            # Store transitions for agents that just terminated
            for idx in term.agent_id:
                state = old_dec[idx].obs[0]
                next_state = term[idx].obs[0]
                reward = term[idx].reward
                done = not term[idx].max_step
                act = old_action[idx]
                self.memory.append(state, next_state, reward, done, act)
                scores[idx] += reward
                score.append(scores[idx])
                scores[idx] = 0
            # Store transitions for agents still requesting decisions
            for idx in dec.agent_id:
                if idx in term.agent_id:
                    continue
                state = old_dec[idx].obs[0]
                next_state = dec[idx].obs[0]
                reward = dec[idx].reward
                done = False
                act = old_action[idx]
                self.memory.append(state, next_state, reward, done, act)
                scores[idx] += reward
            if self.ep > train_start_ep:
                if step % 2 == 0:
                    v, p = self.update()
                    vs.append(v)
                    ps.append(p)
                else:
                    v = self.value_update()
                    vs.append(v)
        print('episode :', self.ep,
              '| score : ', "{0:.2f}".format(np.mean(score)),
              "[{0:.1f}]".format(np.std(score)),
              '| epsilon :', "{0:.2f}".format(epsilon),
              ' | v :', "{0:.2f}".format(np.mean(vs)),
              ' | p :', "{0:.2f}".format(np.mean(ps)))
        if self.ep < end_ep:
            writer.add_scalar('data/reward', np.mean(score), self.ep)
            writer.add_scalar('data/reward_std', np.std(score), self.ep)
            writer.add_scalar('data/epsilon', epsilon, self.ep)
            writer.add_scalar('data/memory_size', len(self.memory.memory), self.ep)
            writer.add_scalar('loss/value', np.mean(vs), self.ep)
            writer.add_scalar('loss/policy', np.mean(ps), self.ep)
            writer.add_scalar('data/risk_factor', self.dynamic_risk, self.ep)
            vs.clear()
            ps.clear()
        else:
            break
def test_run_environment(env_name):
    """
    Run the low-level API test using the specified environment
    :param env_name: Name of the Unity environment binary to launch
    """
    engine_configuration_channel = EngineConfigurationChannel()
    env = UnityEnvironment(
        file_name=env_name,
        side_channels=[engine_configuration_channel],
        no_graphics=True,
        additional_args=["-logFile", "-"],
    )

    try:
        # Reset the environment
        env.reset()

        # Set the default brain to work with
        group_name = list(env.behavior_specs.keys())[0]
        group_spec = env.behavior_specs[group_name]

        # Set the time scale of the engine
        engine_configuration_channel.set_configuration_parameters(time_scale=3.0)

        # Get the state of the agents
        decision_steps, terminal_steps = env.get_steps(group_name)

        # Examine the number of observations per Agent
        print("Number of observations : ", len(group_spec.sensor_specs))

        # Is there a visual observation ?
        vis_obs = any(
            len(sen_spec.shape) == 3 for sen_spec in group_spec.sensor_specs)
        print("Is there a visual observation ?", vis_obs)

        # Examine the state space for the first observation for the first agent
        print("First Agent observation looks like: \n{}".format(
            decision_steps.obs[0][0]))

        for _episode in range(10):
            env.reset()
            decision_steps, terminal_steps = env.get_steps(group_name)
            done = False
            episode_rewards = 0
            tracked_agent = -1
            while not done:
                action_tuple = group_spec.action_spec.random_action(
                    len(decision_steps))
                if tracked_agent == -1 and len(decision_steps) >= 1:
                    tracked_agent = decision_steps.agent_id[0]
                env.set_actions(group_name, action_tuple)
                env.step()
                decision_steps, terminal_steps = env.get_steps(group_name)
                done = False
                if tracked_agent in decision_steps:
                    episode_rewards += decision_steps[tracked_agent].reward
                if tracked_agent in terminal_steps:
                    episode_rewards += terminal_steps[tracked_agent].reward
                    done = True
            print(f"Total reward this episode: {episode_rewards}")
    finally:
        env.close()