from mpi4py import MPI
import mujoco_py

from baselines.common import set_global_seeds
from baselines.common.cmd_util import make_robotics_env


def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    import baselines.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    # Entered manually (no matching __exit__), so MuJoCo warnings stay
    # suppressed for the remainder of the process.
    mujoco_py.ignore_mujoco_warnings().__enter__()
    workerseed = seed + 10000 * rank
    set_global_seeds(workerseed)
    env = make_robotics_env(env_id, workerseed, rank=rank)

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space,
                                    ac_space=ac_space, hid_size=256,
                                    num_hid_layers=3)

    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_actorbatch=2048,
                        clip_param=0.2, entcoeff=0.0,
                        optim_epochs=5, optim_stepsize=3e-4,
                        optim_batchsize=256,
                        gamma=0.99, lam=0.95, schedule='linear')
    env.close()
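# The `seed + 10000 * rank` pattern above gives each MPI worker a distinct,
# reproducible seed derived from the base seed and its rank. A minimal sketch
# of what it produces (illustrative only; the base seed of 42 is an example):
from mpi4py import MPI

rank = MPI.COMM_WORLD.Get_rank()
base_seed = 42
workerseed = base_seed + 10000 * rank   # rank 0 -> 42, rank 1 -> 10042, ...
print("rank %d uses seed %d" % (rank, workerseed))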
import pytest
from mujoco_py import MjSim, load_model_from_xml, ignore_mujoco_warnings


def test_ignore_mujoco_warnings():
    # Two boxes on a plane need more than 1 contact (nconmax)
    xml = '''
    <mujoco>
      <size nconmax="1"/>
      <worldbody>
        <geom type="plane" size="1 1 0.1"/>
        <body pos="1 0 1">
          <joint type="free"/>
          <geom size="1"/>
        </body>
        <body pos="0 1 1">
          <joint type="free"/>
          <geom size="1"/>
        </body>
      </worldbody>
    </mujoco>
    '''
    model = load_model_from_xml(xml)
    sim = MjSim(model)

    sim.reset()
    with ignore_mujoco_warnings():
        # This should raise an exception due to the mujoco warning callback,
        # but it's suppressed by the context manager.
        sim.step()

    sim.reset()
    with pytest.raises(Exception):
        # test to make sure the previous warning callback was restored.
        sim.step()
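# For intuition, a context manager like `ignore_mujoco_warnings` can be built
# as a save/install/restore around a warning callback. This is only a sketch:
# `get_warning_callback` / `set_warning_callback` are hypothetical stand-ins
# for mujoco_py's internal callback hooks, not its actual API.
import contextlib


@contextlib.contextmanager
def suppress_warning_callback(backend):
    saved = backend.get_warning_callback()          # remember the raising callback
    backend.set_warning_callback(lambda msg: None)  # install a no-op
    try:
        yield
    finally:
        backend.set_warning_callback(saved)         # restore so later steps raise again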
def train(env_id, num_timesteps, seed): """ Train PPO1 model for Robotics environment, for testing purposes :param env_id: (str) Environment ID :param num_timesteps: (int) The total number of samples :param seed: (int) The initial seed for training """ rank = MPI.COMM_WORLD.Get_rank() with mujoco_py.ignore_mujoco_warnings(): workerseed = seed + 10000 * rank set_global_seeds(workerseed) env = make_robotics_env(env_id, workerseed, rank=rank) def policy_fn(name, ob_space, ac_space, sess=None, placeholders=None): return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, hid_size=256, num_hid_layers=3, sess=sess, placeholders=placeholders) pposgd_simple.learn(env, policy_fn, max_timesteps=num_timesteps, timesteps_per_actorbatch=2048, clip_param=0.2, entcoeff=0.0, optim_epochs=5, optim_stepsize=3e-4, optim_batchsize=256, gamma=0.99, lam=0.95, schedule='linear') env.close()
def train(env_id, num_timesteps, seed): """ Train PPO1 model for Robotics environment, for testing purposes :param env_id: (str) Environment ID :param num_timesteps: (int) The total number of samples :param seed: (int) The initial seed for training """ rank = MPI.COMM_WORLD.Get_rank() with mujoco_py.ignore_mujoco_warnings(): workerseed = seed + 10000 * rank set_global_seeds(workerseed) env = make_robotics_env(env_id, workerseed, rank=rank) print(env.observation_space) tblog = "/cvgl2/u/surajn/workspace/tb_logs/ppo1_fetchreach/" model = PPO1(MlpPolicy, env, timesteps_per_actorbatch=2048, clip_param=0.2, entcoeff=0.0, optim_epochs=5, optim_stepsize=3e-4, optim_batchsize=256, gamma=0.99, lam=0.95, schedule='linear', tensorboard_log=tblog, verbose=1) model.learn(total_timesteps=num_timesteps) env.close()
import numpy as np
from numpy.testing import assert_allclose
from mujoco_py import ignore_mujoco_warnings
# make_env and on_palm are helpers from the surrounding test module.


def test_locked_cube():
    env = make_env(starting_seed=0)
    is_on_palm = []
    for idx in range(20):
        env.reset()
        expected_joints = (
            "cube:cube_tx", "cube:cube_ty", "cube:cube_tz", "cube:cube_rot",
            "target:cube_tx", "target:cube_ty", "target:cube_tz",
            "target:cube_rot",
            "robot0:WRJ1", "robot0:WRJ0",
            "robot0:FFJ3", "robot0:FFJ2", "robot0:FFJ1", "robot0:FFJ0",
            "robot0:MFJ3", "robot0:MFJ2", "robot0:MFJ1", "robot0:MFJ0",
            "robot0:RFJ3", "robot0:RFJ2", "robot0:RFJ1", "robot0:RFJ0",
            "robot0:LFJ4", "robot0:LFJ3", "robot0:LFJ2", "robot0:LFJ1",
            "robot0:LFJ0",
            "robot0:THJ4", "robot0:THJ3", "robot0:THJ2", "robot0:THJ1",
            "robot0:THJ0",
        )
        assert env.unwrapped.sim.model.joint_names == expected_joints

        with ignore_mujoco_warnings():
            for _ in range(20):
                obs, _, _, _ = env.step(env.action_space.nvec // 2)
                is_on_palm.append(on_palm(env.unwrapped.sim))

        # Make sure the mass is right.
        cube_body_idx = env.unwrapped.sim.model.body_name2id("cube:middle")
        assert_allclose(
            env.unwrapped.sim.model.body_subtreemass[cube_body_idx],
            0.078, atol=1e-3)

    assert (
        np.mean(is_on_palm) >= 0.8
    ), "Cube should stay in hand (most of the time) when zero action is sent."
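# `env.action_space.nvec // 2` above picks the mid-point bin of each
# MultiDiscrete dimension, which the test treats as the neutral ("zero")
# action. A small self-contained illustration (the 11-bin space is an
# assumption for the example, not taken from the test):
from gym.spaces import MultiDiscrete

space = MultiDiscrete([11, 11, 11])  # e.g. 11 bins per actuator
neutral = space.nvec // 2            # -> array([5, 5, 5]), the middle bin
assert space.contains(neutral)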
import os

import psutil
import mujoco_py


def _worker(remote, parent_remote, env_fn_wrapper, env_memory_usage=None):
    with mujoco_py.ignore_mujoco_warnings():
        process = psutil.Process(os.getpid())
        parent_remote.close()
        env = env_fn_wrapper.var()
        while True:
            try:
                cmd, data = remote.recv()
                if cmd == 'step':
                    observation, reward, done, info = env.step(data)
                    if done:
                        # Save the final observation where the user can get
                        # it, then reset.
                        info['terminal_observation'] = observation
                        observation = env.reset()
                    remote.send((observation, reward, done, info))
                elif cmd == 'seed':
                    remote.send(env.seed(data))
                elif cmd == 'reset':
                    observation = env.reset()
                    remote.send(observation)
                    if env_memory_usage is not None:
                        # Report this worker's resident set size in GB.
                        env_memory_usage.value = process.memory_info().rss / 1E9
                elif cmd == 'render':
                    remote.send(env.render(data))
                elif cmd == 'close':
                    env.close()
                    remote.close()
                    break
                elif cmd == 'get_spaces':
                    remote.send((env.observation_space, env.action_space))
                elif cmd == 'env_method':
                    method = getattr(env, data[0])
                    remote.send(method(*data[1], **data[2]))
                elif cmd == 'get_attr':
                    remote.send(getattr(env, data))
                elif cmd == 'set_attr':
                    remote.send(setattr(env, data[0], data[1]))
                else:
                    raise NotImplementedError(
                        "`{}` is not implemented in the worker".format(cmd))
            except EOFError:
                break  # the pipe was closed; shut this worker down
            except Exception:
                # A MuJoCo warning was raised as an exception; keep the
                # worker loop alive instead of propagating it.
                continue
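# A minimal sketch of the parent side of the `_worker` command protocol above,
# assuming the same `(cmd, data)` tuples over a multiprocessing Pipe.
# `env_fn_wrapper` is whatever picklable wrapper the caller already uses;
# `demo_parent` is an illustrative name, not part of the original module.
import multiprocessing as mp


def demo_parent(env_fn_wrapper):
    parent_remote, child_remote = mp.Pipe()
    proc = mp.Process(target=_worker,
                      args=(child_remote, parent_remote, env_fn_wrapper))
    proc.start()
    child_remote.close()  # the child process keeps its own handle open

    parent_remote.send(('get_spaces', None))
    _, action_space = parent_remote.recv()

    parent_remote.send(('reset', None))
    obs = parent_remote.recv()

    parent_remote.send(('step', action_space.sample()))
    obs, reward, done, info = parent_remote.recv()

    parent_remote.send(('close', None))
    proc.join()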
def run(self, once=False):
    while True:
        self.process_events()
        self.update_sim(self.env.unwrapped.sim)
        self.add_extra_menu()

        with ignore_mujoco_warnings():
            obs, reward, done, info = self._run_step(self._get_action())
        self.last_step_result = deepcopy((obs, reward, done, info))

        self.add_overlay(const.GRID_BOTTOMRIGHT, "done", str(done))
        self.render()
        self.update_aux_display()

        if once:
            return
def run(self, once=False):
    while True:
        with ignore_mujoco_warnings():
            self.env.step(self.action)

        self.add_overlay(const.GRID_TOPRIGHT,
                         "Reset env; (current seed: {})".format(self.seed),
                         "N - next / P - previous ")
        self.add_overlay(const.GRID_TOPRIGHT, "Apply action",
                         "A (-0.05) / Z (+0.05)")
        self.add_overlay(
            const.GRID_TOPRIGHT,
            "on action index %d out of %d" % (self.action_mod_index,
                                              self.num_action),
            "J / K")
        self.add_overlay(
            const.GRID_BOTTOMRIGHT, "Reset took",
            "%.2f sec." % (sum(self.elapsed) / len(self.elapsed)))
        self.add_overlay(const.GRID_BOTTOMRIGHT, "Action", str(self.action))
        self.render()

        if once:
            return
def fit(environ, env_id, num_timesteps, seed, model_path=None):
    # atari
    if environ == 'atari':
        rank = MPI.COMM_WORLD.Get_rank()
        sess = Model().single_threaded_session()
        sess.__enter__()
        if rank == 0:
            logger.configure()
        else:
            logger.configure(format_strs=[])
        workerseed = (seed + 10000 * MPI.COMM_WORLD.Get_rank()
                      if seed is not None else None)
        set_global_seeds(workerseed)
        env = make_atari(env_id)

        def policy_fn(name, ob_space, ac_space):
            return PPO1Cnn(name=name, ob_space=ob_space, ac_space=ac_space)

        env = Monitor(env, logger.get_dir() and
                      os.path.join(logger.get_dir(), str(rank)))
        env.seed(workerseed)
        env = wrap_deepmind(env)
        env.seed(workerseed)

        pi = PPOSGD(env, policy_fn, env.observation_space, env.action_space,
                    timesteps_per_actorbatch=256, clip_param=0.2,
                    entcoeff=0.01, optim_epochs=4, optim_stepsize=1e-3,
                    optim_batchsize=64, gamma=0.99, lam=0.95,
                    max_timesteps=int(num_timesteps * 1.1), schedule='linear')
        env.close()
        sess.close()
        return pi

    # mujoco
    if environ == 'mujoco':
        from utils.cmd import make_mujoco_env
        sess = Model().init_session(num_cpu=1).__enter__()

        def policy_fn(name, ob_space, ac_space):
            return PPO1Mlp(name=name, ob_space=ob_space, ac_space=ac_space,
                           hid_size=64, num_hid_layers=2)

        env = make_mujoco_env(env_id, seed)
        pi = PPOSGD(env, policy_fn, env.observation_space, env.action_space,
                    max_timesteps=num_timesteps, timesteps_per_actorbatch=2048,
                    clip_param=0.2, entcoeff=0.0, optim_epochs=10,
                    optim_stepsize=3e-4, optim_batchsize=64,
                    gamma=0.99, lam=0.95, schedule='linear')
        env.close()
        sess.close()
        return pi

    # humanoid
    if environ == 'humanoid':
        import gym
        from utils.cmd import make_mujoco_env
        env_id = 'Humanoid-v2'

        class RewScale(gym.RewardWrapper):
            def __init__(self, env, scale):
                gym.RewardWrapper.__init__(self, env)
                self.scale = scale

            def reward(self, r):
                return r * self.scale

        sess = Model().init_session(num_cpu=1).__enter__()

        def policy_fn(name, ob_space, ac_space):
            return PPO1Mlp(name=name, ob_space=ob_space, ac_space=ac_space,
                           hid_size=64, num_hid_layers=2)

        env = make_mujoco_env(env_id, seed)
        # The parameters below were the best found in a simple random search;
        # they are good enough to make the humanoid walk, but whether they are
        # an absolute best is not certain.
        env = RewScale(env, 0.1)
        pi = PPOSGD(env, policy_fn, env.observation_space, env.action_space,
                    max_timesteps=num_timesteps, timesteps_per_actorbatch=2048,
                    clip_param=0.2, entcoeff=0.0, optim_epochs=10,
                    optim_stepsize=3e-4, optim_batchsize=64,
                    gamma=0.99, lam=0.95, schedule='linear')
        env.close()
        if model_path:
            Model().save_state(model_path)
        sess.close()
        return pi

    # robotics
    if environ == 'robotics':
        import mujoco_py
        from utils.cmd import make_robotics_env
        rank = MPI.COMM_WORLD.Get_rank()
        sess = Model().single_threaded_session()
        sess.__enter__()
        # Entered manually (no matching __exit__), so MuJoCo warnings stay
        # suppressed for the remainder of the process.
        mujoco_py.ignore_mujoco_warnings().__enter__()
        workerseed = seed + 10000 * rank
        set_global_seeds(workerseed)
        env = make_robotics_env(env_id, workerseed, rank=rank)

        def policy_fn(name, ob_space, ac_space):
            return PPO1Mlp(name=name, ob_space=ob_space, ac_space=ac_space,
                           hid_size=256, num_hid_layers=3)

        pi = PPOSGD(env, policy_fn, env.observation_space, env.action_space,
                    max_timesteps=num_timesteps, timesteps_per_actorbatch=2048,
                    clip_param=0.2, entcoeff=0.0, optim_epochs=5,
                    optim_stepsize=3e-4, optim_batchsize=256,
                    gamma=0.99, lam=0.95, schedule='linear')
        env.close()
        sess.close()
        return pi