def test_determinism(self):
    task = generate_task(task_generator_id="stacked_blocks")
    observations_v1 = []
    observations_v2 = []
    observations_v3 = []
    rewards_v1 = []
    rewards_v2 = []
    rewards_v3 = []
    horizon = 30

    env_v1 = CausalWorld(task=task, enable_visualization=False, seed=27)
    obs = env_v1.reset()
    observations_v1.append(obs)
    for _ in range(horizon):
        obs, reward, done, info = env_v1.step(env_v1.action_space.low)
        observations_v1.append(obs)
        rewards_v1.append(reward)
    env_v1.close()

    task = generate_task(task_generator_id="stacked_blocks")
    env_v2 = CausalWorld(task=task, enable_visualization=False, seed=27)
    obs = env_v2.reset()
    observations_v2.append(obs)
    for _ in range(horizon):
        obs, reward, done, info = env_v2.step(env_v2.action_space.low)
        observations_v2.append(obs)
        rewards_v2.append(reward)
    env_v2.close()

    task = generate_task(task_generator_id="stacked_blocks")
    env_v3 = CausalWorld(task=task, enable_visualization=False, seed=54)
    obs = env_v3.reset()
    observations_v3.append(obs)
    for _ in range(horizon):
        obs, reward, done, info = env_v3.step(env_v3.action_space.low)
        observations_v3.append(obs)
        rewards_v3.append(reward)
    env_v3.close()

    # Same seed, same fixed actions: trajectories must match exactly.
    assert all(
        np.array_equal(observations_v1[i], observations_v2[i])
        for i in range(horizon))
    assert rewards_v1 == rewards_v2
    # A different seed with the same fixed actions should not change the
    # trajectory either, since the dynamics are deterministic.
    assert all(
        np.array_equal(observations_v1[i], observations_v3[i])
        for i in range(horizon))
    assert rewards_v1 == rewards_v3
def test_parallelism(self):
    task = generate_task(task_generator_id="stacked_blocks")
    env1 = CausalWorld(task=task, enable_visualization=False, seed=0)
    env1.reset()
    task2 = generate_task(task_generator_id="stacked_blocks")
    env2 = CausalWorld(task=task2, enable_visualization=False, seed=0)
    observations_env1_v1, rewards_env1_v1, _, _ = env1.step(
        env1.action_space.low)
    env2.reset()
    observations_env2_v1, rewards_env2_v1, _, _ = env2.step(
        env2.action_space.low)
    env1.close()
    env2.close()
    assert np.array_equal(observations_env2_v1, observations_env1_v1)
    return
def train_policy(num_of_envs, log_relative_path, maximum_episode_length,
                 skip_frame, seed_num, sac_config, total_time_steps,
                 validate_every_timesteps, task_name):
    task = generate_task(task_generator_id=task_name,
                         dense_reward_weights=np.array(
                             [250, 0, 125, 0, 750, 0, 0, 0.005]),
                         fractional_reward_weight=1,
                         goal_height=0.15,
                         tool_block_mass=0.02)
    env = CausalWorld(task=task,
                      skip_frame=skip_frame,
                      enable_visualization=False,
                      seed=seed_num,
                      max_episode_length=maximum_episode_length)
    set_global_seeds(seed_num)
    policy_kwargs = dict(layers=[256, 256])
    checkpoint_callback = CheckpointCallback(
        save_freq=int(validate_every_timesteps / num_of_envs),
        save_path=log_relative_path,
        name_prefix='model')
    model = SAC(MlpPolicy,
                env,
                verbose=1,
                policy_kwargs=policy_kwargs,
                **sac_config,
                seed=seed_num)
    model.learn(total_timesteps=total_time_steps,
                tb_log_name="sac",
                callback=checkpoint_callback)
    return
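# A hedged usage sketch of train_policy above. The sac_config entries, paths
# and other values are illustrative assumptions rather than the benchmark's
# published hyperparameters; the dict keys are simply forwarded to
# stable_baselines' SAC constructor.
sac_config = {
    "gamma": 0.98,
    "learning_rate": 0.00025,
    "buffer_size": 1000000,
    "batch_size": 256
}
train_policy(num_of_envs=1,
             log_relative_path='logs/picking_sac',
             maximum_episode_length=600,
             skip_frame=3,
             seed_num=0,
             sac_config=sac_config,
             total_time_steps=1000000,
             validate_every_timesteps=500000,
             task_name='picking')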
def simulate_policy():
    task = generate_task(task_generator_id='picking')
    env = CausalWorld(task=task,
                      enable_visualization=True,
                      skip_frame=3,
                      seed=0,
                      max_episode_length=600)
    env = GymEnvWrapper(env)
    file = './itr_1097499.pkl'
    data = torch.load(file)
    agent_state_dict = data['agent_state_dict']
    agent = SacAgent(initial_model_state_dict=agent_state_dict)
    agent.initialize(env_spaces=env.spaces)
    agent.eval_mode(itr=data['itr'])

    def policy_func(obs):
        # new_obs = np.hstack((obs['observation'], obs['desired_goal']))
        agent_info = agent.step(torchify_buffer(obs),
                                prev_action=None,
                                prev_reward=None)
        return agent_info.action.numpy()

    # env = HERGoalEnvWrapper(env)
    for _ in range(100):
        total_reward = 0
        o = env.reset()
        for _ in range(600):
            o, reward, done, info = env.step(policy_func(o))
            total_reward += reward
        print("total reward is :", total_reward)
    env.close()
def example():
    # This tutorial shows how to view policies of trained actors
    task = generate_task(task_generator_id='picking')
    world_params = dict()
    world_params["skip_frame"] = 3
    world_params["seed"] = 0
    stable_baselines_policy_path = "./model_2000000_steps.zip"
    model = SAC.load(stable_baselines_policy_path)

    # define a method for the policy fn of your trained model
    def policy_fn(obs):
        return model.predict(obs, deterministic=True)[0]

    # Recording a video of the policy is done in one line
    viewer.record_video_of_policy(task=task,
                                  world_params=world_params,
                                  policy_fn=policy_fn,
                                  file_name="pushing_video",
                                  number_of_resets=10,
                                  max_time_steps=10 * 100)

    # Similarly, interactive visualization in the GUI
    viewer.view_policy(task=task,
                       world_params=world_params,
                       policy_fn=policy_fn,
                       max_time_steps=40 * 600,
                       number_of_resets=40)
def simulate_policy():
    file = './her-sac-fetch-experiment/her-sac-fetch-experiment_2020_07_07_11_11_14_0000--s-0/params.pkl'
    data = torch.load(file)
    policy = data['evaluation/policy']
    policy.reset()

    def policy_func(obs):
        # new_obs = np.hstack((obs['observation'], obs['desired_goal']))
        a, agent_info = policy.get_action(obs)
        return a

    task = generate_task(task_generator_id='reaching')
    env = CausalWorld(task=task,
                      enable_visualization=True,
                      skip_frame=1,
                      seed=0,
                      max_episode_length=2500)
    env = CurriculumWrapper(
        env,
        intervention_actors=[GoalInterventionActorPolicy()],
        actives=[(0, 1000000000, 1, 0)])
    # env = HERGoalEnvWrapper(env)
    for _ in range(100):
        total_reward = 0
        o = env.reset()
        for _ in range(2500):
            o, reward, done, info = env.step(policy_func(o))
            total_reward += reward
        print("total reward is :", total_reward)
    env.close()
def example():
    # This tutorial shows how to view policies of trained actors
    task = generate_task(task_generator_id='pick_and_place')
    world_params = dict()
    world_params["skip_frame"] = 3
    world_params["seed"] = 0
    stable_baselines_policy_path = "./model_100000000_steps.zip"
    model = PPO2.load(stable_baselines_policy_path)

    # define a method for the policy fn of your trained model
    def policy_fn(obs):
        return model.predict(obs, deterministic=True)[0]

    # Interactive visualization in the GUI, with a curriculum of goal
    # interventions applied through the wrapper arguments
    viewer.view_policy(task=task,
                       world_params=world_params,
                       policy_fn=policy_fn,
                       max_time_steps=40 * 600,
                       number_of_resets=40,
                       env_wrappers=[CurriculumWrapper],
                       env_wrappers_args=[{
                           'intervention_actors': [GoalInterventionActorPolicy()],
                           'actives': [(0, 1000000000, 1, 0)]
                       }])
def example():
    # initialize env
    task_gen = generate_task(task_generator_id='pushing')
    env = CausalWorld(task_gen, skip_frame=10, enable_visualization=True)
    # define a custom curriculum of interventions:
    # - no intervention actor before episode 5
    # - goal intervention actor from episode 5 to 10, after reset at time step 0
    # - visual intervention actor from episode 10 to 20, every two episodes,
    #   after reset at time step 0
    # - random intervention actor from episode 20 to 25, after reset at time step 0
    # - goal intervention actor from episode 25 to 30, each episode at time step 50
    env = CurriculumWrapper(env,
                            intervention_actors=[
                                GoalInterventionActorPolicy(),
                                VisualInterventionActorPolicy(),
                                RandomInterventionActorPolicy(),
                                GoalInterventionActorPolicy()
                            ],
                            actives=[(5, 10, 1, 0), (10, 20, 2, 0),
                                     (20, 25, 1, 0), (25, 30, 1, 50)])
    for reset_idx in range(30):
        obs = env.reset()
        for time in range(100):
            desired_action = env.action_space.sample()
            obs, reward, done, info = env.step(action=desired_action)
    env.close()
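# A hedged note on the `actives` argument used above: each tuple is paired
# with the intervention actor at the same index and, as used consistently in
# these examples, reads (episode_start, episode_end, episode_periodicity,
# time_step_of_intervention). A minimal single-actor curriculum that
# intervenes on the goal at every episode reset would therefore be:
#
#   env = CurriculumWrapper(env,
#                           intervention_actors=[GoalInterventionActorPolicy()],
#                           actives=[(0, 1000000000, 1, 0)])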
def train_policy(num_of_envs, log_relative_path, maximum_episode_length,
                 skip_frame, seed_num, her_config, total_time_steps,
                 validate_every_timesteps, task_name):
    task = generate_task(task_generator_id=task_name,
                         dense_reward_weights=np.array([100000, 0, 0, 0]),
                         fractional_reward_weight=0)
    env = CausalWorld(task=task,
                      skip_frame=skip_frame,
                      enable_visualization=False,
                      seed=seed_num,
                      max_episode_length=maximum_episode_length)
    env = HERGoalEnvWrapper(env)
    env = CurriculumWrapper(
        env,
        intervention_actors=[GoalInterventionActorPolicy()],
        actives=[(0, 1000000000, 1, 0)])
    set_global_seeds(seed_num)
    checkpoint_callback = CheckpointCallback(
        save_freq=int(validate_every_timesteps / num_of_envs),
        save_path=log_relative_path,
        name_prefix='model')
    model = HER(MlpPolicy,
                env,
                SAC,
                verbose=1,
                policy_kwargs=dict(layers=[256, 256, 256]),
                **her_config,
                seed=seed_num)
    model.learn(total_timesteps=total_time_steps,
                tb_log_name="her_sac",
                callback=checkpoint_callback)
    return
def setUp(self):
    self.task = generate_task(task_generator_id="reaching")
    self.env = CausalWorld(task=self.task,
                           enable_visualization=False,
                           action_mode='joint_positions',
                           normalize_observations=False,
                           normalize_actions=False)
    return
def _init():
    task = generate_task(task_generator_id=task_name)
    env = CausalWorld(task=task,
                      skip_frame=skip_frame,
                      enable_visualization=False,
                      seed=seed_num + rank,
                      max_episode_length=maximum_episode_length)
    return env
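# A hedged sketch (an assumption about the surrounding file, which is not
# shown above) of how an _init closure like the one above is usually
# consumed: a _make_env(rank) factory captures the per-worker settings and
# returns _init, and stable_baselines' SubprocVecEnv calls one closure per
# worker process. All variable names and values below are illustrative.
from causal_world.envs import CausalWorld
from causal_world.task_generators import generate_task
from stable_baselines.common.vec_env import SubprocVecEnv


def _make_env(rank, task_name='reaching', skip_frame=3, seed_num=0,
              maximum_episode_length=600):

    def _init():
        task = generate_task(task_generator_id=task_name)
        return CausalWorld(task=task,
                           skip_frame=skip_frame,
                           enable_visualization=False,
                           seed=seed_num + rank,
                           max_episode_length=maximum_episode_length)

    return _init


num_of_envs = 2
env = SubprocVecEnv([_make_env(rank=i) for i in range(num_of_envs)])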
def setUp(self):
    self.task = generate_task(task_generator_id="picking")
    self.env = CausalWorld(task=self.task,
                           enable_visualization=False,
                           skip_frame=1,
                           action_mode="end_effector_positions",
                           normalize_actions=False,
                           normalize_observations=False)
    return
def _make_env(rank):
    task = generate_task(task_generator_id='reaching')
    env = CausalWorld(task=task,
                      skip_frame=10,
                      enable_visualization=False,
                      seed=0 + rank,
                      max_episode_length=600)
    env = GymEnvWrapper(env)
    return env
def test_timing_profile(self):
    from pybullet_envs.bullet.kukaGymEnv import KukaGymEnv
    import time
    kuka_env = KukaGymEnv(renders=False, isDiscrete=False)  # operates at 240 Hz
    task = generate_task(task_generator_id="pushing")
    causal_rl_env = CausalWorld(task=task,
                                enable_visualization=False,
                                seed=0,
                                skip_frame=10,
                                normalize_actions=False,
                                normalize_observations=False)  # operates at 250 Hz

    start = time.time()
    kuka_env.reset()
    end = time.time()
    kuka_reset_time = end - start

    start = time.time()
    causal_rl_env.reset()
    end = time.time()
    causal_rl_reset_time = end - start
    self.assertLess(causal_rl_reset_time, kuka_reset_time * 1.25)

    start = time.time()
    kuka_env.step(kuka_env.action_space.sample())
    end = time.time()
    kuka_step_time = end - start

    start = time.time()
    causal_rl_env.step(causal_rl_env.action_space.sample())
    end = time.time()
    causal_rl_step_time = end - start
    print("CausalWorld step time", causal_rl_step_time)
    print("Kuka step time", kuka_step_time)
    self.assertLess(causal_rl_step_time, kuka_step_time * 10)

    start = time.time()
    kuka_env.render()
    end = time.time()
    kuka_render_time = end - start

    start = time.time()
    causal_rl_env.render()
    end = time.time()
    causal_rl_render_time = end - start
    self.assertLess(causal_rl_render_time, kuka_render_time * 1.25)

    causal_rl_env.close()
    kuka_env.close()
    return
def goal_interventions():
    task = generate_task(task_generator_id='stacked_blocks')
    env = CausalWorld(task=task, enable_visualization=True)
    env.reset()
    for _ in range(10):
        for i in range(200):
            obs, reward, done, info = env.step(env.action_space.sample())
        goal_intervention_dict = env.sample_new_goal()
        print("new goal chosen: ", goal_intervention_dict)
        success_signal, obs = env.do_intervention(goal_intervention_dict)
        print("Goal intervention success signal", success_signal)
    env.close()
def example():
    task = generate_task(task_generator_id='picking')
    env = CausalWorld(task=task, enable_visualization=True)
    env.reset()
    for _ in range(50):
        random_intervention_dict, success_signal, obs = \
            env.do_single_random_intervention()
        print("The random intervention performed is ",
              random_intervention_dict)
        for i in range(100):
            obs, reward, done, info = env.step(env.action_space.sample())
    env.close()
def example():
    # This tutorial shows how to record a video of a random policy on the
    # picking task
    task = generate_task(task_generator_id='picking')
    world_params = dict()
    world_params["skip_frame"] = 3
    world_params["seed"] = 200
    viewer.record_video_of_random_policy(task=task,
                                         world_params=world_params,
                                         file_name="picking_video",
                                         number_of_resets=1,
                                         max_time_steps=300)
def _init():
    task = generate_task(task_generator_id=task_name,
                         dense_reward_weights=np.array(
                             [250, 0, 125, 0, 750, 0, 0, 0.005]),
                         fractional_reward_weight=1,
                         goal_height=0.15,
                         tool_block_mass=0.02)
    env = CausalWorld(task=task,
                      skip_frame=skip_frame,
                      enable_visualization=False,
                      seed=seed_num + rank,
                      max_episode_length=maximum_episode_length)
    return env
def without_intervention_split():
    task = generate_task(task_generator_id='pushing')
    env = CausalWorld(task=task, enable_visualization=True)
    env.reset()
    for _ in range(2):
        for i in range(200):
            obs, reward, done, info = env.step(env.action_space.sample())
        success_signal, obs = env.do_intervention(
            {'stage_color': np.random.uniform(0, 1, [3])})
        print("Intervention success signal", success_signal)
    env.close()
def example():
    # Here you learn how to record/log entire episodes into a directory
    # to reuse later, e.g. for reviewing logged episodes or using this
    # data for pre-training policies.

    # Construct a data_recorder that keeps track of every change in the
    # environment. Episodes are dumped to a log file every 11 episodes
    # (the default is 100).
    data_recorder = DataRecorder(output_directory='pushing_episodes',
                                 rec_dumb_frequency=11)

    # Pass the data recorder to the World
    task = generate_task(task_generator_id='pushing')
    env = CausalWorld(task=task,
                      enable_visualization=True,
                      data_recorder=data_recorder)

    # Record some episodes
    for _ in range(23):
        env.reset()
        for _ in range(50):
            env.step(env.action_space.sample())
    env.close()

    # Load the logged episodes
    data = DataLoader(episode_directory='pushing_episodes')
    episode = data.get_episode(14)

    # Initialize a new environment according to a specific episode and replay it
    task = generate_task(episode.task_name, **episode.task_params)
    env = CausalWorld(task, **episode.world_params, enable_visualization=True)
    env.set_starting_state(episode.initial_full_state, check_bounds=False)
    for action in episode.robot_actions:
        env.step(action)
    env.close()

    # You can achieve the same with the viewer module in one line
    viewer.view_episode(episode)
def _make_env(rank):
    task = generate_task(task_generator_id='picking',
                         dense_reward_weights=np.array(
                             [250, 0, 125, 0, 750, 0, 0, 0.005]),
                         fractional_reward_weight=1,
                         goal_height=0.15,
                         tool_block_mass=0.02)
    env = CausalWorld(task=task,
                      skip_frame=3,
                      enable_visualization=False,
                      seed=0,
                      max_episode_length=600)
    env = GymEnvWrapper(env)
    return env
def get_world(task_generator_id,
              task_params,
              world_params,
              enable_visualization=False,
              env_wrappers=np.array([]),
              env_wrappers_args=np.array([])):
    """
    Returns a particular CausalWorld instance with optional wrappers

    :param task_generator_id: (str) id of the task of the environment
    :param task_params: (dict) task params of the environment
    :param world_params: (dict) world_params of the environment
    :param enable_visualization: (bool) if GUI visualization is enabled
    :param env_wrappers: (list) a list of gym wrappers
    :param env_wrappers_args: (list) a list of kwargs for the gym wrappers

    :return: (CausalWorld) a CausalWorld environment instance
    """
    world_params["skip_frame"] = 1
    if task_params is None:
        task = generate_task(task_generator_id)
    else:
        if "task_name" in task_params:
            del task_params["task_name"]
        task = generate_task(task_generator_id, **task_params)
    if "enable_visualization" in world_params.keys():
        world_params_temp = dict(world_params)
        del world_params_temp["enable_visualization"]
        env = CausalWorld(task,
                          **world_params_temp,
                          enable_visualization=enable_visualization)
    else:
        env = CausalWorld(task,
                          **world_params,
                          enable_visualization=enable_visualization)
    for i in range(len(env_wrappers)):
        env = env_wrappers[i](env, **env_wrappers_args[i])
    return env
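# A hedged usage sketch of get_world above (argument values are illustrative
# assumptions, not taken from the library). Note that get_world overrides
# skip_frame to 1 inside world_params before constructing the environment.
env = get_world('pushing',
                task_params=None,
                world_params={'seed': 0},
                enable_visualization=False)
obs = env.reset()
env.close()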
def _make_env(rank):
    task = generate_task('pushing',
                         dense_reward_weights=np.array([2500, 2500, 0]),
                         variables_space='space_a',
                         fractional_reward_weight=100)
    env = CausalWorld(task=task,
                      skip_frame=3,
                      enable_visualization=False,
                      seed=0 + rank)
    env = CurriculumWrapper(
        env,
        intervention_actors=[GoalInterventionActorPolicy()],
        # actives expects one (start, end, periodicity, time_step) tuple
        # per intervention actor
        actives=[(0, 1e9, 2, 0)])
    env = GymEnvWrapper(env)
    return env
def end_effector_pos():
    task = generate_task(task_generator_id='reaching')
    env = CausalWorld(task=task,
                      enable_visualization=True,
                      action_mode="joint_positions",
                      normalize_actions=False,
                      normalize_observations=False)
    obs = env.reset()
    for _ in range(100):
        goal_dict = env.sample_new_goal()
        success_signal, obs = env.do_intervention(goal_dict)
        obs, reward, done, info = env.step(control_policy(env, obs))
        for _ in range(250):
            obs, reward, done, info = env.step(control_policy(env, obs))
    env.close()
def example():
    task = generate_task(task_generator_id='picking')
    env = CausalWorld(task=task, enable_visualization=True)
    env.set_starting_state(
        {'goal_block': {
            'cartesian_position': [0.1, 0.1, 0.1]
        }})
    for _ in range(500):
        obs, reward, done, info = env.step(env.action_space.sample())
    env.reset_default_state()
    for _ in range(500):
        obs, reward, done, info = env.step(env.action_space.sample())
    env.reset()
    for _ in range(500):
        obs, reward, done, info = env.step(env.action_space.sample())
    env.close()
def _init():
    task = generate_task(task_generator_id=task_name,
                         dense_reward_weights=np.array([100000, 0, 0, 0]),
                         fractional_reward_weight=0)
    env = CausalWorld(task=task,
                      skip_frame=skip_frame,
                      enable_visualization=False,
                      seed=seed_num + rank,
                      max_episode_length=maximum_episode_length)
    env = CurriculumWrapper(
        env,
        intervention_actors=[GoalInterventionActorPolicy()],
        actives=[(0, 1000000000, 1, 0)])
    return env
def with_intervention_split_2():
    task = generate_task(task_generator_id='pushing',
                         variables_space='space_b')
    env = CausalWorld(task=task, enable_visualization=False)
    interventions_space = task.get_intervention_space_a()
    env.reset()
    for _ in range(2):
        for i in range(200):
            obs, reward, done, info = env.step(env.action_space.sample())
        success_signal, obs = env.do_intervention({
            'stage_color':
                np.random.uniform(interventions_space['stage_color'][0],
                                  interventions_space['stage_color'][1])
        })
        print("Intervention success signal", success_signal)
    env.close()
def test_pd_gains():
    # control the robot using the pd controller
    np.random.seed(0)
    task = generate_task(task_generator_id='pushing')
    skip_frame = 1
    env = CausalWorld(task=task,
                      enable_visualization=False,
                      skip_frame=skip_frame,
                      normalize_observations=False,
                      normalize_actions=False,
                      seed=0)
    zero_hold = int(5000 / skip_frame)  # reach desired position in 4 secs?
    obs = env.reset()
    # test bounds first
    for _ in range(zero_hold):
        chosen_action = np.zeros(9)
        obs, reward, done, info = env.step(chosen_action)
    current_joint_positions = obs[1:10]
    if ((current_joint_positions - chosen_action) > 0.1).any():
        raise AssertionError(
            "The pd controller failed to reach these values {} but reached instead {}"
            .format(chosen_action, current_joint_positions))
    for _ in range(zero_hold):
        chosen_action = env.action_space.high
        obs, reward, done, info = env.step(chosen_action)
    current_joint_positions = obs[1:10]
    if ((current_joint_positions - chosen_action) > 0.1).any():
        raise AssertionError(
            "The pd controller failed to reach these values {} but reached instead {}"
            .format(chosen_action, current_joint_positions))
    # for i in range(200):
    #     # check for first finger
    #     chosen_action = np.random.uniform(env.action_space.low,
    #                                       env.action_space.high)
    #     chosen_action[3:] = env.action_space.low[3:]
    #     chosen_action[1] = 0
    #     chosen_action[2] = 0
    #     for _ in range(zero_hold):
    #         obs, reward, done, info = env.step(chosen_action)
    #     current_joint_positions = obs[:9]
    #     if ((current_joint_positions - chosen_action) > 0.1).any():
    #         raise AssertionError(
    #             "The pd controller failed to reach these values {} but reached "
    #             "instead {}".format(chosen_action, current_joint_positions))
    env.close()
def example():
    task = generate_task(task_generator_id='pick_and_place')
    env = CausalWorld(task=task, enable_visualization=True)
    env.reset()
    intervention_space = env.get_variable_space_used()
    for _ in range(100):
        for i in range(200):
            obs, reward, done, info = env.step(env.action_space.low)
        intervention = {
            'tool_block': {
                'size':
                    np.random.uniform(
                        intervention_space['tool_block']['size'][0],
                        intervention_space['tool_block']['size'][1])
            }
        }
        env.do_intervention(intervention)
    env.close()
def smooth_action():
    task = generate_task(task_generator_id='reaching')
    env = CausalWorld(task=task,
                      enable_visualization=True,
                      action_mode="joint_positions",
                      normalize_actions=True,
                      normalize_observations=True,
                      skip_frame=1)
    env = MovingAverageActionEnvWrapper(env)
    for _ in range(50):
        obs = env.reset()
        for _ in range(1000):
            desired_action = np.zeros(9)
            obs, reward, done, info = env.step(desired_action)
    env.close()