def main():
    register_all_envs()
    # env = PickAndPlaceEnv(
    #     # Environment dynamics
    #     action_scale=1.0,
    #     boundary_dist=4,
    #     ball_radius=1.5,
    #     object_radius=1.,
    #     ball_visual_radius=1.5,
    #     object_visual_radius=1.,
    #     min_grab_distance=1.,
    #     walls=None,
    #     # Rewards
    #     action_l2norm_penalty=0,
    #     reward_type="dense",
    #     success_threshold=0.60,
    #     # Reset settings
    #     fixed_goal=None,
    #     # Visualization settings
    #     images_are_rgb=True,
    #     render_dt_msec=0,
    #     render_onscreen=False,
    #     render_size=84,
    #     show_goal=False,
    #     goal_samplers=None,
    #     goal_sampling_mode='random',
    #     num_presampled_goals=10000,
    #     object_reward_only=False,
    #     # init_position_strategy='random',
    #     num_objects=1,
    # )
    env = gym.make('OneObject-PickAndPlace-BigBall-RandomInit-2D-v1')
    renderer = EnvRenderer(
        output_image_format='CHW',
        width=28,
        height=28,
    )
    n = 12800
    imgs = []
    for _ in range(n):
        env.reset()
        img = renderer(env)
        # cv2.imshow('img', img.transpose())
        # cv2.waitKey(100)
        imgs.append(img)
    imgs = np.array(imgs)
    np.save(
        '/home/vitchyr/mnt/log/manual-upload/sets/OneObject-PickAndPlace-BigBall-RandomInit-2D-v1-ungrouped-train-28x28.npy',
        imgs,
    )

def __init__(self):
    multiworld.register_all_envs()
    self.internal_world = gym.make('HalfCheetahGoal-v0')
    self.max_speed = 6
    self.action_space = self.internal_world.action_space
    self.goal_space = self.internal_world.goal_space
    self.observation_space = self.internal_world.obs_space
    self.distance_threshold = 0.1
    self.seed()
    self.reset()

def get_gym_env(env_id, env_class=None, env_kwargs=None):
    if env_kwargs is None:
        env_kwargs = {}
    assert env_id or env_class
    if env_id:
        import gym
        import multiworld
        multiworld.register_all_envs()
        env = gym.make(env_id)
    else:
        env = env_class(**env_kwargs)
    return env

def __init__(self):
    multiworld.register_all_envs()
    self.internal_world = gym.make('AntXY-v0')
    self.internal_world.include_contact_forces_in_state = False
    self.distance_threshold = 0.1
    self.goal_limit = 2
    # Goal space spans [-goal_limit, goal_limit] in x and y; the lower bound
    # must be negated or Box(low, high) collapses to a single point.
    self.goal_low = -self.goal_limit * np.ones(2)
    self.goal_high = self.goal_limit * np.ones(2)
    self.goal_ = None
    self.goal_space = Box(self.goal_low, self.goal_high)
    self.action_space = self.internal_world.action_space
    self.observation_space_low = np.array([-np.inf for _ in range(29)])
    self.observation_space_high = np.array([np.inf for _ in range(29)])
    self.observation_space = Box(self.observation_space_low,
                                 self.observation_space_high)
    self.seed()
    self.reset()

def get_gym_env(
        env_id,
        env_class=None,
        env_kwargs=None,
        unwrap_timed_envs=False,
):
    if env_kwargs is None:
        env_kwargs = {}
    assert env_id or env_class
    if env_id:
        import gym
        import multiworld
        multiworld.register_all_envs()
        env = gym.make(env_id)
    else:
        env = env_class(**env_kwargs)
    if isinstance(env, TimeLimit) and unwrap_timed_envs:
        env = env.env
    return env

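# Usage sketch (illustrative only; 'SawyerPushNIPSEasy-v0' is assumed to be one
# of the multiworld ids registered by register_all_envs(), and TimeLimit is the
# gym.wrappers.TimeLimit class imported by this module):
#
#   env = get_gym_env('SawyerPushNIPSEasy-v0', unwrap_timed_envs=True)
#   obs = env.reset()
#
# Passing a class instead of an id skips gym registration entirely
# (MyEnv here is a hypothetical environment class):
#
#   env = get_gym_env(None, env_class=MyEnv, env_kwargs={'reward_type': 'dense'})
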
def train_bg(variant):
    import numpy as np
    invisiable_env_id = {
        'SawyerPushNIPSEasy-v0': 'SawyerPushNIPSPuckInvisible-v0',
        'SawyerPushHurdle-v0': 'SawyerPushHurdlePuckInvisible-v0',
        'SawyerPushHurdleMiddle-v0': 'SawyerPushHurdleMiddlePuckInvisible-v0',
        'SawyerDoorHookResetFreeEnv-v1': 'SawyerDoorHookResetFreeEnvDoorInvisible-v0',
        'SawyerPickupEnvYZEasy-v0': 'SawyerPickupResetFreeEnvBallInvisible-v0',
    }
    print("training opencv background model!")
    env_id = variant.get('env_id', None)
    env_id_invis = invisiable_env_id[env_id]
    import gym
    import multiworld
    from multiworld.core.image_env import ImageEnv
    multiworld.register_all_envs()
    obj_invisible_env = gym.make(env_id_invis)
    init_camera = variant.get('init_camera', None)

    presampled_goals = None
    if variant.get("presampled_goals_path") is not None:
        presampled_goals = np.load(variant['presampled_goals_path'],
                                   allow_pickle=True).item()
        print("presampled goal path is: ", variant['presampled_goals_path'])
        # print("presampled goals are: ", presampled_goals)

    obj_invisible_env = ImageEnv(
        obj_invisible_env,
        variant.get('imsize'),
        init_camera=init_camera,
        transpose=True,
        normalize=True,
        presampled_goals=presampled_goals,
    )
    train_bgsb(obj_invisible_env)

def generate_vae_dataset(variant):
    env_class = variant.get('env_class', None)
    env_kwargs = variant.get('env_kwargs', None)
    env_id = variant.get('env_id', None)
    N = variant.get('N', 10000)
    test_p = variant.get('test_p', 0.9)
    use_cached = variant.get('use_cached', True)
    imsize = variant.get('imsize', 84)
    num_channels = variant.get('num_channels', 3)
    show = variant.get('show', False)
    init_camera = variant.get('init_camera', None)
    dataset_path = variant.get('dataset_path', None)
    oracle_dataset_using_set_to_goal = variant.get(
        'oracle_dataset_using_set_to_goal', False)
    random_rollout_data = variant.get('random_rollout_data', False)
    random_and_oracle_policy_data = variant.get(
        'random_and_oracle_policy_data', False)
    random_and_oracle_policy_data_split = variant.get(
        'random_and_oracle_policy_data_split', 0)
    policy_file = variant.get('policy_file', None)
    n_random_steps = variant.get('n_random_steps', 100)
    vae_dataset_specific_env_kwargs = variant.get(
        'vae_dataset_specific_env_kwargs', None)
    save_file_prefix = variant.get('save_file_prefix', None)
    non_presampled_goal_img_is_garbage = variant.get(
        'non_presampled_goal_img_is_garbage', None)
    tag = variant.get('tag', '')
    from multiworld.core.image_env import ImageEnv, unormalize_image
    import rlkit.torch.pytorch_util as ptu
    info = {}
    if dataset_path is not None:
        dataset = load_local_or_remote_file(dataset_path)
        N = dataset.shape[0]
    else:
        if env_kwargs is None:
            env_kwargs = {}
        if save_file_prefix is None:
            save_file_prefix = env_id
        if save_file_prefix is None:
            save_file_prefix = env_class.__name__
        filename = "/tmp/{}_N{}_{}_imsize{}_random_oracle_split_{}{}.npy".format(
            save_file_prefix,
            str(N),
            init_camera.__name__ if init_camera else '',
            imsize,
            random_and_oracle_policy_data_split,
            tag,
        )
        if use_cached and osp.isfile(filename):
            dataset = np.load(filename)
            print("loaded data from saved file", filename)
        else:
            now = time.time()
            if env_id is not None:
                import gym
                import multiworld
                multiworld.register_all_envs()
                env = gym.make(env_id)
            else:
                if vae_dataset_specific_env_kwargs is None:
                    vae_dataset_specific_env_kwargs = {}
                for key, val in env_kwargs.items():
                    if key not in vae_dataset_specific_env_kwargs:
                        vae_dataset_specific_env_kwargs[key] = val
                env = env_class(**vae_dataset_specific_env_kwargs)
            if not isinstance(env, ImageEnv):
                env = ImageEnv(
                    env,
                    imsize,
                    init_camera=init_camera,
                    transpose=True,
                    normalize=True,
                    non_presampled_goal_img_is_garbage=non_presampled_goal_img_is_garbage,
                )
            else:
                imsize = env.imsize
                env.non_presampled_goal_img_is_garbage = non_presampled_goal_img_is_garbage
            env.reset()
            info['env'] = env
            if random_and_oracle_policy_data:
                policy_file = load_local_or_remote_file(policy_file)
                policy = policy_file['policy']
                policy.to(ptu.device)
            if random_rollout_data:
                from rlkit.exploration_strategies.ou_strategy import OUStrategy
                policy = OUStrategy(env.action_space)

            dataset = np.zeros((N, imsize * imsize * num_channels),
                               dtype=np.uint8)
            # NOTE: this loop writes one (img_1, img_2, instr) triple per
            # iteration to its own .npy file; `dataset` itself is never filled
            # in and is saved below as all zeros.
            for i in range(10000):
                NP = []
                if oracle_dataset_using_set_to_goal:
                    print(i)
                    goal = env.sample_goal()
                    env.set_to_goal(goal)
                    obs = env._get_obs()
                    img_1 = obs['image_observation']
                    NP.append(img_1)
                    img_1 = img_1.reshape(3, imsize, imsize).transpose()
                    if i % 3 == 0:
                        cv2.imshow('img1', img_1)
                        cv2.waitKey(1)
                    env.reset()
                    instr = env.generate_new_state(goal)
                    if i % 3 == 0:
                        print(instr)
                    obs = env._get_obs()
                    img_2 = obs['image_observation']
                    NP.append(img_2)
                    NP.append(instr)
                    img_2 = img_2.reshape(3, imsize, imsize).transpose()
                    if i % 3 == 0:
                        cv2.imshow('img2', img_2)
                        cv2.waitKey(1)
                NP = np.array(NP)
                print(NP)
                idx = str(i)
                name = "/home/xiaomin/Downloads/IFIG_DATA_1/" + idx + ".npy"
                np.save(open(name, 'wb'), NP)
            print("done making training data", filename, time.time() - now)
            np.save(filename, dataset)
    n = int(N * test_p)
    train_dataset = dataset[:n, :]
    test_dataset = dataset[n:, :]
    return train_dataset, test_dataset, info

def HER_baseline_td3_experiment(variant):
    import rlkit.torch.pytorch_util as ptu
    from rlkit.data_management.obs_dict_replay_buffer import \
        ObsDictRelabelingBuffer
    from rlkit.exploration_strategies.base import (
        PolicyWrappedWithExplorationStrategy)
    from rlkit.torch.her.her_td3 import HerTd3
    from rlkit.torch.networks import MergedCNN, CNNPolicy
    import torch
    from multiworld.core.image_env import ImageEnv
    from rlkit.misc.asset_loader import load_local_or_remote_file

    init_camera = variant.get("init_camera", None)
    presample_goals = variant.get('presample_goals', False)
    presampled_goals_path = get_presampled_goals_path(
        variant.get('presampled_goals_path', None))

    if 'env_id' in variant:
        import gym
        import multiworld
        multiworld.register_all_envs()
        env = gym.make(variant['env_id'])
    else:
        env = variant["env_class"](**variant['env_kwargs'])
    image_env = ImageEnv(
        env,
        variant.get('imsize'),
        reward_type='image_sparse',
        init_camera=init_camera,
        transpose=True,
        normalize=True,
    )
    if presample_goals:
        if presampled_goals_path is None:
            image_env.non_presampled_goal_img_is_garbage = True
            presampled_goals = variant['generate_goal_dataset_fctn'](
                env=image_env, **variant['goal_generation_kwargs'])
        else:
            presampled_goals = load_local_or_remote_file(
                presampled_goals_path).item()
        del image_env
        env = ImageEnv(
            env,
            variant.get('imsize'),
            reward_type='image_distance',
            init_camera=init_camera,
            transpose=True,
            normalize=True,
            presampled_goals=presampled_goals,
        )
    else:
        env = image_env
    es = get_exploration_strategy(variant, env)
    observation_key = variant.get('observation_key', 'image_observation')
    desired_goal_key = variant.get('desired_goal_key', 'image_desired_goal')
    achieved_goal_key = desired_goal_key.replace("desired", "achieved")
    imsize = variant['imsize']
    action_dim = env.action_space.low.size
    qf1 = MergedCNN(input_width=imsize,
                    input_height=imsize,
                    output_size=1,
                    input_channels=3 * 2,
                    added_fc_input_size=action_dim,
                    **variant['cnn_params'])
    qf2 = MergedCNN(input_width=imsize,
                    input_height=imsize,
                    output_size=1,
                    input_channels=3 * 2,
                    added_fc_input_size=action_dim,
                    **variant['cnn_params'])
    policy = CNNPolicy(
        input_width=imsize,
        input_height=imsize,
        added_fc_input_size=0,
        output_size=action_dim,
        input_channels=3 * 2,
        output_activation=torch.tanh,
        **variant['cnn_params'],
    )
    target_qf1 = MergedCNN(input_width=imsize,
                           input_height=imsize,
                           output_size=1,
                           input_channels=3 * 2,
                           added_fc_input_size=action_dim,
                           **variant['cnn_params'])
    target_qf2 = MergedCNN(input_width=imsize,
                           input_height=imsize,
                           output_size=1,
                           input_channels=3 * 2,
                           added_fc_input_size=action_dim,
                           **variant['cnn_params'])
    target_policy = CNNPolicy(
        input_width=imsize,
        input_height=imsize,
        added_fc_input_size=0,
        output_size=action_dim,
        input_channels=3 * 2,
        output_activation=torch.tanh,
        **variant['cnn_params'],
    )
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = ObsDictRelabelingBuffer(
        env=env,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        achieved_goal_key=achieved_goal_key,
        **variant['replay_buffer_kwargs'])
    algo_kwargs = variant['algo_kwargs']
    algo_kwargs['replay_buffer'] = replay_buffer
    base_kwargs = algo_kwargs['base_kwargs']
    base_kwargs['training_env'] = env
    base_kwargs['render'] = variant["render"]
    base_kwargs['render_during_eval'] = variant["render"]
    her_kwargs = algo_kwargs['her_kwargs']
    her_kwargs['observation_key'] = observation_key
    her_kwargs['desired_goal_key'] = desired_goal_key
    algorithm = HerTd3(env,
                       qf1=qf1,
                       qf2=qf2,
                       policy=policy,
                       target_qf1=target_qf1,
                       target_qf2=target_qf2,
                       target_policy=target_policy,
                       exploration_policy=exploration_policy,
                       **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()

def generate_vae_dataset(variant):
    env_class = variant.get('env_class', None)
    env_kwargs = variant.get('env_kwargs', None)
    env_id = variant.get('env_id', None)
    N = variant.get('N', 10000)
    test_p = variant.get('test_p', 0.9)
    use_cached = variant.get('use_cached', True)
    imsize = variant.get('imsize', 84)
    num_channels = variant.get('num_channels', 3)
    show = variant.get('show', False)
    init_camera = variant.get('init_camera', None)
    dataset_path = variant.get('dataset_path', None)
    oracle_dataset_using_set_to_goal = variant.get(
        'oracle_dataset_using_set_to_goal', False)
    random_rollout_data = variant.get('random_rollout_data', False)
    random_and_oracle_policy_data = variant.get(
        'random_and_oracle_policy_data', False)
    random_and_oracle_policy_data_split = variant.get(
        'random_and_oracle_policy_data_split', 0)
    policy_file = variant.get('policy_file', None)
    n_random_steps = variant.get('n_random_steps', 100)
    vae_dataset_specific_env_kwargs = variant.get(
        'vae_dataset_specific_env_kwargs', None)
    save_file_prefix = variant.get('save_file_prefix', None)
    non_presampled_goal_img_is_garbage = variant.get(
        'non_presampled_goal_img_is_garbage', None)
    tag = variant.get('tag', '')
    from multiworld.core.image_env import ImageEnv, unormalize_image
    import rlkit.torch.pytorch_util as ptu
    info = {}
    if dataset_path is not None:
        dataset = load_local_or_remote_file(dataset_path)
        N = dataset.shape[0]
    else:
        if env_kwargs is None:
            env_kwargs = {}
        if save_file_prefix is None:
            save_file_prefix = env_id
        if save_file_prefix is None:
            save_file_prefix = env_class.__name__
        filename = "/tmp/{}_N{}_{}_imsize{}_random_oracle_split_{}{}.npy".format(
            save_file_prefix,
            str(N),
            init_camera.__name__ if init_camera else '',
            imsize,
            random_and_oracle_policy_data_split,
            tag,
        )
        if use_cached and osp.isfile(filename):
            dataset = np.load(filename)
            print("loaded data from saved file", filename)
        else:
            now = time.time()
            if env_id is not None:
                import gym
                import multiworld
                multiworld.register_all_envs()
                env = gym.make(env_id)
            else:
                if vae_dataset_specific_env_kwargs is None:
                    vae_dataset_specific_env_kwargs = {}
                for key, val in env_kwargs.items():
                    if key not in vae_dataset_specific_env_kwargs:
                        vae_dataset_specific_env_kwargs[key] = val
                env = env_class(**vae_dataset_specific_env_kwargs)
            if not isinstance(env, ImageEnv):
                env = ImageEnv(
                    env,
                    imsize,
                    init_camera=init_camera,
                    transpose=True,
                    normalize=True,
                    non_presampled_goal_img_is_garbage=non_presampled_goal_img_is_garbage,
                )
            else:
                imsize = env.imsize
                env.non_presampled_goal_img_is_garbage = non_presampled_goal_img_is_garbage
            env.reset()
            info['env'] = env
            if random_and_oracle_policy_data:
                policy_file = load_local_or_remote_file(policy_file)
                policy = policy_file['policy']
                policy.to(ptu.device)
            if random_rollout_data:
                from rlkit.exploration_strategies.ou_strategy import OUStrategy
                policy = OUStrategy(env.action_space)

            dataset = np.zeros((N, imsize * imsize * num_channels),
                               dtype=np.uint8)
            for i in range(N):
                if random_and_oracle_policy_data:
                    num_random_steps = int(
                        N * random_and_oracle_policy_data_split)
                    if i < num_random_steps:
                        env.reset()
                        for _ in range(n_random_steps):
                            obs = env.step(env.action_space.sample())[0]
                    else:
                        obs = env.reset()
                        policy.reset()
                        for _ in range(n_random_steps):
                            policy_obs = np.hstack((
                                obs['state_observation'],
                                obs['state_desired_goal'],
                            ))
                            action, _ = policy.get_action(policy_obs)
                            obs, _, _, _ = env.step(action)
                elif oracle_dataset_using_set_to_goal:
                    print(i)
                    goal = env.sample_goal()
                    env.set_to_goal(goal)
                    obs = env._get_obs()
                elif random_rollout_data:
                    if i % n_random_steps == 0:
                        g = dict(
                            state_desired_goal=env.sample_goal_for_rollout())
                        env.set_to_goal(g)
                        policy.reset()
                        # env.reset()
                    u = policy.get_action_from_raw_action(
                        env.action_space.sample())
                    obs = env.step(u)[0]
                else:
                    env.reset()
                    for _ in range(n_random_steps):
                        obs = env.step(env.action_space.sample())[0]
                img = obs['image_observation']
                dataset[i, :] = unormalize_image(img)
                if show:
                    img = img.reshape(3, imsize, imsize).transpose()
                    img = img[::-1, :, ::-1]
                    cv2.imshow('img', img)
                    cv2.waitKey(1)
                    # radius = input('waiting...')
            print("done making training data", filename, time.time() - now)
            np.save(filename, dataset)
    n = int(N * test_p)
    train_dataset = dataset[:n, :]
    test_dataset = dataset[n:, :]
    return train_dataset, test_dataset, info

from PIL import Image
import timeit

from rlkit.torch.her.her import HERTrainer
from rlkit.torch.sac.policies import MakeDeterministic
from rlkit.torch.sac.sac import SACTrainer
from rlkit.util.io import load_local_or_remote_file
from rlkit.util.video import dump_video
import gym
import multiworld
from rlkit.torch.vae.vae_trainer import ConvVAETrainer
from multiworld.core.image_env import ImageEnv, unormalize_image
from rlkit.samplers.data_collector.vae_env import VAEWrappedEnvPathCollector
from rlkit.torch.skewfit.online_vae_algorithm import OnlineVaeAlgorithm

multiworld.register_all_envs()


def skewfit_full_experiment(variant):
    # variant['skewfit_variant']['save_vae_data'] = True
    # full_experiment_variant_preprocess(variant)
    train_vae_and_update_variant(variant)
    skewfit_experiment(variant)


def full_experiment_variant_preprocess(variant):
    train_vae_variant = variant['train_vae_variant']
    skewfit_variant = variant['skewfit_variant']
    env_id = variant['env_id']
    init_camera = variant.get('init_camera', None)
    image_size = variant.get('image_size', 84)

def train_vae_and_update_variant(variant):
    # actually pretrain the VAE and ROLL
    skewfit_variant = variant['skewfit_variant']
    train_vae_variant = variant['train_vae_variant']

    # prepare the background subtractor needed to perform segmentation
    if 'unet' in skewfit_variant['segmentation_method']:
        print("training opencv background model!")
        v = train_vae_variant['generate_lstm_dataset_kwargs']
        env_id = v.get('env_id', None)
        env_id_invis = invisiable_env_id[env_id]
        import gym
        import multiworld
        multiworld.register_all_envs()
        obj_invisible_env = gym.make(env_id_invis)
        init_camera = v.get('init_camera', None)

        presampled_goals = None
        if skewfit_variant.get("presampled_goals_path") is not None:
            presampled_goals = load_local_or_remote_file(
                skewfit_variant['presampled_goals_path']).item()
            print("presampled goal path is: ",
                  skewfit_variant['presampled_goals_path'])

        obj_invisible_env = ImageEnv(
            obj_invisible_env,
            v.get('imsize'),
            init_camera=init_camera,
            transpose=True,
            normalize=True,
            presampled_goals=presampled_goals,
        )

        train_num = 2000 if 'Push' in env_id else 4000
        train_bgsb(obj_invisible_env, train_num=train_num)

    if skewfit_variant.get('vae_path', None) is None:
        # train new VAEs
        logger.remove_tabular_output('progress.csv',
                                     relative_to_snapshot_dir=True)
        # one original VAE, one segmented for ROLL
        vaes, vae_train_datas, vae_test_datas = train_vae(
            train_vae_variant,
            skewfit_variant=skewfit_variant,
            return_data=True)
        if skewfit_variant.get('save_vae_data', False):
            skewfit_variant['vae_train_data'] = vae_train_datas
            skewfit_variant['vae_test_data'] = vae_test_datas
        logger.add_tabular_output(
            'progress.csv',
            relative_to_snapshot_dir=True,
        )
        skewfit_variant['vae_path'] = vaes  # just pass the VAEs directly
    else:
        # load pre-trained VAEs
        print("load pretrained scene-/object-VAE from: {}".format(
            skewfit_variant['vae_path']))
        data = torch.load(osp.join(skewfit_variant['vae_path'], 'params.pkl'))
        vae_original = data['vae_original']
        vae_segmented = data['lstm_segmented']
        skewfit_variant['vae_path'] = [vae_segmented, vae_original]

        generate_vae_dataset_fctn = train_vae_variant.get(
            'generate_vae_data_fctn', generate_vae_dataset)
        generate_lstm_dataset_fctn = train_vae_variant.get(
            'generate_lstm_data_fctn')
        assert generate_lstm_dataset_fctn is not None, \
            "Must provide a custom generate lstm pretraining dataset function!"

        train_data_lstm, test_data_lstm, info_lstm = generate_lstm_dataset_fctn(
            train_vae_variant['generate_lstm_dataset_kwargs'],
            segmented=True,
            segmentation_method=skewfit_variant['segmentation_method'])
        train_data_ori, test_data_ori, info_ori = generate_vae_dataset_fctn(
            train_vae_variant['generate_vae_dataset_kwargs'])

        train_datas = [train_data_lstm, train_data_ori]
        test_datas = [test_data_lstm, test_data_ori]

        if skewfit_variant.get('save_vae_data', False):
            skewfit_variant['vae_train_data'] = train_datas
            skewfit_variant['vae_test_data'] = test_datas

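# A minimal variant sketch for this pipeline, assembled only from keys the
# functions above actually read. The env id and numbers are illustrative
# assumptions, and generate_my_lstm_dataset is a hypothetical user-supplied
# function (the code asserts one must be provided):
#
#   variant = dict(
#       env_id='SawyerPushHurdle-v0',
#       skewfit_variant=dict(
#           segmentation_method='unet',
#           save_vae_data=True,
#           vae_path=None,  # None => train new VAEs
#       ),
#       train_vae_variant=dict(
#           generate_lstm_dataset_kwargs=dict(
#               env_id='SawyerPushHurdle-v0',
#               imsize=48,
#           ),
#           generate_vae_dataset_kwargs=dict(N=10000),
#           generate_lstm_data_fctn=generate_my_lstm_dataset,
#       ),
#   )
#   skewfit_full_experiment(variant)
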
def state_td3bc_experiment(variant):
    if variant.get('env_id', None):
        import gym
        import multiworld
        multiworld.register_all_envs()
        eval_env = gym.make(variant['env_id'])
        expl_env = gym.make(variant['env_id'])
    else:
        eval_env_kwargs = variant.get('eval_env_kwargs', variant['env_kwargs'])
        eval_env = variant['env_class'](**eval_env_kwargs)
        expl_env = variant['env_class'](**variant['env_kwargs'])

    observation_key = 'state_observation'
    desired_goal_key = 'state_desired_goal'
    achieved_goal_key = desired_goal_key.replace("desired", "achieved")
    es_strat = variant.get('es', 'ou')
    if es_strat == 'ou':
        es = OUStrategy(
            action_space=expl_env.action_space,
            max_sigma=variant['exploration_noise'],
            min_sigma=variant['exploration_noise'],
        )
    elif es_strat == 'gauss_eps':
        es = GaussianAndEpislonStrategy(
            action_space=expl_env.action_space,
            max_sigma=.2,
            min_sigma=.2,  # constant sigma
            epsilon=.3,
        )
    else:
        raise ValueError("invalid exploration strategy provided")
    obs_dim = expl_env.observation_space.spaces['observation'].low.size
    goal_dim = expl_env.observation_space.spaces['desired_goal'].low.size
    action_dim = expl_env.action_space.low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + goal_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + goal_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    target_qf1 = FlattenMlp(
        input_size=obs_dim + goal_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    target_qf2 = FlattenMlp(
        input_size=obs_dim + goal_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim + goal_dim,
        output_size=action_dim,
        **variant['policy_kwargs']
    )
    target_policy = TanhMlpPolicy(
        input_size=obs_dim + goal_dim,
        output_size=action_dim,
        **variant['policy_kwargs']
    )
    expl_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=es,
        policy=policy,
    )
    replay_buffer = ObsDictRelabelingBuffer(
        env=eval_env,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        achieved_goal_key=achieved_goal_key,
        **variant['replay_buffer_kwargs']
    )
    demo_train_buffer = ObsDictRelabelingBuffer(
        env=eval_env,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        achieved_goal_key=achieved_goal_key,
        max_size=variant['replay_buffer_kwargs']['max_size'],
    )
    demo_test_buffer = ObsDictRelabelingBuffer(
        env=eval_env,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        achieved_goal_key=achieved_goal_key,
        max_size=variant['replay_buffer_kwargs']['max_size'],
    )
    if variant.get('td3_bc', True):
        td3_trainer = TD3BCTrainer(
            env=expl_env,
            policy=policy,
            qf1=qf1,
            qf2=qf2,
            replay_buffer=replay_buffer,
            demo_train_buffer=demo_train_buffer,
            demo_test_buffer=demo_test_buffer,
            target_qf1=target_qf1,
            target_qf2=target_qf2,
            target_policy=target_policy,
            **variant['td3_bc_trainer_kwargs']
        )
    else:
        td3_trainer = TD3(
            policy=policy,
            qf1=qf1,
            qf2=qf2,
            target_qf1=target_qf1,
            target_qf2=target_qf2,
            target_policy=target_policy,
            **variant['td3_trainer_kwargs']
        )
    trainer = HERTrainer(td3_trainer)
    eval_path_collector = GoalConditionedPathCollector(
        eval_env,
        policy,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
    )
    expl_path_collector = GoalConditionedPathCollector(
        expl_env,
        expl_policy,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algo_kwargs']
    )

    if variant.get("save_video", True):
        if variant.get("presampled_goals", None):
            variant['image_env_kwargs']['presampled_goals'] = \
                load_local_or_remote_file(variant['presampled_goals']).item()
        image_eval_env = ImageEnv(eval_env, **variant["image_env_kwargs"])
        image_eval_path_collector = GoalConditionedPathCollector(
            image_eval_env,
            policy,
            observation_key='state_observation',
            desired_goal_key='state_desired_goal',
        )
        image_expl_env = ImageEnv(expl_env, **variant["image_env_kwargs"])
        image_expl_path_collector = GoalConditionedPathCollector(
            image_expl_env,
            expl_policy,
            observation_key='state_observation',
            desired_goal_key='state_desired_goal',
        )
        video_func = VideoSaveFunction(
            image_eval_env,
            variant,
            image_expl_path_collector,
            image_eval_path_collector,
        )
        algorithm.post_train_funcs.append(video_func)

    algorithm.to(ptu.device)
    if variant.get('load_demos', False):
        td3_trainer.load_demos()
    if variant.get('pretrain_policy', False):
        td3_trainer.pretrain_policy_with_bc()
    if variant.get('pretrain_rl', False):
        td3_trainer.pretrain_q_with_bc_data()
    algorithm.train()

def experiment(variant):
    import multiworld
    multiworld.register_all_envs()
    env = gym.make('Point2DEnv-ImageFixedGoal-v0')
    input_width, input_height = env.image_shape
    action_dim = int(np.prod(env.action_space.shape))
    cnn_params = variant['cnn_params']
    cnn_params.update(
        input_width=input_width,
        input_height=input_height,
        input_channels=3,
        output_conv_channels=True,
        output_size=None,
    )
    if variant['shared_qf_conv']:
        qf_cnn = CNN(**cnn_params)
        qf_obs_processor = nn.Sequential(qf_cnn, Flatten())
        qf1 = MlpQfWithObsProcessor(
            obs_processor=qf_obs_processor,
            output_size=1,
            input_size=action_dim + qf_cnn.conv_output_flat_size,
            **variant['qf_kwargs'])
        qf2 = MlpQfWithObsProcessor(
            obs_processor=qf_obs_processor,
            output_size=1,
            input_size=action_dim + qf_cnn.conv_output_flat_size,
            **variant['qf_kwargs'])
        target_qf_cnn = CNN(**cnn_params)
        target_qf_obs_processor = nn.Sequential(target_qf_cnn, Flatten())
        target_qf1 = MlpQfWithObsProcessor(
            obs_processor=target_qf_obs_processor,
            output_size=1,
            input_size=action_dim + qf_cnn.conv_output_flat_size,
            **variant['qf_kwargs'])
        target_qf2 = MlpQfWithObsProcessor(
            obs_processor=target_qf_obs_processor,
            output_size=1,
            input_size=action_dim + qf_cnn.conv_output_flat_size,
            **variant['qf_kwargs'])
    else:
        qf1_cnn = CNN(**cnn_params)
        cnn_output_dim = qf1_cnn.conv_output_flat_size
        qf1 = MlpQfWithObsProcessor(
            obs_processor=nn.Sequential(qf1_cnn, Flatten()),
            output_size=1,
            input_size=action_dim + cnn_output_dim,
            **variant['qf_kwargs'])
        qf2 = MlpQfWithObsProcessor(
            obs_processor=nn.Sequential(CNN(**cnn_params), Flatten()),
            output_size=1,
            input_size=action_dim + cnn_output_dim,
            **variant['qf_kwargs'])
        target_qf1 = MlpQfWithObsProcessor(
            obs_processor=nn.Sequential(CNN(**cnn_params), Flatten()),
            output_size=1,
            input_size=action_dim + cnn_output_dim,
            **variant['qf_kwargs'])
        target_qf2 = MlpQfWithObsProcessor(
            obs_processor=nn.Sequential(CNN(**cnn_params), Flatten()),
            output_size=1,
            input_size=action_dim + cnn_output_dim,
            **variant['qf_kwargs'])
    policy_cnn = CNN(**cnn_params)
    policy = TanhGaussianPolicyAdapter(
        nn.Sequential(policy_cnn, Flatten()),
        policy_cnn.conv_output_flat_size,
        action_dim,
        **variant['policy_kwargs'])
    eval_env = expl_env = env
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
        **variant['eval_path_collector_kwargs'])
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    trainer = SACTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        **variant['trainer_kwargs'])
    if variant['collection_mode'] == 'batch':
        expl_path_collector = MdpPathCollector(
            expl_env,
            policy,
            **variant['expl_path_collector_kwargs'])
        algorithm = TorchBatchRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs'])
    elif variant['collection_mode'] == 'online':
        expl_path_collector = MdpStepCollector(
            expl_env,
            policy,
            **variant['expl_path_collector_kwargs'])
        algorithm = TorchOnlineRLAlgorithm(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algo_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()

def main(training_data_dir, validation_data_dir, test_data_dir, imsize):
    if not os.path.exists(training_data_dir):
        os.makedirs(training_data_dir)
    if not os.path.exists(validation_data_dir):
        os.makedirs(validation_data_dir)
    if not os.path.exists(test_data_dir):
        os.makedirs(test_data_dir)

    backSub = cv2.createBackgroundSubtractorMOG2(history=10)

    # Register the required multiworld environments.
    multiworld.register_all_envs()
    base_env_background = gym.make('SawyerPushHurdlePuckAndRobotInvisible-v0')
    env_background = ImageEnv(base_env_background,
                              imsize=imsize,
                              init_camera=camera,
                              transpose=True,
                              normalize=True)
    env_background.reset()
    base_env_hurdle = gym.make('SawyerPushHurdlePuckInvisible-v0')
    env_hurdle = ImageEnv(base_env_hurdle,
                          imsize=imsize,
                          init_camera=camera,
                          transpose=True,
                          normalize=True)
    env_hurdle.reset()

    # Generate training, validation, and test data.
    print("Training background subtractor")
    for i in range(10):
        action = env_background.action_space.sample()
        next_obs, reward, done, info = env_background.step(action)
        image = next_obs['observation']
        image = unnormalize_image(image)
        bg_mask = backSub.apply(image, learningRate=-1)

    print("Generating training data")
    for i in range(1000):
        action = env_hurdle.action_space.sample()
        next_obs, reward, done, info = env_hurdle.step(action)
        image = next_obs['observation']
        image = unnormalize_image(image)
        fg_mask = backSub.apply(image, learningRate=0)
        fg_mask = fg_mask / fg_mask.max()
        cv2.imwrite(training_data_dir + "/rgb_" + str(i) + ".jpg", image)
        cv2.imwrite(training_data_dir + "/mask_" + str(i) + ".jpg", fg_mask)

    print("Generating validation data")
    for i in range(200):
        action = env_hurdle.action_space.sample()
        next_obs, reward, done, info = env_hurdle.step(action)
        image = next_obs['observation']
        image = unnormalize_image(image)
        fg_mask = backSub.apply(image, learningRate=0)
        fg_mask = fg_mask / fg_mask.max()
        cv2.imwrite(validation_data_dir + "/rgb_" + str(i) + ".jpg", image)
        cv2.imwrite(validation_data_dir + "/mask_" + str(i) + ".jpg", fg_mask)

    print("Generating testing data")
    for i in range(200):
        action = env_hurdle.action_space.sample()
        next_obs, reward, done, info = env_hurdle.step(action)
        image = next_obs['observation']
        image = unnormalize_image(image)
        fg_mask = backSub.apply(image, learningRate=0)
        fg_mask = fg_mask / fg_mask.max()
        cv2.imwrite(test_data_dir + "/rgb_" + str(i) + ".jpg", image)
        cv2.imwrite(test_data_dir + "/mask_" + str(i) + ".jpg", fg_mask)

    print("Completed generating data at:", training_data_dir)

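# Hypothetical invocation (directory names and image size are placeholders;
# `camera` and `unnormalize_image` are assumed to be defined at module level in
# this script):
#
#   main('data/unet/train', 'data/unet/val', 'data/unet/test', imsize=48)
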
def experiment(variant):
    import multiworld
    multiworld.register_all_envs()
    # Unwrap the TimeLimit wrapper since we manually terminate after 50 steps.
    eval_env = gym.make('SawyerPushXYZEnv-v0')
    expl_env = gym.make('SawyerPushXYZEnv-v0')
    observation_key = 'state_observation'
    desired_goal_key = 'desired_goal'
    achieved_goal_key = desired_goal_key.replace("desired", "achieved")
    replay_buffer = ObsDictRelabelingBuffer(
        env=eval_env,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        achieved_goal_key=achieved_goal_key,
        **variant['replay_buffer_kwargs']
    )
    obs_dim = eval_env.observation_space.spaces['observation'].low.size
    action_dim = eval_env.action_space.low.size
    goal_dim = eval_env.observation_space.spaces['desired_goal'].low.size
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    target_qf1 = FlattenMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    target_qf2 = FlattenMlp(
        input_size=obs_dim + action_dim + goal_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim + goal_dim,
        action_dim=action_dim,
        **variant['policy_kwargs']
    )
    eval_policy = MakeDeterministic(policy)
    trainer = SACTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        **variant['sac_trainer_kwargs']
    )
    trainer = HERTrainer(trainer)
    eval_path_collector = GoalConditionedPathCollector(
        eval_env,
        eval_policy,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
    )
    expl_path_collector = GoalConditionedPathCollector(
        expl_env,
        policy,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algo_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()

def generate_vae_dataset_from_params(
        env_class=None,
        env_kwargs=None,
        env_id=None,
        N=10000,
        test_p=0.9,
        use_cached=True,
        imsize=84,
        num_channels=1,
        show=False,
        init_camera=None,
        dataset_path=None,
        oracle_dataset=False,
        n_random_steps=100,
        vae_dataset_specific_env_kwargs=None,
        save_file_prefix=None,
):
    from multiworld.core.image_env import ImageEnv, unormalize_image
    import time

    assert oracle_dataset == True

    if env_kwargs is None:
        env_kwargs = {}
    if save_file_prefix is None:
        save_file_prefix = env_id
    if save_file_prefix is None:
        save_file_prefix = env_class.__name__
    filename = "/tmp/{}_N{}_{}_imsize{}_oracle{}.npy".format(
        save_file_prefix,
        str(N),
        init_camera.__name__ if init_camera else '',
        imsize,
        oracle_dataset,
    )
    info = {}
    if dataset_path is not None:
        filename = local_path_from_s3_or_local_path(dataset_path)
        dataset = np.load(filename)
        np.random.shuffle(dataset)
        N = dataset.shape[0]
    elif use_cached and osp.isfile(filename):
        dataset = np.load(filename)
        np.random.shuffle(dataset)
        print("loaded data from saved file", filename)
    else:
        now = time.time()
        if env_id is not None:
            import gym
            import multiworld
            multiworld.register_all_envs()
            env = gym.make(env_id)
        else:
            if vae_dataset_specific_env_kwargs is None:
                vae_dataset_specific_env_kwargs = {}
            for key, val in env_kwargs.items():
                if key not in vae_dataset_specific_env_kwargs:
                    vae_dataset_specific_env_kwargs[key] = val
            env = env_class(**vae_dataset_specific_env_kwargs)
        if not isinstance(env, ImageEnv):
            env = ImageEnv(
                env,
                imsize,
                init_camera=init_camera,
                transpose=True,
                normalize=True,
            )
        setup_pickup_image_env(env, num_presampled_goals=N)
        env.reset()
        info['env'] = env
        dataset = np.zeros((N, imsize * imsize * num_channels),
                           dtype=np.uint8)
        for i in range(N):
            img = env._presampled_goals['image_desired_goal'][i]
            dataset[i, :] = unormalize_image(img)
            if show:
                img = img.reshape(3, imsize, imsize).transpose()
                img = img[::-1, :, ::-1]
                cv2.imshow('img', img)
                cv2.waitKey(1)
                time.sleep(.2)
                # radius = input('waiting...')
        print("done making training data", filename, time.time() - now)
        np.random.shuffle(dataset)
        np.save(filename, dataset)

    n = int(N * test_p)
    train_dataset = dataset[:n, :]
    test_dataset = dataset[n:, :]
    return train_dataset, test_dataset, info

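# Example call (a sketch; 'SawyerPickupEnv-v0' is an illustrative env id, and
# only oracle_dataset=True is supported, as the assert above enforces):
#
#   train_data, test_data, info = generate_vae_dataset_from_params(
#       env_id='SawyerPickupEnv-v0',
#       N=1000,
#       num_channels=3,
#       oracle_dataset=True,
#       use_cached=False,
#   )
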
def generate_vae_dataset(variant):
    """
    If a custom VAE dataset-generation function is not provided, this function
    is used to collect the dataset for training the VAE.
    """
    import rlkit.torch.pytorch_util as ptu
    import gym
    import multiworld
    multiworld.register_all_envs()

    print("generating vae dataset with original images")
    env_class = variant.get('env_class', None)
    env_kwargs = variant.get('env_kwargs', None)
    env_id = variant.get('env_id', None)
    N = variant.get('N', 10000)
    test_p = variant.get('test_p', 0.9)
    use_cached = variant.get('use_cached', True)
    imsize = variant.get('imsize', 84)
    num_channels = variant.get('num_channels', 3)
    show = variant.get('show', False)
    init_camera = variant.get('init_camera', None)
    dataset_path = variant.get('dataset_path', None)
    oracle_dataset_using_set_to_goal = variant.get(
        'oracle_dataset_using_set_to_goal', False)
    random_rollout_data = variant.get('random_rollout_data', False)
    random_and_oracle_policy_data = variant.get(
        'random_and_oracle_policy_data', False)
    random_and_oracle_policy_data_split = variant.get(
        'random_and_oracle_policy_data_split', 0)
    policy_file = variant.get('policy_file', None)
    n_random_steps = variant.get('n_random_steps', 100)
    vae_dataset_specific_env_kwargs = variant.get(
        'vae_dataset_specific_env_kwargs', None)
    save_file_prefix = variant.get('save_file_prefix', None)
    non_presampled_goal_img_is_garbage = variant.get(
        'non_presampled_goal_img_is_garbage', None)
    tag = variant.get('tag', '')

    info = {}
    if dataset_path is not None:
        print('load vae training dataset from: ', dataset_path)
        pjhome = os.environ['PJHOME']
        dataset = np.load(osp.join(pjhome, dataset_path),
                          allow_pickle=True).item()
        if isinstance(dataset, dict):
            dataset = dataset['image_desired_goal']
        dataset = unormalize_image(dataset)
        N = dataset.shape[0]
    else:
        if env_kwargs is None:
            env_kwargs = {}
        if save_file_prefix is None:
            save_file_prefix = env_id
        if save_file_prefix is None:
            save_file_prefix = env_class.__name__
        filename = "/tmp/{}_N{}_{}_imsize{}_random_oracle_split_{}{}.npy".format(
            save_file_prefix,
            str(N),
            init_camera.__name__ if init_camera else '',
            imsize,
            random_and_oracle_policy_data_split,
            tag,
        )
        if use_cached and osp.isfile(filename):
            dataset = np.load(filename)
            print("loaded data from saved file", filename)
        else:
            now = time.time()
            if env_id is not None:
                import gym
                import multiworld
                multiworld.register_all_envs()
                env = gym.make(env_id)
            else:
                if vae_dataset_specific_env_kwargs is None:
                    vae_dataset_specific_env_kwargs = {}
                for key, val in env_kwargs.items():
                    if key not in vae_dataset_specific_env_kwargs:
                        vae_dataset_specific_env_kwargs[key] = val
                env = env_class(**vae_dataset_specific_env_kwargs)
            if not isinstance(env, ImageEnv):
                env = ImageEnv(
                    env,
                    imsize,
                    init_camera=init_camera,
                    transpose=True,
                    normalize=True,
                    non_presampled_goal_img_is_garbage=non_presampled_goal_img_is_garbage,
                )
            else:
                imsize = env.imsize
                env.non_presampled_goal_img_is_garbage = non_presampled_goal_img_is_garbage
            env.reset()
            info['env'] = env
            if random_and_oracle_policy_data:
                policy_file = load_local_or_remote_file(policy_file)
                policy = policy_file['policy']
                policy.to(ptu.device)
            if random_rollout_data:
                from rlkit.exploration_strategies.ou_strategy import OUStrategy
                policy = OUStrategy(env.action_space)

            dataset = np.zeros((N, imsize * imsize * num_channels),
                               dtype=np.uint8)
            for i in range(N):
                if random_and_oracle_policy_data:
                    num_random_steps = int(
                        N * random_and_oracle_policy_data_split)
                    if i < num_random_steps:
                        env.reset()
                        for _ in range(n_random_steps):
                            obs = env.step(env.action_space.sample())[0]
                    else:
                        obs = env.reset()
                        policy.reset()
                        for _ in range(n_random_steps):
                            policy_obs = np.hstack((
                                obs['state_observation'],
                                obs['state_desired_goal'],
                            ))
                            action, _ = policy.get_action(policy_obs)
                            obs, _, _, _ = env.step(action)
                elif oracle_dataset_using_set_to_goal:
                    print(i)
                    goal = env.sample_goal()
                    env.set_to_goal(goal)
                    obs = env._get_obs()
                elif random_rollout_data:
                    if i % n_random_steps == 0:
                        g = dict(
                            state_desired_goal=env.sample_goal_for_rollout())
                        env.set_to_goal(g)
                        policy.reset()
                        # env.reset()
                    u = policy.get_action_from_raw_action(
                        env.action_space.sample())
                    obs = env.step(u)[0]
                else:
                    print("using totally random rollouts")
                    for _ in range(n_random_steps):
                        obs = env.step(env.action_space.sample())[0]
                # NOTE yufei: this is already a normalized image, of dtype
                # np.float64.
                img = obs['image_observation']
                dataset[i, :] = unormalize_image(img)
            np.save(filename, dataset)

    n = int(N * test_p)
    train_dataset = dataset[:n, :]
    test_dataset = dataset[n:, :]
    return train_dataset, test_dataset, info

def generate_LSTM_vae_only_dataset(variant,
                                   segmented=False,
                                   segmentation_method='color'):
    from multiworld.core.image_env import ImageEnv, unormalize_image

    env_id = variant.get('env_id', None)
    N = variant.get('N', 500)
    test_p = variant.get('test_p', 0.9)
    imsize = variant.get('imsize', 48)
    num_channels = variant.get('num_channels', 3)
    init_camera = variant.get('init_camera', None)
    occlusion_prob = variant.get('occlusion_prob', 0)
    occlusion_level = variant.get('occlusion_level', 0.5)
    segmentation_kwargs = variant.get('segmentation_kwargs', {})
    if segmentation_kwargs.get('segment') is not None:
        segmented = segmentation_kwargs.get('segment')

    assert env_id is not None, 'you must provide an env id!'

    obj = 'puck-pos'
    if env_id == 'SawyerDoorHookResetFreeEnv-v1':
        obj = 'door-angle'

    pjhome = os.environ['PJHOME']
    if segmented:
        if 'unet' in segmentation_method:
            seg_name = 'seg-unet'
        else:
            seg_name = 'seg-' + segmentation_method
    else:
        seg_name = 'no-seg'

    if env_id == 'SawyerDoorHookResetFreeEnv-v1':
        seg_name += '-2'

    data_file_path = osp.join(
        pjhome, 'data/local/pre-train-lstm',
        'vae-only-{}-{}-{}-{}-{}.npy'.format(env_id, seg_name, N,
                                             occlusion_prob, occlusion_level))
    obj_state_path = osp.join(
        pjhome, 'data/local/pre-train-lstm',
        'vae-only-{}-{}-{}-{}-{}-{}.npy'.format(env_id, seg_name, N,
                                                occlusion_prob,
                                                occlusion_level, obj))
    print(data_file_path)

    if osp.exists(data_file_path):
        all_data = np.load(data_file_path)
        if len(all_data) >= N:
            print("load stored data at: ", data_file_path)
            n = int(len(all_data) * test_p)
            train_dataset = all_data[:n]
            test_dataset = all_data[n:]
            obj_states = np.load(obj_state_path)
            info = {'obj_state': obj_states}
            return train_dataset, test_dataset, info

    if segmented:
        print("generating lstm vae pretrain only dataset with segmented "
              "images using method: ", segmentation_method)
        if segmentation_method == 'unet':
            segment_func = segment_image_unet
        else:
            raise NotImplementedError
    else:
        print("generating lstm vae pretrain only dataset with original images")

    info = {}
    dataset = np.zeros((N, imsize * imsize * num_channels), dtype=np.uint8)
    imgs = []
    obj_states = None

    if env_id == 'SawyerDoorHookResetFreeEnv-v1':
        from rlkit.util.io import load_local_or_remote_file
        pjhome = os.environ['PJHOME']
        pre_sampled_goal_path = osp.join(
            pjhome, 'data/local/pre-train-vae/door_original_dataset.npy')
        goal_dict = np.load(pre_sampled_goal_path, allow_pickle=True).item()
        imgs = goal_dict['image_desired_goal']
        door_angles = goal_dict['state_desired_goal'][:, -1]
        obj_states = door_angles[:, np.newaxis]
    elif env_id == 'SawyerPickupEnvYZEasy-v0':
        from rlkit.util.io import load_local_or_remote_file
        pjhome = os.environ['PJHOME']
        pre_sampled_goal_path = osp.join(
            pjhome, 'data/local/pre-train-vae/pickup-original-dataset.npy')
        goal_dict = load_local_or_remote_file(pre_sampled_goal_path).item()
        imgs = goal_dict['image_desired_goal']
        puck_pos = goal_dict['state_desired_goal'][:, 3:]
        obj_states = puck_pos
    else:
        import gym
        import multiworld
        multiworld.register_all_envs()
        env = gym.make(env_id)
        if not isinstance(env, ImageEnv):
            env = ImageEnv(
                env,
                imsize,
                init_camera=init_camera,
                transpose=True,
                normalize=True,
            )
        env.reset()
        info['env'] = env

        puck_pos = np.zeros((N, 2), dtype=np.float64)
        for i in range(N):
            print("lstm vae pretrain only dataset generation, number: ", i)
            if env_id == 'SawyerPushHurdle-v0':
                obs, puck_p = _generate_sawyerhurdle_dataset(
                    env, return_puck_pos=True, segmented=segmented)
            elif env_id == 'SawyerPushHurdleMiddle-v0':
                obs, puck_p = _generate_sawyerhurdlemiddle_dataset(
                    env, return_puck_pos=True)
            elif env_id == 'SawyerPushNIPSEasy-v0':
                obs, puck_p = _generate_sawyerpushnipseasy_dataset(
                    env, return_puck_pos=True)
            elif env_id == 'SawyerPushHurdleResetFreeEnv-v0':
                obs, puck_p = _generate_sawyerhurldeblockresetfree_dataset(
                    env, return_puck_pos=True)
            else:
                raise NotImplementedError
            # NOTE: this is already a normalized image, of dtype np.float64.
            img = obs['image_observation']
            imgs.append(img)
            puck_pos[i] = puck_p
        obj_states = puck_pos

    # now we segment the images
    for i in range(N):
        print("segmenting image ", i)
        img = imgs[i]
        if segmented:
            dataset[i, :] = segment_func(img, normalize=False,
                                         **segmentation_kwargs)
            # manually zero out random pixels, so as to create occlusions
            p = np.random.rand()
            if p < occlusion_prob:
                mask = (np.random.uniform(low=0, high=1,
                                          size=(imsize, imsize))
                        > occlusion_level).astype(np.uint8)
                img = dataset[i].reshape(3, imsize, imsize).transpose()
                img[mask < 1] = 0
                dataset[i] = img.transpose().flatten()
        else:
            dataset[i, :] = unormalize_image(img)

    # add the trajectory dimension: batch_size x traj_len (= 1) x imlen
    dataset = dataset[:, np.newaxis, :]
    obj_states = obj_states[:, np.newaxis, :]
    info['obj_state'] = obj_states

    n = int(N * test_p)
    train_dataset = dataset[:n]
    test_dataset = dataset[n:]

    if N >= 500:
        print('save data to: ', data_file_path)
        all_data = np.concatenate([train_dataset, test_dataset], axis=0)
        np.save(data_file_path, all_data)
        np.save(obj_state_path, obj_states)

    return train_dataset, test_dataset, info

def _disentangled_her_twin_sac_experiment_v2(
        max_path_length,
        encoder_kwargs,
        disentangled_qf_kwargs,
        qf_kwargs,
        twin_sac_trainer_kwargs,
        replay_buffer_kwargs,
        policy_kwargs,
        evaluation_goal_sampling_mode,
        exploration_goal_sampling_mode,
        algo_kwargs,
        save_video=True,
        env_id=None,
        env_class=None,
        env_kwargs=None,
        observation_key='state_observation',
        desired_goal_key='state_desired_goal',
        achieved_goal_key='state_achieved_goal',
        # Video parameters
        latent_dim=2,
        save_video_kwargs=None,
        **kwargs
):
    import rlkit.samplers.rollout_functions as rf
    import rlkit.torch.pytorch_util as ptu
    from rlkit.data_management.obs_dict_replay_buffer import \
        ObsDictRelabelingBuffer
    from rlkit.torch.networks import ConcatMlp
    from rlkit.torch.sac.policies import TanhGaussianPolicy
    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm

    if save_video_kwargs is None:
        save_video_kwargs = {}
    if env_kwargs is None:
        env_kwargs = {}
    assert env_id or env_class
    if env_id:
        import gym
        import multiworld
        multiworld.register_all_envs()
        train_env = gym.make(env_id)
        eval_env = gym.make(env_id)
    else:
        eval_env = env_class(**env_kwargs)
        train_env = env_class(**env_kwargs)

    obs_dim = train_env.observation_space.spaces[observation_key].low.size
    goal_dim = train_env.observation_space.spaces[desired_goal_key].low.size
    action_dim = train_env.action_space.low.size

    encoder = ConcatMlp(
        input_size=goal_dim,
        output_size=latent_dim,
        **encoder_kwargs
    )
    qf1 = DisentangledMlpQf(
        encoder=encoder,
        preprocess_obs_dim=obs_dim,
        action_dim=action_dim,
        qf_kwargs=qf_kwargs,
        **disentangled_qf_kwargs
    )
    qf2 = DisentangledMlpQf(
        encoder=encoder,
        preprocess_obs_dim=obs_dim,
        action_dim=action_dim,
        qf_kwargs=qf_kwargs,
        **disentangled_qf_kwargs
    )
    target_qf1 = DisentangledMlpQf(
        encoder=Detach(encoder),
        preprocess_obs_dim=obs_dim,
        action_dim=action_dim,
        qf_kwargs=qf_kwargs,
        **disentangled_qf_kwargs
    )
    target_qf2 = DisentangledMlpQf(
        encoder=Detach(encoder),
        preprocess_obs_dim=obs_dim,
        action_dim=action_dim,
        qf_kwargs=qf_kwargs,
        **disentangled_qf_kwargs
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim + goal_dim,
        action_dim=action_dim,
        **policy_kwargs
    )
    replay_buffer = ObsDictRelabelingBuffer(
        env=train_env,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        achieved_goal_key=achieved_goal_key,
        **replay_buffer_kwargs
    )
    sac_trainer = SACTrainer(
        env=train_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        **twin_sac_trainer_kwargs
    )
    trainer = HERTrainer(sac_trainer)

    eval_path_collector = GoalConditionedPathCollector(
        eval_env,
        MakeDeterministic(policy),
        max_path_length,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        goal_sampling_mode=evaluation_goal_sampling_mode,
    )
    expl_path_collector = GoalConditionedPathCollector(
        train_env,
        policy,
        max_path_length,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        goal_sampling_mode=exploration_goal_sampling_mode,
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=train_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        max_path_length=max_path_length,
        **algo_kwargs,
    )
    algorithm.to(ptu.device)

    if save_video:
        save_vf_heatmap = save_video_kwargs.get('save_vf_heatmap', True)

        def v_function(obs):
            action = policy.get_actions(obs)
            obs, action = ptu.from_numpy(obs), ptu.from_numpy(action)
            return qf1(obs, action, return_individual_q_vals=True)

        add_heatmap = partial(add_heatmap_imgs_to_o_dict,
                              v_function=v_function)
        rollout_function = rf.create_rollout_function(
            rf.multitask_rollout,
            max_path_length=max_path_length,
            observation_key=observation_key,
            desired_goal_key=desired_goal_key,
            full_o_postprocess_func=add_heatmap if save_vf_heatmap else None,
        )
        img_keys = ['v_vals'] + [
            'v_vals_dim_{}'.format(dim) for dim in range(latent_dim)
        ]
        eval_video_func = get_save_video_function(
            rollout_function,
            eval_env,
            MakeDeterministic(policy),
            tag="eval",
            get_extra_imgs=partial(get_extra_imgs, img_keys=img_keys),
            **save_video_kwargs
        )
        train_video_func = get_save_video_function(
            rollout_function,
            train_env,
            policy,
            tag="train",
            get_extra_imgs=partial(get_extra_imgs, img_keys=img_keys),
            **save_video_kwargs
        )
        decoder = ConcatMlp(
            input_size=obs_dim,
            output_size=obs_dim,
            hidden_sizes=[128, 128],
        )
        decoder.to(ptu.device)
        # algorithm.post_train_funcs.append(train_decoder(variant, encoder, decoder))
        # algorithm.post_train_funcs.append(plot_encoder_function(variant, encoder))
        # algorithm.post_train_funcs.append(plot_buffer_function(
        #     save_video_period, 'state_achieved_goal'))
        # algorithm.post_train_funcs.append(plot_buffer_function(
        #     save_video_period, 'state_desired_goal'))
        algorithm.post_train_funcs.append(eval_video_func)
        algorithm.post_train_funcs.append(train_video_func)

    algorithm.train()

def _disentangled_grill_her_twin_sac_experiment(
        max_path_length,
        encoder_kwargs,
        disentangled_qf_kwargs,
        qf_kwargs,
        twin_sac_trainer_kwargs,
        replay_buffer_kwargs,
        policy_kwargs,
        vae_evaluation_goal_sampling_mode,
        vae_exploration_goal_sampling_mode,
        base_env_evaluation_goal_sampling_mode,
        base_env_exploration_goal_sampling_mode,
        algo_kwargs,
        env_id=None,
        env_class=None,
        env_kwargs=None,
        observation_key='state_observation',
        desired_goal_key='state_desired_goal',
        achieved_goal_key='state_achieved_goal',
        latent_dim=2,
        vae_wrapped_env_kwargs=None,
        vae_path=None,
        vae_n_vae_training_kwargs=None,
        vectorized=False,
        save_video=True,
        save_video_kwargs=None,
        have_no_disentangled_encoder=False,
        **kwargs
):
    if env_kwargs is None:
        env_kwargs = {}
    assert env_id or env_class
    if env_id:
        import gym
        import multiworld
        multiworld.register_all_envs()
        train_env = gym.make(env_id)
        eval_env = gym.make(env_id)
    else:
        eval_env = env_class(**env_kwargs)
        train_env = env_class(**env_kwargs)

    train_env.goal_sampling_mode = base_env_exploration_goal_sampling_mode
    eval_env.goal_sampling_mode = base_env_evaluation_goal_sampling_mode

    if vae_path:
        vae = load_local_or_remote_file(vae_path)
    else:
        vae = get_n_train_vae(latent_dim=latent_dim,
                              env=eval_env,
                              **vae_n_vae_training_kwargs)

    train_env = VAEWrappedEnv(train_env,
                              vae,
                              imsize=train_env.imsize,
                              **vae_wrapped_env_kwargs)
    eval_env = VAEWrappedEnv(eval_env,
                             vae,
                             imsize=train_env.imsize,
                             **vae_wrapped_env_kwargs)

    obs_dim = train_env.observation_space.spaces[observation_key].low.size
    goal_dim = train_env.observation_space.spaces[desired_goal_key].low.size
    action_dim = train_env.action_space.low.size

    encoder = FlattenMlp(input_size=obs_dim,
                         output_size=latent_dim,
                         **encoder_kwargs)

    def make_qf():
        if have_no_disentangled_encoder:
            return FlattenMlp(
                input_size=obs_dim + goal_dim + action_dim,
                output_size=1,
                **qf_kwargs,
            )
        else:
            return DisentangledMlpQf(goal_processor=encoder,
                                     preprocess_obs_dim=obs_dim,
                                     action_dim=action_dim,
                                     qf_kwargs=qf_kwargs,
                                     vectorized=vectorized,
                                     **disentangled_qf_kwargs)

    qf1 = make_qf()
    qf2 = make_qf()
    target_qf1 = make_qf()
    target_qf2 = make_qf()
    policy = TanhGaussianPolicy(obs_dim=obs_dim + goal_dim,
                                action_dim=action_dim,
                                **policy_kwargs)
    replay_buffer = ObsDictRelabelingBuffer(
        env=train_env,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        achieved_goal_key=achieved_goal_key,
        vectorized=vectorized,
        **replay_buffer_kwargs)
    sac_trainer = SACTrainer(env=train_env,
                             policy=policy,
                             qf1=qf1,
                             qf2=qf2,
                             target_qf1=target_qf1,
                             target_qf2=target_qf2,
                             **twin_sac_trainer_kwargs)
    trainer = HERTrainer(sac_trainer)
    eval_path_collector = VAEWrappedEnvPathCollector(
        eval_env,
        MakeDeterministic(policy),
        max_path_length,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        goal_sampling_mode=vae_evaluation_goal_sampling_mode,
    )
    expl_path_collector = VAEWrappedEnvPathCollector(
        train_env,
        policy,
        max_path_length,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        goal_sampling_mode=vae_exploration_goal_sampling_mode,
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=train_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        max_path_length=max_path_length,
        **algo_kwargs,
    )
    algorithm.to(ptu.device)

    if save_video:
        save_vf_heatmap = save_video_kwargs.get('save_vf_heatmap', True)
        if have_no_disentangled_encoder:

            def v_function(obs):
                action = policy.get_actions(obs)
                obs, action = ptu.from_numpy(obs), ptu.from_numpy(action)
                return qf1(obs, action)

            add_heatmap = partial(add_heatmap_img_to_o_dict,
                                  v_function=v_function)
        else:

            def v_function(obs):
                action = policy.get_actions(obs)
                obs, action = ptu.from_numpy(obs), ptu.from_numpy(action)
                return qf1(obs, action, return_individual_q_vals=True)

            add_heatmap = partial(
                add_heatmap_imgs_to_o_dict,
                v_function=v_function,
                vectorized=vectorized,
            )
        rollout_function = rf.create_rollout_function(
            rf.multitask_rollout,
            max_path_length=max_path_length,
            observation_key=observation_key,
            desired_goal_key=desired_goal_key,
            full_o_postprocess_func=add_heatmap if save_vf_heatmap else None,
        )
        img_keys = ['v_vals'] + [
            'v_vals_dim_{}'.format(dim) for dim in range(latent_dim)
        ]
        eval_video_func = get_save_video_function(
            rollout_function,
            eval_env,
            MakeDeterministic(policy),
            get_extra_imgs=partial(get_extra_imgs, img_keys=img_keys),
            tag="eval",
            **save_video_kwargs)
        train_video_func = get_save_video_function(
            rollout_function,
            train_env,
            policy,
            get_extra_imgs=partial(get_extra_imgs, img_keys=img_keys),
            tag="train",
            **save_video_kwargs)
        algorithm.post_train_funcs.append(eval_video_func)
        algorithm.post_train_funcs.append(train_video_func)

    algorithm.train()

def her_sac_experiment( max_path_length, qf_kwargs, twin_sac_trainer_kwargs, replay_buffer_kwargs, policy_kwargs, evaluation_goal_sampling_mode, exploration_goal_sampling_mode, algo_kwargs, save_video=True, env_id=None, env_class=None, env_kwargs=None, observation_key='state_observation', desired_goal_key='state_desired_goal', achieved_goal_key='state_achieved_goal', # Video parameters save_video_kwargs=None, exploration_policy_kwargs=None, **kwargs ): if exploration_policy_kwargs is None: exploration_policy_kwargs = {} import rlkit.samplers.rollout_functions as rf import rlkit.torch.pytorch_util as ptu from rlkit.data_management.obs_dict_replay_buffer import \ ObsDictRelabelingBuffer from rlkit.torch.networks import ConcatMlp from rlkit.torch.sac.policies import TanhGaussianPolicy from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm if not save_video_kwargs: save_video_kwargs = {} if env_kwargs is None: env_kwargs = {} assert env_id or env_class if env_id: import gym import multiworld multiworld.register_all_envs() train_env = gym.make(env_id) eval_env = gym.make(env_id) else: eval_env = env_class(**env_kwargs) train_env = env_class(**env_kwargs) obs_dim = ( train_env.observation_space.spaces[observation_key].low.size + train_env.observation_space.spaces[desired_goal_key].low.size ) action_dim = train_env.action_space.low.size qf1 = ConcatMlp( input_size=obs_dim + action_dim, output_size=1, **qf_kwargs ) qf2 = ConcatMlp( input_size=obs_dim + action_dim, output_size=1, **qf_kwargs ) target_qf1 = ConcatMlp( input_size=obs_dim + action_dim, output_size=1, **qf_kwargs ) target_qf2 = ConcatMlp( input_size=obs_dim + action_dim, output_size=1, **qf_kwargs ) policy = TanhGaussianPolicy( obs_dim=obs_dim, action_dim=action_dim, **policy_kwargs ) replay_buffer = ObsDictRelabelingBuffer( env=train_env, observation_key=observation_key, desired_goal_key=desired_goal_key, achieved_goal_key=achieved_goal_key, **replay_buffer_kwargs ) trainer = SACTrainer( env=train_env, policy=policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, **twin_sac_trainer_kwargs ) trainer = HERTrainer(trainer) eval_path_collector = GoalConditionedPathCollector( eval_env, MakeDeterministic(policy), max_path_length, observation_key=observation_key, desired_goal_key=desired_goal_key, goal_sampling_mode=evaluation_goal_sampling_mode, ) exploration_policy = create_exploration_policy( train_env, policy, **exploration_policy_kwargs) expl_path_collector = GoalConditionedPathCollector( train_env, exploration_policy, max_path_length, observation_key=observation_key, desired_goal_key=desired_goal_key, goal_sampling_mode=exploration_goal_sampling_mode, ) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=train_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, max_path_length=max_path_length, **algo_kwargs ) algorithm.to(ptu.device) if save_video: rollout_function = rf.create_rollout_function( rf.multitask_rollout, max_path_length=max_path_length, observation_key=observation_key, desired_goal_key=desired_goal_key, return_dict_obs=True, ) eval_video_func = get_save_video_function( rollout_function, eval_env, MakeDeterministic(policy), tag="eval", **save_video_kwargs ) train_video_func = get_save_video_function( rollout_function, train_env, exploration_policy, tag="expl", **save_video_kwargs ) # algorithm.post_train_funcs.append(plot_buffer_function( # save_video_period, 'state_achieved_goal')) 
# algorithm.post_train_funcs.append(plot_buffer_function( # save_video_period, 'state_desired_goal')) algorithm.post_train_funcs.append(eval_video_func) algorithm.post_train_funcs.append(train_video_func) algorithm.train()
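# Example invocation of her_sac_experiment (a minimal sketch; every kwarg
# value below is an illustrative assumption, not a setting taken from this
# codebase):
if __name__ == '__main__':
    her_sac_experiment(
        max_path_length=50,
        qf_kwargs=dict(hidden_sizes=[400, 300]),
        twin_sac_trainer_kwargs=dict(discount=0.99),
        replay_buffer_kwargs=dict(
            max_size=int(1e6),
            fraction_goals_rollout_goals=0.2,
            fraction_goals_env_goals=0.0,
        ),
        policy_kwargs=dict(hidden_sizes=[400, 300]),
        evaluation_goal_sampling_mode='env',
        exploration_goal_sampling_mode='env',
        algo_kwargs=dict(
            batch_size=128,
            num_epochs=100,
            num_eval_steps_per_epoch=500,
            num_expl_steps_per_train_loop=500,
            num_trains_per_train_loop=500,
            min_num_steps_before_training=1000,
        ),
        env_id='SawyerPushNIPSEasy-v0',  # any registered multiworld env id
        save_video=False,
    )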
self.hand_and_puck_space = Box(hand_and_puck_low, hand_and_puck_high, dtype=np.float32) self.observation_space = Dict([ ('observation', self.hand_and_puck_space), ('desired_goal', self.hand_and_puck_space), ('achieved_goal', self.hand_and_puck_space), ('state_observation', self.hand_and_puck_space), ('state_desired_goal', self.hand_and_puck_space), ('state_achieved_goal', self.hand_and_puck_space), ('proprio_observation', self.hand_space), ('proprio_desired_goal', self.hand_space), ('proprio_achieved_goal', self.hand_space), ]) def step(self, action): delta_z = self.hand_z_position - self.data.mocap_pos[0, 2] action = np.hstack((action, delta_z)) return super().step(action) if __name__ == '__main__': register_all_envs() env = gym.make('SawyerPushAndReachArenaEnv-v0', goal_type='puck', dense_reward=True, task_agnostic=False) for i in range(100): # env.reset_to_new_start_state(start_pos=[.2, .8, 0.07, .1, 0.6]) env.reset_to_new_start_state(start_pos=np.random.uniform(env.goal_low, env.goal_high)) env.set_goal(np.random.uniform(env.goal_low, env.goal_high)) for _ in range(10): env.step(np.random.uniform([-1, -1], [1, 1])) env.render()
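# A minimal sketch of consuming the Dict observation defined above (assumes
# `env` was created as in the __main__ block): the L2 distance between the
# achieved- and desired-goal keys is the quantity a dense reward is
# typically built from.
if __name__ == '__main__':
    obs, reward, done, info = env.step(env.action_space.sample())
    goal_distance = np.linalg.norm(
        obs['state_achieved_goal'] - obs['state_desired_goal'])
    print('distance to goal:', goal_distance)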
def getdata(variant):
    skewfit_variant = variant['skewfit_variant']
    print('-------------------------------')
    skewfit_preprocess_variant(skewfit_variant)
    skewfit_variant['render'] = True
    vae_environment = get_envs(skewfit_variant)
    print('done loading vae_env')
    env_class = variant.get('env_class', None)
    env_kwargs = variant.get('env_kwargs', None)
    env_id = variant.get('env_id', None)
    N = variant.get('N', 10000)
    test_p = variant.get('test_p', 0.9)
    use_cached = variant.get('use_cached', True)
    imsize = variant.get('imsize', 84)
    num_channels = variant.get('num_channels', 3)
    show = variant.get('show', False)
    init_camera = variant.get('init_camera', None)
    dataset_path = variant.get('dataset_path', None)
    oracle_dataset_using_set_to_goal = variant.get(
        'oracle_dataset_using_set_to_goal', False)
    random_rollout_data = variant.get('random_rollout_data', False)
    random_and_oracle_policy_data = variant.get(
        'random_and_oracle_policy_data', False)
    random_and_oracle_policy_data_split = variant.get(
        'random_and_oracle_policy_data_split', 0)
    policy_file = variant.get('policy_file', None)
    n_random_steps = variant.get('n_random_steps', 100)
    vae_dataset_specific_env_kwargs = variant.get(
        'vae_dataset_specific_env_kwargs', None)
    save_file_prefix = variant.get('save_file_prefix', None)
    non_presampled_goal_img_is_garbage = variant.get(
        'non_presampled_goal_img_is_garbage', None)
    tag = variant.get('tag', '')
    import cv2
    from multiworld.core.image_env import ImageEnv, unormalize_image
    import rlkit.torch.pytorch_util as ptu
    info = {}
    if dataset_path is not None:
        dataset = load_local_or_remote_file(dataset_path)
        N = dataset.shape[0]
    else:
        if env_kwargs is None:
            env_kwargs = {}
        if save_file_prefix is None:
            save_file_prefix = env_id
        if save_file_prefix is None:
            save_file_prefix = env_class.__name__
        filename = "/tmp/{}_N{}_{}_imsize{}_random_oracle_split_{}{}.npy".format(
            save_file_prefix,
            str(N),
            init_camera.__name__ if init_camera else '',
            imsize,
            random_and_oracle_policy_data_split,
            tag,
        )
        now = time.time()
        if env_id is not None:
            import gym
            import multiworld
            multiworld.register_all_envs()
            env = gym.make(env_id)
        else:
            if vae_dataset_specific_env_kwargs is None:
                vae_dataset_specific_env_kwargs = {}
            for key, val in env_kwargs.items():
                if key not in vae_dataset_specific_env_kwargs:
                    vae_dataset_specific_env_kwargs[key] = val
            env = env_class(**vae_dataset_specific_env_kwargs)
        if not isinstance(env, ImageEnv):
            print("wrapping env in ImageEnv")
            env = ImageEnv(
                env,
                imsize,
                init_camera=init_camera,
                transpose=True,
                normalize=True,
                non_presampled_goal_img_is_garbage=(
                    non_presampled_goal_img_is_garbage),
            )
        else:
            imsize = env.imsize
            env.non_presampled_goal_img_is_garbage = (
                non_presampled_goal_img_is_garbage)
        env.reset()
        info['env'] = env
        if random_and_oracle_policy_data:
            policy_file = load_local_or_remote_file(policy_file)
            policy = policy_file['policy']
            policy.to(ptu.device)
        if random_rollout_data:
            from rlkit.exploration_strategies.ou_strategy import OUStrategy
            policy = OUStrategy(env.action_space)
        dataset = np.zeros((N, imsize * imsize * num_channels),
                           dtype=np.uint8)
        for i in range(10):
            NP = []
            print(i, 'th step')
            # First image: the sampled goal state, plus its encoding and
            # the VAE reconstruction.
            goal = env.sample_goal()
            env.set_to_goal(goal)
            obs = env._get_obs()
            img_1 = obs['image_observation']
            img_1 = img_1.reshape(3, imsize, imsize).transpose()
            NP.append(img_1)
            if i % 3 == 0:
                cv2.imshow('img1', img_1)
                cv2.waitKey(1)
            encoded_1 = vae_environment._get_encoded(
                obs['image_observation'])
            print(encoded_1)
            NP.append(encoded_1)
            img_1_reconstruct = vae_environment._get_img(
                encoded_1).transpose()
            NP.append(img_1_reconstruct)
            if i % 3 == 0:
                cv2.imshow('img1_reconstruction', img_1_reconstruct)
                cv2.waitKey(1)
            # Second image: a new state generated from the same goal,
            # together with the instruction describing the transition.
            env.reset()
            instr = env.generate_new_state(goal)
            if i % 3 == 0:
                print(instr)
            obs = env._get_obs()
            img_2 = obs['image_observation']
            img_2 = img_2.reshape(3, imsize, imsize).transpose()
            NP.append(img_2)
            if i % 3 == 0:
                cv2.imshow('img2', img_2)
                cv2.waitKey(1)
            encoded_2 = vae_environment._get_encoded(
                obs['image_observation'])
            NP.append(encoded_2)
            img_2_reconstruct = vae_environment._get_img(
                encoded_2).transpose()
            NP.append(img_2_reconstruct)
            NP.append(instr)
            if i % 3 == 0:
                cv2.imshow('img2_reconstruct', img_2_reconstruct)
                cv2.waitKey(1)
            NP = np.array(NP)
            name = "/home/xiaomin/Downloads/IFIG_DATA_1/" + str(i) + ".npy"
            np.save(open(name, 'wb'), NP)
        # Get the intermediate encode/decode functions: serialize the
        # encoder and decoder closures with dill so they can be reloaded
        # later without the full environment, then round-trip them once as
        # a sanity check.
        import dill
        import pickle
        get_encoded = dill.dumps(vae_environment._get_encoded)
        with open(
                "/home/xiaomin/Downloads/IFIG_encoder_decoder/get_encoded_1000_epochs_one_puck.txt",
                "wb") as fp:
            pickle.dump(get_encoded, fp)
        with open(
                "/home/xiaomin/Downloads/IFIG_encoder_decoder/get_encoded_1000_epochs_one_puck.txt",
                "rb") as fp:
            b = pickle.load(fp)
        func_get_encoded = dill.loads(b)
        encoded = func_get_encoded(obs['image_observation'])
        print(encoded)
        print('------------------------------')
        get_img = dill.dumps(vae_environment._get_img)
        with open(
                "/home/xiaomin/Downloads/IFIG_encoder_decoder/get_img_1000_epochs_one_puck.txt",
                "wb") as fp:
            pickle.dump(get_img, fp)
        with open(
                "/home/xiaomin/Downloads/IFIG_encoder_decoder/get_img_1000_epochs_one_puck.txt",
                "rb") as fp:
            c = pickle.load(fp)
        func_get_img = dill.loads(c)
        img_1_reconstruct = func_get_img(encoded).transpose()
        print(img_1_reconstruct)
        cv2.imshow('test', img_1_reconstruct)
        cv2.waitKey(0)
        print("done making training data", filename, time.time() - now)
        # NOTE: `dataset` was allocated above but never filled by this
        # loop, so the file saved here contains only zeros.
        np.save(filename, dataset)
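# Sketch of reading one of the per-sample files that getdata writes (the
# hard-coded path is the one used above; the element order matches the
# NP.append calls: img_1, encoded_1, img_1_reconstruct, img_2, encoded_2,
# img_2_reconstruct, instr):
sample = np.load('/home/xiaomin/Downloads/IFIG_DATA_1/0.npy',
                 allow_pickle=True)
img_1, encoded_1, img_1_reconstruct = sample[0], sample[1], sample[2]
img_2, encoded_2, img_2_reconstruct, instr = (sample[3], sample[4],
                                              sample[5], sample[6])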
def _use_disentangled_encoder_distance(
        max_path_length,
        encoder_kwargs,
        disentangled_qf_kwargs,
        qf_kwargs,
        sac_trainer_kwargs,
        replay_buffer_kwargs,
        policy_kwargs,
        evaluation_goal_sampling_mode,
        exploration_goal_sampling_mode,
        algo_kwargs,
        env_id=None,
        env_class=None,
        env_kwargs=None,
        encoder_key_prefix='encoder',
        encoder_input_prefix='state',
        latent_dim=2,
        reward_mode=EncoderWrappedEnv.ENCODER_DISTANCE_REWARD,
        # Video parameters
        save_video=True,
        save_video_kwargs=None,
        save_vf_heatmap=True,
        **kwargs):
    if save_video_kwargs is None:
        save_video_kwargs = {}
    if env_kwargs is None:
        env_kwargs = {}
    assert env_id or env_class
    vectorized = (
        reward_mode == EncoderWrappedEnv.VECTORIZED_ENCODER_DISTANCE_REWARD)
    if env_id:
        import gym
        import multiworld
        multiworld.register_all_envs()
        raw_train_env = gym.make(env_id)
        raw_eval_env = gym.make(env_id)
    else:
        raw_eval_env = env_class(**env_kwargs)
        raw_train_env = env_class(**env_kwargs)
    raw_train_env.goal_sampling_mode = exploration_goal_sampling_mode
    raw_eval_env.goal_sampling_mode = evaluation_goal_sampling_mode

    raw_obs_dim = (
        raw_train_env.observation_space.spaces['state_observation'].low.size)
    action_dim = raw_train_env.action_space.low.size

    encoder = ConcatMlp(input_size=raw_obs_dim,
                        output_size=latent_dim,
                        **encoder_kwargs)
    # NOTE: the MLP encoder above is immediately overridden with an identity
    # map, so the "encoded" observation is just the raw state and latent_dim
    # is effectively ignored by the encoder itself.
    encoder = Identity()
    encoder.input_size = raw_obs_dim
    encoder.output_size = raw_obs_dim

    np_encoder = EncoderFromNetwork(encoder)
    train_env = EncoderWrappedEnv(
        raw_train_env,
        np_encoder,
        encoder_input_prefix,
        key_prefix=encoder_key_prefix,
        reward_mode=reward_mode,
    )
    eval_env = EncoderWrappedEnv(
        raw_eval_env,
        np_encoder,
        encoder_input_prefix,
        key_prefix=encoder_key_prefix,
        reward_mode=reward_mode,
    )
    observation_key = '{}_observation'.format(encoder_key_prefix)
    desired_goal_key = '{}_desired_goal'.format(encoder_key_prefix)
    achieved_goal_key = '{}_achieved_goal'.format(encoder_key_prefix)
    obs_dim = train_env.observation_space.spaces[observation_key].low.size
    goal_dim = train_env.observation_space.spaces[desired_goal_key].low.size

    def make_qf():
        return DisentangledMlpQf(encoder=encoder,
                                 preprocess_obs_dim=obs_dim,
                                 action_dim=action_dim,
                                 qf_kwargs=qf_kwargs,
                                 vectorized=vectorized,
                                 **disentangled_qf_kwargs)

    qf1 = make_qf()
    qf2 = make_qf()
    target_qf1 = make_qf()
    target_qf2 = make_qf()
    policy = TanhGaussianPolicy(obs_dim=obs_dim + goal_dim,
                                action_dim=action_dim,
                                **policy_kwargs)

    replay_buffer = ObsDictRelabelingBuffer(
        env=train_env,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        achieved_goal_key=achieved_goal_key,
        vectorized=vectorized,
        **replay_buffer_kwargs)
    sac_trainer = SACTrainer(env=train_env,
                             policy=policy,
                             qf1=qf1,
                             qf2=qf2,
                             target_qf1=target_qf1,
                             target_qf2=target_qf2,
                             **sac_trainer_kwargs)
    trainer = HERTrainer(sac_trainer)

    eval_path_collector = GoalConditionedPathCollector(
        eval_env,
        MakeDeterministic(policy),
        max_path_length,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        goal_sampling_mode='env',
    )
    expl_path_collector = GoalConditionedPathCollector(
        train_env,
        policy,
        max_path_length,
        observation_key=observation_key,
        desired_goal_key=desired_goal_key,
        goal_sampling_mode='env',
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=train_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        max_path_length=max_path_length,
        **algo_kwargs)
    algorithm.to(ptu.device)

    if save_video:

        def v_function(obs):
            action = policy.get_actions(obs)
            obs, action = ptu.from_numpy(obs), ptu.from_numpy(action)
            return qf1(obs, action, return_individual_q_vals=True)

        add_heatmap = partial(
            add_heatmap_imgs_to_o_dict,
            v_function=v_function,
            vectorized=vectorized,
        )
        rollout_function = rf.create_rollout_function(
            rf.multitask_rollout,
            max_path_length=max_path_length,
            observation_key=observation_key,
            desired_goal_key=desired_goal_key,
            full_o_postprocess_func=add_heatmap if save_vf_heatmap else None,
        )
        img_keys = ['v_vals'] + [
            'v_vals_dim_{}'.format(dim) for dim in range(latent_dim)
        ]
        eval_video_func = get_save_video_function(rollout_function,
                                                  eval_env,
                                                  MakeDeterministic(policy),
                                                  get_extra_imgs=partial(
                                                      get_extra_imgs,
                                                      img_keys=img_keys),
                                                  tag="eval",
                                                  **save_video_kwargs)
        train_video_func = get_save_video_function(rollout_function,
                                                   train_env,
                                                   policy,
                                                   get_extra_imgs=partial(
                                                       get_extra_imgs,
                                                       img_keys=img_keys),
                                                   tag="train",
                                                   **save_video_kwargs)
        algorithm.post_train_funcs.append(eval_video_func)
        algorithm.post_train_funcs.append(train_video_func)
    algorithm.train()
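# The encoder key prefix determines which dict keys the replay buffer and
# path collectors read: with encoder_key_prefix='encoder' the relevant keys
# are 'encoder_observation', 'encoder_desired_goal', and
# 'encoder_achieved_goal'. Example call (a sketch; all kwarg values below
# are illustrative assumptions):
_use_disentangled_encoder_distance(
    max_path_length=100,
    encoder_kwargs=dict(hidden_sizes=[64, 64]),
    disentangled_qf_kwargs=dict(),
    qf_kwargs=dict(hidden_sizes=[400, 300]),
    sac_trainer_kwargs=dict(discount=0.99),
    replay_buffer_kwargs=dict(max_size=int(1e6)),
    policy_kwargs=dict(hidden_sizes=[400, 300]),
    evaluation_goal_sampling_mode='env',
    exploration_goal_sampling_mode='env',
    algo_kwargs=dict(batch_size=128, num_epochs=50),
    env_id='Point2DLargeEnv-v1',  # assumed multiworld env id
    latent_dim=2,
)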
# 10/09/19 Tim Liu changed variable bit_size to num_bits
# 10/14/19 Tim Liu began modifying for SawyerReach environment
# 10/16/19 Tim Liu added take_action
# 10/16/19 Tim Liu changed num_epochs to 150, STEPS_PER to 50, and done_threshold to -0.01
# 10/21/19 Tim Liu added rendering argument

import numpy as np
import tensorflow as tf
# import tensorflow.contrib.slim as slim
import tf_slim as slim
from buffers import Buffer
from matplotlib import pyplot as plt
import multiworld
import gym

multiworld.register_all_envs()  # register the multiworld environments

# tf.app was removed in TF2; with tf_slim, go through the compat.v1 flags API.
flags = tf.compat.v1.app.flags
flags.DEFINE_string(
    "HER", "None",
    "Strategy for choosing HER goals. Possible values are: future, final, "
    "episode, or None. If None, HER is not used.")
flags.DEFINE_integer("num_epochs", 150, "Number of epochs to run training for")
flags.DEFINE_integer("log_interval", 5, "Epochs between printing log info")
flags.DEFINE_integer("opt_steps", 40, "Optimization steps in each epoch")
flags.DEFINE_integer("steps_per_episode", 50, "Number of steps per episode")
flags.DEFINE_bool("render", False, "Render the Sawyer arm")
FLAGS = flags.FLAGS

# ************ Define global variables and initialize ************ #
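# Minimal sketch of how these flags drive a rollout (the env id is an
# assumption for the SawyerReach setup described in the changelog above;
# FLAGS values resolve once flags have been parsed, e.g. inside main):
def random_rollout():
    env = gym.make('SawyerReachXYZEnv-v0')
    env.reset()
    for _ in range(FLAGS.steps_per_episode):
        obs, reward, done, info = env.step(env.action_space.sample())
        if FLAGS.render:
            env.render()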
def get_envs(variant): from multiworld.core.image_env import ImageEnv from rlkit.envs.vae_wrapper import VAEWrappedEnv from rlkit.util.io import load_local_or_remote_file render = variant.get('render', False) vae_path = variant.get("vae_path", None) reward_params = variant.get("reward_params", dict()) init_camera = variant.get("init_camera", None) do_state_exp = variant.get("do_state_exp", False) presample_goals = variant.get('presample_goals', False) presample_image_goals_only = variant.get('presample_image_goals_only', False) presampled_goals_path = variant.get('presampled_goals_path', None) vae = load_local_or_remote_file( vae_path) if type(vae_path) is str else vae_path if 'env_id' in variant: import gym import multiworld multiworld.register_all_envs() env = gym.make(variant['env_id']) else: env = variant["env_class"](**variant['env_kwargs']) if not do_state_exp: if isinstance(env, ImageEnv): image_env = env else: image_env = ImageEnv( env, variant.get('imsize'), init_camera=init_camera, transpose=True, normalize=True, ) if presample_goals: """ This will fail for online-parallel as presampled_goals will not be serialized. Also don't use this for online-vae. """ if presampled_goals_path is None: image_env.non_presampled_goal_img_is_garbage = True vae_env = VAEWrappedEnv(image_env, vae, imsize=image_env.imsize, decode_goals=render, render_goals=render, render_rollouts=render, reward_params=reward_params, **variant.get('vae_wrapped_env_kwargs', {})) presampled_goals = variant['generate_goal_dataset_fctn']( env=vae_env, env_id=variant.get('env_id', None), **variant['goal_generation_kwargs']) del vae_env else: presampled_goals = load_local_or_remote_file( presampled_goals_path).item() del image_env image_env = ImageEnv(env, variant.get('imsize'), init_camera=init_camera, transpose=True, normalize=True, presampled_goals=presampled_goals, **variant.get('image_env_kwargs', {})) vae_env = VAEWrappedEnv(image_env, vae, imsize=image_env.imsize, decode_goals=render, render_goals=render, render_rollouts=render, reward_params=reward_params, presampled_goals=presampled_goals, **variant.get('vae_wrapped_env_kwargs', {})) print("Presampling all goals only") else: vae_env = VAEWrappedEnv(image_env, vae, imsize=image_env.imsize, decode_goals=render, render_goals=render, render_rollouts=render, reward_params=reward_params, **variant.get('vae_wrapped_env_kwargs', {})) if presample_image_goals_only: presampled_goals = variant['generate_goal_dataset_fctn']( image_env=vae_env.wrapped_env, **variant['goal_generation_kwargs']) image_env.set_presampled_goals(presampled_goals) print("Presampling image goals only") else: print("Not using presampled goals") env = vae_env return env
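# Example variant for get_envs (a sketch; the env id, image size, and VAE
# path are placeholder assumptions -- vae_path must point to a trained VAE
# loadable by load_local_or_remote_file):
variant = dict(
    env_id='SawyerPushNIPSEasy-v0',
    imsize=48,
    init_camera=None,
    vae_path='path/to/trained_vae.pkl',
    reward_params=dict(type='latent_distance'),
    render=False,
    do_state_exp=False,
)
env = get_envs(variant)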
def generate_vae_dataset(variant):
    print(variant)
    from tqdm import tqdm
    env_class = variant.get('env_class', None)
    env_kwargs = variant.get('env_kwargs', None)
    env_id = variant.get('env_id', None)
    N = variant.get('N', 10000)
    batch_size = variant.get('batch_size', 128)
    test_p = variant.get('test_p', 0.9)
    use_cached = variant.get('use_cached', True)
    imsize = variant.get('imsize', 84)
    num_channels = variant.get('num_channels', 3)
    show = variant.get('show', False)
    init_camera = variant.get('init_camera', None)
    dataset_path = variant.get('dataset_path', None)
    augment_data = variant.get('augment_data', False)
    data_filter_fn = variant.get('data_filter_fn', lambda x: x)
    delete_after_loading = variant.get('delete_after_loading', False)
    oracle_dataset_using_set_to_goal = variant.get(
        'oracle_dataset_using_set_to_goal', False)
    random_rollout_data = variant.get('random_rollout_data', False)
    random_rollout_data_set_to_goal = variant.get(
        'random_rollout_data_set_to_goal', True)
    random_and_oracle_policy_data = variant.get(
        'random_and_oracle_policy_data', False)
    random_and_oracle_policy_data_split = variant.get(
        'random_and_oracle_policy_data_split', 0)
    policy_file = variant.get('policy_file', None)
    n_random_steps = variant.get('n_random_steps', 100)
    vae_dataset_specific_env_kwargs = variant.get(
        'vae_dataset_specific_env_kwargs', None)
    save_file_prefix = variant.get('save_file_prefix', None)
    non_presampled_goal_img_is_garbage = variant.get(
        'non_presampled_goal_img_is_garbage', None)
    conditional_vae_dataset = variant.get('conditional_vae_dataset', False)
    use_env_labels = variant.get('use_env_labels', False)
    use_linear_dynamics = variant.get('use_linear_dynamics', False)
    enviorment_dataset = variant.get('enviorment_dataset', False)
    save_trajectories = variant.get('save_trajectories', False)
    save_trajectories = (
        save_trajectories or use_linear_dynamics or conditional_vae_dataset)
    tag = variant.get('tag', '')

    assert N % n_random_steps == 0, "Fix N/horizon or dataset generation will fail"

    from multiworld.core.image_env import ImageEnv, unormalize_image
    import rlkit.torch.pytorch_util as ptu
    from rlkit.util.io import load_local_or_remote_file
    from rlkit.data_management.dataset import (
        TrajectoryDataset, ImageObservationDataset,
        InitialObservationDataset, EnvironmentDataset,
        ConditionalDynamicsDataset, InitialObservationNumpyDataset,
        InfiniteBatchLoader, InitialObservationNumpyJitteringDataset)
    info = {}
    use_test_dataset = False
    if dataset_path is not None:
        if type(dataset_path) == str:
            dataset = load_local_or_remote_file(
                dataset_path, delete_after_loading=delete_after_loading)
            dataset = dataset.item()
            N = dataset['observations'].shape[0] * dataset[
                'observations'].shape[1]
            n_random_steps = dataset['observations'].shape[1]
        if isinstance(dataset_path, list):
            dataset = concatenate_datasets(dataset_path)
            N = dataset['observations'].shape[0] * dataset[
                'observations'].shape[1]
            n_random_steps = dataset['observations'].shape[1]
        if isinstance(dataset_path, dict):
            if type(dataset_path['train']) == str:
                dataset = load_local_or_remote_file(
                    dataset_path['train'],
                    delete_after_loading=delete_after_loading)
                dataset = dataset.item()
            elif isinstance(dataset_path['train'], list):
                dataset = concatenate_datasets(dataset_path['train'])
            if type(dataset_path['test']) == str:
                test_dataset = load_local_or_remote_file(
                    dataset_path['test'],
                    delete_after_loading=delete_after_loading)
                test_dataset = test_dataset.item()
            elif isinstance(dataset_path['test'], list):
                test_dataset = concatenate_datasets(dataset_path['test'])
            N = dataset['observations'].shape[0] * dataset[
                'observations'].shape[1]
            n_random_steps = dataset['observations'].shape[1]
            use_test_dataset = True
    else:
        if env_kwargs is None:
            env_kwargs = {}
        if save_file_prefix is None:
            save_file_prefix = env_id
        if save_file_prefix is None:
            save_file_prefix = env_class.__name__
        filename = "/tmp/{}_N{}_{}_imsize{}_random_oracle_split_{}{}.npy".format(
            save_file_prefix,
            str(N),
            init_camera.__name__
            if init_camera and hasattr(init_camera, '__name__') else '',
            imsize,
            random_and_oracle_policy_data_split,
            tag,
        )
        if use_cached and osp.isfile(filename):
            dataset = load_local_or_remote_file(
                filename, delete_after_loading=delete_after_loading)
            if conditional_vae_dataset:
                dataset = dataset.item()
            print("loaded data from saved file", filename)
        else:
            now = time.time()
            if env_id is not None:
                import gym
                import multiworld
                multiworld.register_all_envs()
                env = gym.make(env_id)
            else:
                if vae_dataset_specific_env_kwargs is None:
                    vae_dataset_specific_env_kwargs = {}
                for key, val in env_kwargs.items():
                    if key not in vae_dataset_specific_env_kwargs:
                        vae_dataset_specific_env_kwargs[key] = val
                env = env_class(**vae_dataset_specific_env_kwargs)
            if not isinstance(env, ImageEnv):
                env = ImageEnv(
                    env,
                    imsize,
                    init_camera=init_camera,
                    transpose=True,
                    normalize=True,
                    non_presampled_goal_img_is_garbage=(
                        non_presampled_goal_img_is_garbage),
                )
            else:
                imsize = env.imsize
                env.non_presampled_goal_img_is_garbage = (
                    non_presampled_goal_img_is_garbage)
            env.reset()
            info['env'] = env
            if random_and_oracle_policy_data:
                policy_file = load_local_or_remote_file(policy_file)
                policy = policy_file['policy']
                policy.to(ptu.device)
            if random_rollout_data:
                from rlkit.exploration_strategies.ou_strategy import OUStrategy
                policy = OUStrategy(env.action_space)

            if save_trajectories:
                dataset = {
                    'observations':
                    np.zeros((N // n_random_steps, n_random_steps,
                              imsize * imsize * num_channels),
                             dtype=np.uint8),
                    'actions':
                    np.zeros((N // n_random_steps, n_random_steps,
                              env.action_space.shape[0]),
                             dtype=float),
                    'env':
                    np.zeros(
                        (N // n_random_steps, imsize * imsize * num_channels),
                        dtype=np.uint8),
                }
            else:
                dataset = np.zeros((N, imsize * imsize * num_channels),
                                   dtype=np.uint8)
            labels = []
            for i in tqdm(range(N)):
                if random_and_oracle_policy_data:
                    num_random_steps = int(
                        N * random_and_oracle_policy_data_split)
                    if i < num_random_steps:
                        env.reset()
                        for _ in range(n_random_steps):
                            obs = env.step(env.action_space.sample())[0]
                    else:
                        obs = env.reset()
                        policy.reset()
                        for _ in range(n_random_steps):
                            policy_obs = np.hstack((
                                obs['state_observation'],
                                obs['state_desired_goal'],
                            ))
                            action, _ = policy.get_action(policy_obs)
                            obs, _, _, _ = env.step(action)
                elif random_rollout_data:
                    # Add data where just the puck moves.
                    if i % n_random_steps == 0:
                        env.reset()
                        policy.reset()
                        env_img = env._get_obs()['image_observation']
                        if random_rollout_data_set_to_goal:
                            env.set_to_goal(env.get_goal())
                    obs = env._get_obs()
                    u = policy.get_action_from_raw_action(
                        env.action_space.sample())
                    env.step(u)
                elif oracle_dataset_using_set_to_goal:
                    print(i)
                    goal = env.sample_goal()
                    env.set_to_goal(goal)
                    obs = env._get_obs()
                else:
                    env.reset()
                    for _ in range(n_random_steps):
                        obs = env.step(env.action_space.sample())[0]

                img = obs['image_observation']
                if use_env_labels:
                    labels.append(obs['label'])
                if save_trajectories:
                    dataset['observations'][
                        i // n_random_steps,
                        i % n_random_steps, :] = unormalize_image(img)
                    dataset['actions'][i // n_random_steps,
                                       i % n_random_steps, :] = u
                    dataset['env'][i // n_random_steps, :] = unormalize_image(
                        env_img)
                else:
                    dataset[i, :] = unormalize_image(img)
                if show:
                    img = img.reshape(3, imsize, imsize).transpose()
                    img = img[::-1, :, ::-1]
                    cv2.imshow('img', img)
                    cv2.waitKey(1)
            print("done making training data", filename, time.time() - now)
            np.save(filename, dataset)
            # np.save(filename[:-4] + 'labels.npy', np.array(labels))

    info['train_labels'] = []
    info['test_labels'] = []

    dataset = data_filter_fn(dataset)
    if use_linear_dynamics and conditional_vae_dataset:
        num_trajectories = N // n_random_steps
        n = int(num_trajectories * test_p)
        indices = np.arange(num_trajectories)
        np.random.shuffle(indices)
        train_i, test_i = indices[:n], indices[n:]
        try:
            train_dataset = ConditionalDynamicsDataset({
                'observations':
                dataset['observations'][train_i, :, :],
                'actions':
                dataset['actions'][train_i, :, :],
                'env':
                dataset['env'][train_i, :]
            })
            test_dataset = ConditionalDynamicsDataset({
                'observations':
                dataset['observations'][test_i, :, :],
                'actions':
                dataset['actions'][test_i, :, :],
                'env':
                dataset['env'][test_i, :]
            })
        except KeyError:  # datasets saved without an 'env' key
            train_dataset = ConditionalDynamicsDataset({
                'observations':
                dataset['observations'][train_i, :, :],
                'actions':
                dataset['actions'][train_i, :, :],
            })
            test_dataset = ConditionalDynamicsDataset({
                'observations':
                dataset['observations'][test_i, :, :],
                'actions':
                dataset['actions'][test_i, :, :],
            })
    elif use_linear_dynamics:
        num_trajectories = N // n_random_steps
        n = int(num_trajectories * test_p)
        train_dataset = TrajectoryDataset({
            'observations':
            dataset['observations'][:n, :, :],
            'actions':
            dataset['actions'][:n, :, :]
        })
        test_dataset = TrajectoryDataset({
            'observations':
            dataset['observations'][n:, :, :],
            'actions':
            dataset['actions'][n:, :, :]
        })
    elif enviorment_dataset:
        n = int(n_random_steps * test_p)
        train_dataset = EnvironmentDataset({
            'observations': dataset['observations'][:, :n, :],
        })
        test_dataset = EnvironmentDataset({
            'observations': dataset['observations'][:, n:, :],
        })
    elif conditional_vae_dataset:
        num_trajectories = N // n_random_steps
        n = int(num_trajectories * test_p)
        indices = np.arange(num_trajectories)
        np.random.shuffle(indices)
        train_i, test_i = indices[:n], indices[n:]

        if augment_data:
            dataset_class = InitialObservationNumpyJitteringDataset
        else:
            dataset_class = InitialObservationNumpyDataset

        if 'env' not in dataset:
            dataset['env'] = dataset['observations'][:, 0]
        if use_test_dataset and ('env' not in test_dataset):
            test_dataset['env'] = test_dataset['observations'][:, 0]

        if use_test_dataset:
            train_dataset = dataset_class({
                'observations': dataset['observations'],
                'env': dataset['env']
            })
            test_dataset = dataset_class({
                'observations': test_dataset['observations'],
                'env': test_dataset['env']
            })
        else:
            train_dataset = dataset_class({
                'observations': dataset['observations'][train_i, :, :],
                'env': dataset['env'][train_i, :]
            })
            test_dataset = dataset_class({
                'observations': dataset['observations'][test_i, :, :],
                'env': dataset['env'][test_i, :]
            })

        train_batch_loader_kwargs = variant.get(
            'train_batch_loader_kwargs',
            dict(
                batch_size=batch_size,
                num_workers=0,
            ))
        test_batch_loader_kwargs = variant.get(
            'test_batch_loader_kwargs',
            dict(
                batch_size=batch_size,
                num_workers=0,
            ))
train_data_loader = data.DataLoader(train_dataset, shuffle=True, drop_last=True, **train_batch_loader_kwargs) test_data_loader = data.DataLoader(test_dataset, shuffle=True, drop_last=True, **test_batch_loader_kwargs) train_dataset = InfiniteBatchLoader(train_data_loader) test_dataset = InfiniteBatchLoader(test_data_loader) else: n = int(N * test_p) train_dataset = ImageObservationDataset(dataset[:n, :]) test_dataset = ImageObservationDataset(dataset[n:, :]) return train_dataset, test_dataset, info
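# Example call for generate_vae_dataset (a sketch; all values are
# illustrative assumptions -- note that N must be divisible by
# n_random_steps to pass the assert above):
variant = dict(
    env_id='SawyerPushNIPSEasy-v0',
    N=10000,
    n_random_steps=100,
    imsize=48,
    test_p=0.9,
    use_cached=False,
    oracle_dataset_using_set_to_goal=True,
)
train_dataset, test_dataset, info = generate_vae_dataset(variant)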
def experiment(variant): import multiworld multiworld.register_all_envs() eval_env = gym.make('SawyerPushXYZEnv-v0') expl_env = gym.make('SawyerPushXYZEnv-v0') observation_key = 'state_observation' desired_goal_key = 'state_desired_goal' achieved_goal_key = desired_goal_key.replace("desired", "achieved") es = GaussianAndEpislonStrategy( action_space=expl_env.action_space, max_sigma=.2, min_sigma=.2, # constant sigma epsilon=.3, ) obs_dim = expl_env.observation_space.spaces['observation'].low.size goal_dim = expl_env.observation_space.spaces['desired_goal'].low.size action_dim = expl_env.action_space.low.size qf1 = FlattenMlp( input_size=obs_dim + goal_dim + action_dim, output_size=1, **variant['qf_kwargs'] ) qf2 = FlattenMlp( input_size=obs_dim + goal_dim + action_dim, output_size=1, **variant['qf_kwargs'] ) target_qf1 = FlattenMlp( input_size=obs_dim + goal_dim + action_dim, output_size=1, **variant['qf_kwargs'] ) target_qf2 = FlattenMlp( input_size=obs_dim + goal_dim + action_dim, output_size=1, **variant['qf_kwargs'] ) policy = TanhMlpPolicy( input_size=obs_dim + goal_dim, output_size=action_dim, **variant['policy_kwargs'] ) target_policy = TanhMlpPolicy( input_size=obs_dim + goal_dim, output_size=action_dim, **variant['policy_kwargs'] ) expl_policy = PolicyWrappedWithExplorationStrategy( exploration_strategy=es, policy=policy, ) replay_buffer = ObsDictRelabelingBuffer( env=eval_env, observation_key=observation_key, desired_goal_key=desired_goal_key, achieved_goal_key=achieved_goal_key, **variant['replay_buffer_kwargs'] ) trainer = TD3Trainer( policy=policy, qf1=qf1, qf2=qf2, target_qf1=target_qf1, target_qf2=target_qf2, target_policy=target_policy, **variant['trainer_kwargs'] ) trainer = HERTrainer(trainer) eval_path_collector = GoalConditionedPathCollector( eval_env, policy, observation_key=observation_key, desired_goal_key=desired_goal_key, ) expl_path_collector = GoalConditionedPathCollector( expl_env, expl_policy, observation_key=observation_key, desired_goal_key=desired_goal_key, ) algorithm = TorchBatchRLAlgorithm( trainer=trainer, exploration_env=expl_env, evaluation_env=eval_env, exploration_data_collector=expl_path_collector, evaluation_data_collector=eval_path_collector, replay_buffer=replay_buffer, **variant['algo_kwargs'] ) algorithm.to(ptu.device) algorithm.train()
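# Example variant for the TD3+HER experiment above (a sketch; all
# hyperparameters are illustrative assumptions). Note that max_path_length
# is passed through algo_kwargs here, since experiment() forwards
# variant['algo_kwargs'] directly to TorchBatchRLAlgorithm:
variant = dict(
    qf_kwargs=dict(hidden_sizes=[400, 300]),
    policy_kwargs=dict(hidden_sizes=[400, 300]),
    replay_buffer_kwargs=dict(
        max_size=int(1e6),
        fraction_goals_rollout_goals=0.2,
        fraction_goals_env_goals=0.0,
    ),
    trainer_kwargs=dict(discount=0.99),
    algo_kwargs=dict(
        batch_size=128,
        num_epochs=100,
        num_eval_steps_per_epoch=1000,
        num_expl_steps_per_train_loop=1000,
        num_trains_per_train_loop=1000,
        min_num_steps_before_training=1000,
        max_path_length=50,
    ),
)
experiment(variant)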
def generate_sawyerhurdle_dataset(variant,
                                  segmented=False,
                                  segmentation_method='unet'):
    from multiworld.core.image_env import ImageEnv, unormalize_image
    env_id = variant.get('env_id', None)
    N = variant.get('N', 10000)
    test_p = variant.get('test_p', 0.9)
    imsize = variant.get('imsize', 84)
    num_channels = variant.get('num_channels', 3)
    init_camera = variant.get('init_camera', None)
    segmentation_kwargs = variant.get('segmentation_kwargs', {})

    pjhome = os.environ['PJHOME']
    seg_name = 'seg-' + segmentation_method if segmented else 'no-seg'
    data_file_path = osp.join(pjhome, 'data/local/pre-train-vae',
                              '{}-{}-{}.npy'.format(env_id, seg_name, N))
    puck_pos_path = osp.join(
        pjhome, 'data/local/pre-train-vae',
        '{}-{}-{}-puck-pos.npy'.format(env_id, seg_name, N))

    if osp.exists(data_file_path):
        all_data = np.load(data_file_path)
        if len(all_data) >= N:
            print("load stored data at: ", data_file_path)
            n = int(len(all_data) * test_p)
            train_dataset = all_data[:n]
            test_dataset = all_data[n:]
            puck_pos = np.load(puck_pos_path)
            info = {'puck_pos': puck_pos}
            return train_dataset, test_dataset, info

    if segmented:
        print("generating vae dataset with segmented images using method: ",
              segmentation_method)
        if segmentation_method == 'unet':
            segment_func = segment_image_unet
        else:
            raise NotImplementedError
    else:
        print("generating vae dataset with original images")

    assert env_id is not None
    import gym
    import multiworld
    multiworld.register_all_envs()
    env = gym.make(env_id)

    if not isinstance(env, ImageEnv):
        env = ImageEnv(
            env,
            imsize,
            init_camera=init_camera,
            transpose=True,
            normalize=True,
        )

    info = {}
    env.reset()
    info['env'] = env

    dataset = np.zeros((N, imsize * imsize * num_channels), dtype=np.uint8)
    puck_pos = np.zeros((N, 2), dtype=float)

    for i in range(N):
        print("sawyer hurdle custom vae dataset generation, number: ", i)
        if env_id == 'SawyerPushHurdle-v0':
            obs, puck_p = _generate_sawyerhurdle_dataset(env,
                                                         return_puck_pos=True)
        elif env_id == 'SawyerPushHurdleMiddle-v0':
            obs, puck_p = _generate_sawyerhurdlemiddle_dataset(
                env, return_puck_pos=True)
        else:
            raise NotImplementedError
        img = obs[
            'image_observation']  # NOTE yufei: this is already a normalized image, of dtype np.float64.
        if segmented:
            dataset[i, :] = segment_func(img,
                                         normalize=False,
                                         **segmentation_kwargs)
        else:
            dataset[i, :] = unormalize_image(img)
        puck_pos[i] = puck_p

    n = int(N * test_p)
    train_dataset = dataset[:n, :]
    test_dataset = dataset[n:, :]
    info['puck_pos'] = puck_pos

    if N >= 2000:
        print('save data to: ', data_file_path)
        all_data = np.concatenate([train_dataset, test_dataset], axis=0)
        np.save(data_file_path, all_data)
        np.save(puck_pos_path, puck_pos)

    return train_dataset, test_dataset, info
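# Example usage of generate_sawyerhurdle_dataset (a sketch; assumes the
# PJHOME environment variable is set and, for segmented=True, that the unet
# segmentation assets are available):
variant = dict(
    env_id='SawyerPushHurdle-v0',
    N=2000,
    test_p=0.9,
    imsize=48,
    init_camera=None,
)
train_dataset, test_dataset, info = generate_sawyerhurdle_dataset(
    variant, segmented=False)
print(train_dataset.shape, test_dataset.shape, info['puck_pos'].shape)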