def run_ars(params):
    dir_path = params['dir_path']
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
    logdir = dir_path

    # Directory for saving videos (for Colab).
    monitor_dir = os.path.join(dir_path, 'monitor')
    if not os.path.exists(monitor_dir):
        os.makedirs(monitor_dir)

    # Build the Laikago imitation environment.
    env = env_builder.build_imitation_env(motion_files=[params['motion_file']],
                                          num_parallel_envs=1,
                                          mode='train',
                                          enable_randomizer=False,
                                          enable_rendering=params['visualize'],
                                          action_lim=params['actionlim'],
                                          curr_steps=params['currsteps'],
                                          path=params['path'])
    # env = gym.make(params['env_name'])
    # env = wrappers.Monitor(env, monitor_dir, force=True)

    ob_dim = env.observation_space.shape[0]  # should be 4 + 4 + 12 + 33
    ob_dim_h = 4 * 3
    ac_dim = env.action_space.shape[0]  # should be 12 + 33

    # Set policy parameters. Possible filters: 'MeanStdFilter' for ARS V2, 'NoFilter' for ARS V1.
    policy_params = {'type': params['policy_type'],
                     'ob_filter': params['filter'],
                     'ob_dim': ob_dim,
                     'ac_dim': ac_dim,
                     'history_size': 3,
                     'latent_dim': 2,
                     'ob_h_dim': ob_dim_h,
                     'ob_l_dim': ob_dim - ob_dim_h}

    ARS = ARSLearner(env_name=params['env_name'],
                     policy_params=policy_params,
                     num_workers=params['n_workers'],
                     num_deltas=params['n_directions'],
                     deltas_used=params['deltas_used'],
                     step_size=params['step_size'],
                     delta_std=params['delta_std'],
                     logdir=logdir,
                     rollout_length=params['rollout_length'],
                     shift=params['shift'],
                     params=params,
                     seed=params['seed'])

    ARS.train(params['n_iter'])
    return
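# A minimal sketch of invoking run_ars. Every value below is a hypothetical
# placeholder; in this repo the dict is normally assembled from command-line
# arguments rather than written out by hand.
def _example_run_ars():
    example_params = {
        'dir_path': 'output/ars',
        'motion_file': 'motion_imitation/data/motions/laikago_dog_pace.txt',
        'visualize': False,
        'actionlim': 1.0,
        'currsteps': 0,
        'path': None,
        'policy_type': 'honly',
        'filter': 'MeanStdFilter',
        'env_name': 'LaikagoImitation',
        'n_workers': 8,
        'n_directions': 16,
        'deltas_used': 8,
        'step_size': 0.02,
        'delta_std': 0.03,
        'rollout_length': 1000,
        'shift': 0,
        'seed': 123,
        'n_iter': 1000,
        'initweights': None,
    }
    run_ars(example_params)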
def main():
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--seed", dest="seed", type=int, default=None)
    arg_parser.add_argument("--mode", dest="mode", type=str, default="train")
    arg_parser.add_argument("--motion_file", dest="motion_file", type=str,
                            default="motion_imitation/data/motions/laikago_dog_pace.txt")
    arg_parser.add_argument("--visualize", dest="visualize", action="store_true", default=False)
    arg_parser.add_argument("--output_dir", dest="output_dir", type=str, default="output")
    arg_parser.add_argument("--num_test_episodes", dest="num_test_episodes", type=int, default=None)
    arg_parser.add_argument("--model_file", dest="model_file", type=str, default="")
    # Note: argparse does not convert defaults, so use int(2e8) rather than the float 2e8.
    arg_parser.add_argument("--total_timesteps", dest="total_timesteps", type=int, default=int(2e8))
    # Save an intermediate model every n policy steps (0 disables intermediate saves).
    arg_parser.add_argument("--int_save_freq", dest="int_save_freq", type=int, default=0)
    args = arg_parser.parse_args()

    num_procs = MPI.COMM_WORLD.Get_size()
    os.environ["CUDA_VISIBLE_DEVICES"] = '-1'  # force CPU-only execution

    enable_env_rand = ENABLE_ENV_RANDOMIZER and (args.mode != "test")
    env = env_builder.build_imitation_env(motion_files=[args.motion_file],
                                          num_parallel_envs=num_procs,
                                          mode=args.mode,
                                          enable_randomizer=enable_env_rand,
                                          enable_rendering=args.visualize)

    model = build_model(env=env,
                        num_procs=num_procs,
                        timesteps_per_actorbatch=TIMESTEPS_PER_ACTORBATCH,
                        optim_batchsize=OPTIM_BATCHSIZE,
                        output_dir=args.output_dir)

    if args.model_file != "":
        model.load_parameters(args.model_file)

    if args.mode == "train":
        train(model=model,
              env=env,
              total_timesteps=args.total_timesteps,
              output_dir=args.output_dir,
              int_save_freq=args.int_save_freq)
    elif args.mode == "test":
        test(model=model,
             env=env,
             num_procs=num_procs,
             num_episodes=args.num_test_episodes)
    else:
        assert False, "Unsupported mode: " + args.mode

    return
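# Standard entry-point guard; assumed here since the original excerpt does not
# show it, but main() is clearly intended to run on direct execution.
if __name__ == '__main__':
    main()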
def __init__(self, env_seed,
             env_name='',
             policy_params=None,
             deltas=None,
             rollout_length=1000,
             delta_std=0.02,
             params=None):

    # Initialize an OpenAI Gym environment for each worker.
    self.env = env_builder.build_imitation_env(motion_files=[params['motion_file']],
                                               num_parallel_envs=1,
                                               mode='train',
                                               enable_randomizer=False,
                                               enable_rendering=params['visualize'],
                                               action_lim=params['actionlim'],
                                               curr_steps=params['currsteps'],
                                               path=params['path'])
    # self.env = gym.make(env_name)
    # self.env.seed(env_seed)

    # Each worker gets access to the shared noise table, with an independent
    # random stream for sampling from it.
    self.deltas = SharedNoiseTable(deltas, env_seed + 7)
    self.policy_params = policy_params
    if policy_params['type'] == 'linear':
        self.policy = LinearPolicy(policy_params)
    elif policy_params['type'] == 'honly':
        self.policy = HLinearPolicyHOnly(policy_params)
        if params['initweights'] is not None:
            self.policy.loadWeights(params['initweights'])
    else:
        self.policy = HLinearPolicy(policy_params)
        if params['initweights'] is not None:
            self.policy.loadWeights(params['initweights'])

    self.delta_std = delta_std
    self.rollout_length = rollout_length
args = arg_parser.parse_args()

hp = Hp(nb_steps=args.steps,
        episode_length=args.eplength,
        learning_rate=args.learnrate,
        nb_directions=args.ndirections,
        nb_best_directions=args.nbestdir,
        noise=args.noise,
        seed=1,
        latent_dim=args.latent)
np.random.seed(hp.seed)

env = env_builder.build_imitation_env(motion_files=[args.motion_file],
                                      num_parallel_envs=1,
                                      mode=args.mode,
                                      enable_randomizer=False,
                                      enable_rendering=args.visualize,
                                      action_lim=args.actionlim,
                                      curr_steps=0)
# env = wrappers.Monitor(env, video_path, force=True)

nb_inputs = env.observation_space.shape[0]
nb_outputs = env.action_space.shape[0]

# input_dim_h and sensor_history_num are assumed to be defined at module scope.
if args.policytype == 0:
    policy = HPolicy(input_dim_h, nb_inputs - input_dim_h, hp.latent_dim,
                     nb_outputs, sensor_history_num)
else:
    policy = HPolicyhlb(input_dim_h, nb_inputs - input_dim_h, hp.latent_dim,
                        nb_outputs, sensor_history_num,
                        args.latentval1, args.latentval2)

normalizer = Normalizer(nb_inputs)
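# A sketch of how the objects above typically interact during one ARS rollout
# step. Normalizer.observe()/normalize() follow the standard online mean/std
# ARS filter; policy.evaluate() is an assumed method name, not this repo's
# verified API.
def _example_rollout_step(env, policy, normalizer, obs):
    normalizer.observe(obs)  # update the running observation statistics
    action = policy.evaluate(normalizer.normalize(obs))  # act on the filtered observation
    return env.step(action)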
def __init__(self,
             env_name='HalfCheetah-v1',
             policy_params=None,
             num_workers=32,
             num_deltas=320,
             deltas_used=320,
             delta_std=0.02,
             logdir=None,
             rollout_length=1000,
             step_size=0.01,
             shift='constant zero',
             params=None,
             seed=123):

    logz.configure_output_dir(logdir)
    logz.save_params(params)

    env = env_builder.build_imitation_env(motion_files=[params['motion_file']],
                                          num_parallel_envs=1,
                                          mode='train',
                                          enable_randomizer=False,
                                          enable_rendering=params['visualize'],
                                          action_lim=params['actionlim'],
                                          curr_steps=params['currsteps'],
                                          path=params['path'])

    self.timesteps = 0
    self.action_size = env.action_space.shape[0]
    self.ob_size = env.observation_space.shape[0]
    self.num_deltas = num_deltas
    self.deltas_used = deltas_used
    self.rollout_length = rollout_length
    self.step_size = step_size
    self.delta_std = delta_std
    self.logdir = logdir
    self.shift = shift
    self.params = params
    self.max_past_avg_reward = float('-inf')
    self.num_episodes_used = float('inf')

    # Create a shared table for storing noise.
    print("Creating deltas table.")
    deltas_id = create_shared_noise.remote()
    self.deltas = SharedNoiseTable(ray.get(deltas_id), seed=seed + 3)
    print("Created deltas table.")

    # Initialize workers with different random seeds.
    print("Initializing workers.")
    self.num_workers = num_workers
    self.workers = [Worker.remote(seed + 7 * i,
                                  env_name=env_name,
                                  policy_params=policy_params,
                                  deltas=deltas_id,
                                  rollout_length=rollout_length,
                                  delta_std=delta_std,
                                  params=params)
                    for i in range(num_workers)]

    # Initialize the policy.
    if policy_params['type'] == 'hlinear':
        self.policy = HLinearPolicy(policy_params)
        if params['initweights'] is not None:
            self.policy.loadWeights(params['initweights'])
        self.w_policy = self.policy.get_weights()
    elif policy_params['type'] == 'honly':
        self.policy = HLinearPolicyHOnly(policy_params)
        if params['initweights'] is not None:
            self.policy.loadWeights(params['initweights'])
        self.w_policy = self.policy.get_weights()
    else:
        raise NotImplementedError

    # Initialize the optimization algorithm.
    self.optimizer = optimizers.SGD(self.w_policy, self.step_size)
    print("Initialization of ARS complete.")
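# Ray must be initialized before ARSLearner is constructed, since
# create_shared_noise.remote() and Worker.remote(...) require a running Ray
# runtime. A minimal sketch, assuming params/policy_params dicts as built in
# run_ars above; the num_cpus choice is an assumption, not a repo requirement.
def _example_make_learner(params, policy_params):
    import ray
    if not ray.is_initialized():
        ray.init(num_cpus=params['n_workers'])
    return ARSLearner(env_name=params['env_name'],
                      policy_params=policy_params,
                      num_workers=params['n_workers'],
                      params=params,
                      seed=params['seed'])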