def run_experiment(variant):
    # Select the environment requested by the variant.
    if variant['env_name'] == 'humanoid-rllab':
        env = normalize(HumanoidEnv())
    elif variant['env_name'] == 'swimmer-rllab':
        env = normalize(SwimmerEnv())
    elif variant['env_name'] == 'ant-rllab':
        env = normalize(AntEnv())
    else:
        env = normalize(GymEnv(variant['env_name']))

    pool = SimpleReplayBuffer(
        env_spec=spec(env),
        max_replay_buffer_size=variant['max_pool_size'])

    sampler = SimpleSampler(
        max_path_length=variant['max_path_length'],
        min_pool_size=variant['max_path_length'],
        batch_size=variant['batch_size'])

    base_kwargs = dict(
        epoch_length=variant['epoch_length'],
        n_epochs=variant['n_epochs'],
        n_train_repeat=variant['n_train_repeat'],
        eval_render=False,
        eval_n_episodes=1,
        sampler=sampler)

    M = variant['layer_size']
    qf = NNQFunction(env_spec=spec(env), hidden_layer_sizes=(M, M))

    policy = StochasticNNPolicy(env_spec=spec(env), hidden_layer_sizes=(M, M))

    algorithm = SQL(
        base_kwargs=base_kwargs,
        env=env,
        pool=pool,
        qf=qf,
        policy=policy,
        kernel_fn=adaptive_isotropic_gaussian_kernel,
        kernel_n_particles=variant['kernel_particles'],
        kernel_update_ratio=variant['kernel_update_ratio'],
        value_n_particles=variant['value_n_particles'],
        td_target_update_interval=variant['td_target_update_interval'],
        qf_lr=variant['qf_lr'],
        policy_lr=variant['policy_lr'],
        discount=variant['discount'],
        reward_scale=variant['reward_scale'],
        save_full_state=False)

    algorithm.train()
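# Purely illustrative `variant` dictionary for the run_experiment function above.
# The keys are exactly the ones the function reads; the concrete values are
# assumed placeholders, not settings taken from any published configuration.
example_variant = {
    'env_name': 'swimmer-rllab',
    'max_pool_size': int(1e6),
    'max_path_length': 1000,
    'batch_size': 128,
    'epoch_length': 1000,
    'n_epochs': 500,
    'n_train_repeat': 1,
    'layer_size': 128,
    'kernel_particles': 16,
    'kernel_update_ratio': 0.5,
    'value_n_particles': 16,
    'td_target_update_interval': 1000,
    'qf_lr': 3e-4,
    'policy_lr': 3e-4,
    'discount': 0.99,
    'reward_scale': 1.0,
}
# run_experiment(example_variant)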
def run_experiment(variant):
    if variant['env_name'] == 'pusher':
        # TODO: assumes `pusher.xml` is located in `rllab/models/` when
        # running on EC2.
        env = normalize(PusherEnv(goal=variant.get('goal')))
    else:
        raise ValueError('Unknown environment: {}'.format(variant['env_name']))

    pool = SimpleReplayBuffer(
        env_spec=spec(env),
        max_replay_buffer_size=variant['max_pool_size'])

    sampler = SimpleSampler(
        max_path_length=variant['max_path_length'],
        min_pool_size=variant['max_path_length'],
        batch_size=variant['batch_size'])

    base_kwargs = dict(
        epoch_length=variant['epoch_length'],
        n_epochs=variant['n_epochs'],
        n_train_repeat=variant['n_train_repeat'],
        eval_render=False,
        eval_n_episodes=1,
        sampler=sampler)

    # Hash the variant so that each task's networks get unique names and can
    # coexist in a single TensorFlow graph.
    task_id = abs(pickle.dumps(variant).__hash__())

    M = variant['layer_size']
    qf = NNQFunction(
        env_spec=spec(env),
        hidden_layer_sizes=(M, M),
        name='qf_{i}'.format(i=task_id))

    policy = StochasticNNPolicy(
        env_spec=spec(env),
        hidden_layer_sizes=(M, M),
        name='policy_{i}'.format(i=task_id))

    algorithm = SQL(
        base_kwargs=base_kwargs,
        env=env,
        pool=pool,
        qf=qf,
        policy=policy,
        kernel_fn=adaptive_isotropic_gaussian_kernel,
        kernel_n_particles=variant['kernel_particles'],
        kernel_update_ratio=variant['kernel_update_ratio'],
        value_n_particles=variant['value_n_particles'],
        td_target_update_interval=variant['td_target_update_interval'],
        qf_lr=variant['qf_lr'],
        policy_lr=variant['policy_lr'],
        discount=variant['discount'],
        reward_scale=variant['reward_scale'],
        save_full_state=variant['save_full_state'])

    algorithm.train()
def run_task(snapshot_config, *_):
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        with tf.variable_scope('AST', reuse=tf.AUTO_REUSE):
            with LocalTFRunner(snapshot_config=snapshot_config,
                               max_cpus=4,
                               sess=sess) as local_runner:

                # Instantiate the example classes
                sim = ExampleAVSimulator(**sim_args)
                reward_function = ExampleAVReward(**reward_args)
                spaces = ExampleAVSpaces(**spaces_args)

                # Create the environment
                if 'id' in env_args:
                    env_args.pop('id')
                env = TfEnv(normalize(ASTEnv(simulator=sim,
                                             reward_function=reward_function,
                                             spaces=spaces,
                                             **env_args)))

                # Instantiate the garage objects
                policy = GaussianLSTMPolicy(env_spec=env.spec, **policy_args)

                baseline = LinearFeatureBaseline(env_spec=env.spec,
                                                 **baseline_args)

                optimizer = ConjugateGradientOptimizer
                optimizer_args = {
                    'hvp_approach': FiniteDifferenceHvp(base_eps=1e-5)
                }

                algo = PPO(env_spec=env.spec,
                           policy=policy,
                           baseline=baseline,
                           optimizer=optimizer,
                           optimizer_args=optimizer_args,
                           **algo_args)

                sampler_cls = ASTVectorizedSampler

                local_runner.setup(
                    algo=algo,
                    env=env,
                    sampler_cls=sampler_cls,
                    sampler_args={"open_loop": False,
                                  "sim": sim,
                                  "reward_function": reward_function,
                                  'n_envs': n_parallel})

                # Run the experiment
                local_runner.train(**runner_args)
def test():
    env = normalize(MultiGoalEnv())

    pool = SimpleReplayBuffer(env_spec=spec(env), max_replay_buffer_size=1e6)

    sampler = SimpleSampler(
        max_path_length=30, min_pool_size=100, batch_size=64)

    base_kwargs = {
        'sampler': sampler,
        'epoch_length': 100,
        'n_epochs': 1000,
        'n_train_repeat': 1,
        'eval_render': True,
        'eval_n_episodes': 10
    }

    M = 128
    policy = StochasticNNPolicy(
        spec(env), hidden_layer_sizes=(M, M), squash=True)

    qf = NNQFunction(env_spec=spec(env), hidden_layer_sizes=(M, M))

    plotter = QFPolicyPlotter(
        qf=qf,
        policy=policy,
        obs_lst=np.array([[-2.5, 0.0], [0.0, 0.0], [2.5, 2.5]]),
        default_action=[np.nan, np.nan],
        n_samples=100)

    algorithm = SQL(
        base_kwargs=base_kwargs,
        env=env,
        pool=pool,
        qf=qf,
        policy=policy,
        plotter=plotter,
        policy_lr=3e-4,
        qf_lr=3e-4,
        value_n_particles=16,
        td_target_update_interval=1000,
        kernel_fn=adaptive_isotropic_gaussian_kernel,
        kernel_n_particles=32,
        kernel_update_ratio=0.5,
        discount=0.99,
        reward_scale=0.1,
        save_full_state=False)

    algorithm.train()
def run_experiment(variant):
    env = normalize(SwimmerEnv())

    pool = SimpleReplayBuffer(env_spec=spec(env), max_replay_buffer_size=1e6)

    sampler = SimpleSampler(
        max_path_length=1000, min_pool_size=1000, batch_size=128)

    base_kwargs = dict(
        epoch_length=1000,
        n_epochs=500,
        n_train_repeat=1,
        eval_render=False,
        eval_n_episodes=1,
        sampler=sampler)

    with tf.Session().as_default():
        # Restore a previously trained Q-function and policy from the snapshot
        # file referenced by the variant, then continue training with SQL.
        data = joblib.load(variant['file'])
        if 'algo' in data.keys():
            saved_qf = data['algo'].qf
            saved_policy = data['algo'].policy
        else:
            saved_qf = data['qf']
            saved_policy = data['policy']

        algorithm = SQL(
            base_kwargs=base_kwargs,
            env=env,
            pool=pool,
            qf=saved_qf,
            policy=saved_policy,
            kernel_fn=adaptive_isotropic_gaussian_kernel,
            kernel_n_particles=16,
            kernel_update_ratio=0.5,
            value_n_particles=16,
            td_target_update_interval=1000,
            qf_lr=3E-4,
            policy_lr=3E-4,
            discount=0.99,
            reward_scale=30,
            use_saved_qf=True,
            use_saved_policy=True,
            save_full_state=False)

        algorithm.train()
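# Illustrative call for the restore-and-continue script above. `variant['file']`
# is expected to point at a snapshot saved by a previous run (loaded with
# joblib) that contains either a full `algo` object or separate `qf`/`policy`
# entries. The path below is a hypothetical placeholder.
# run_experiment({'file': '/path/to/snapshot.pkl'})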
def run_task(snapshot_config, *_):
    with LocalTFRunner(snapshot_config=snapshot_config, max_cpus=1) as runner:
        # Instantiate the example classes
        sim = ExampleAVSimulator()
        reward_function = ExampleAVReward()
        spaces = ExampleAVSpaces()

        # Create the environment
        env = TfEnv(
            normalize(
                ASTEnv(blackbox_sim_state=True,
                       fixed_init_state=True,
                       s_0=[-0.5, -4.0, 1.0, 11.17, -35.0],
                       simulator=sim,
                       reward_function=reward_function,
                       spaces=spaces)))

        # Instantiate the garage objects
        policy = GaussianLSTMPolicy(name='lstm_policy',
                                    env_spec=env.spec,
                                    hidden_dim=64)

        baseline = LinearFeatureBaseline(env_spec=env.spec)

        algo = TRPO(env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=max_path_length,
                    discount=0.99,
                    kl_constraint='soft',
                    max_kl_step=0.01)

        sampler_cls = ASTVectorizedSampler

        runner.setup(algo=algo,
                     env=env,
                     sampler_cls=sampler_cls,
                     sampler_args={
                         "sim": sim,
                         "reward_function": reward_function
                     })

        runner.train(n_epochs=1, batch_size=4000, plot=False)

        print("Installation successfully validated")
def run_experiment(variant):
    env = normalize(PusherEnv(goal=variant.get('goal')))

    # Load the replay buffers and Q-functions of the two pretrained tasks.
    buffer1, qf1 = load_buffer_and_qf(variant['snapshot1'])
    buffer2, qf2 = load_buffer_and_qf(variant['snapshot2'])

    sampler = DummySampler(
        batch_size=variant['batch_size'],
        max_path_length=variant['max_path_length'])
    buffer = UnionBuffer(buffers=(buffer1, buffer2))

    # The composed policy is trained against the sum of the pretrained
    # Q-functions; the Q-functions themselves are kept fixed (train_qf=False).
    qf = SumQFunction(spec(env), q_functions=(qf1, qf2))

    M = variant['layer_size']
    policy = StochasticNNPolicy(
        env_spec=spec(env),
        hidden_layer_sizes=(M, M),
        name='policy{i}'.format(i=0))

    base_kwargs = dict(
        epoch_length=variant['epoch_length'],
        n_epochs=variant['n_epochs'],
        n_train_repeat=1,
        eval_render=False,
        eval_n_episodes=1,
        sampler=sampler)

    algorithm = SQL(
        base_kwargs=base_kwargs,
        env=env,
        pool=buffer,
        qf=qf,
        policy=policy,
        kernel_fn=adaptive_isotropic_gaussian_kernel,
        kernel_n_particles=variant['kernel_particles'],
        kernel_update_ratio=variant['kernel_update_ratio'],
        policy_lr=variant['policy_lr'],
        save_full_state=False,
        train_policy=True,
        train_qf=False,
        use_saved_qf=True)

    algorithm.train()
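# Hypothetical `variant` for the policy-composition experiment above. The keys
# mirror what the function reads; the snapshot paths, goal, and numeric values
# are placeholder assumptions for illustration only.
compose_variant = {
    'goal': None,  # placeholder; PusherEnv goal as used when the snapshots were trained
    'snapshot1': '/path/to/task1_snapshot.pkl',
    'snapshot2': '/path/to/task2_snapshot.pkl',
    'batch_size': 128,
    'max_path_length': 100,
    'layer_size': 128,
    'epoch_length': 100,
    'n_epochs': 100,
    'kernel_particles': 16,
    'kernel_update_ratio': 0.5,
    'policy_lr': 3e-4,
}
# run_experiment(compose_variant)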
def test_can_create_env(self):
    # Fixes https://github.com/rlworkgroup/garage/pull/420
    env = normalize(SwimmerEnv())
    assert env
def run_task(snapshot_config, *_):
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        with tf.variable_scope('AST', reuse=tf.AUTO_REUSE):
            with LocalTFRunner(snapshot_config=snapshot_config,
                               max_cpus=4,
                               sess=sess) as local_runner:
                # Instantiate the example classes
                sim = ExampleAVSimulator(**sim_args)
                reward_function = ExampleAVReward(**reward_args)
                spaces = ExampleAVSpaces(**spaces_args)

                # Create the environment
                if 'id' in env_args:
                    env_args.pop('id')
                env = TfEnv(
                    normalize(
                        ASTEnv(simulator=sim,
                               reward_function=reward_function,
                               spaces=spaces,
                               **env_args)))

                # Instantiate the garage objects
                policy = GaussianLSTMPolicy(env_spec=env.spec, **policy_args)

                baseline = LinearFeatureBaseline(env_spec=env.spec,
                                                 **baseline_args)

                optimizer = ConjugateGradientOptimizer
                optimizer_args = {
                    'hvp_approach': FiniteDifferenceHvp(base_eps=1e-5)
                }

                algo = PPO(env_spec=env.spec,
                           policy=policy,
                           baseline=baseline,
                           optimizer=optimizer,
                           optimizer_args=optimizer_args,
                           **algo_args)

                sampler_cls = ASTVectorizedSampler
                sampler_args['sim'] = sim
                sampler_args['reward_function'] = reward_function

                local_runner.setup(algo=algo,
                                   env=env,
                                   sampler_cls=sampler_cls,
                                   sampler_args=sampler_args)

                # Run the experiment
                local_runner.train(**runner_args)

                if save_expert_trajectory:
                    load_convert_and_save_drl_expert_trajectory(
                        last_iter_filename=os.path.join(
                            run_experiment_args['log_dir'],
                            'itr_' + str(runner_args['n_epochs'] - 1) + '.pkl'),
                        expert_trajectory_filename=os.path.join(
                            run_experiment_args['log_dir'],
                            'expert_trajectory.pkl'))

                print('done!')
def run_task(snapshot_config, *_):
    config = tf.ConfigProto(device_count={'GPU': 0})
    # config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        with tf.variable_scope('AST', reuse=tf.AUTO_REUSE):
            with LocalTFRunner(snapshot_config=snapshot_config,
                               sess=sess) as runner:

                # Instantiate the example classes
                # sim = ExampleAVSimulator()
                g = 9.8  # acceleration due to gravity

                # this is y
                lat_params = rss.LateralParams(
                    0,  # ρ
                    0.1 * g,  # a_lat_max_acc
                    0.05 * g,  # a_lat_min_brake
                    1.4  # Buffer distance
                )

                # this is x
                long_params = rss.LongitudinalParams(
                    0,  # ρ
                    0.7 * g,  # a_max_brake
                    0.1 * g,  # a_max_acc
                    0.7 * g,  # a_min_brake1
                    0.7 * g,  # a_min_brake2
                    2.5,  # Buffer
                )

                sim = AVRSSSimulator(lat_params, long_params)
                reward_function = HeuristicReward(
                    PedestrianNoiseGaussian(1, 1, 0.2, .01),
                    np.array([-10000, -1000, 0]))
                # reward_function = ExampleAVReward()
                spaces = ExampleAVSpaces()

                # Create the environment
                # env1 = GoExploreASTEnv(open_loop=False,
                #                        blackbox_sim_state=True,
                #                        fixed_init_state=True,
                #                        s_0=[-0.5, -4.0, 1.0, 11.17, -35.0],
                #                        simulator=sim,
                #                        reward_function=reward_function,
                #                        spaces=spaces)
                s_0 = [-1.0, -2.0, 1.0, 11.17, -35.0]
                env1 = gym.make('ast_toolbox:GoExploreAST-v1',
                                open_loop=False,
                                action_only=True,
                                fixed_init_state=True,
                                s_0=s_0,
                                simulator=sim,
                                reward_function=reward_function,
                                spaces=spaces)
                env2 = normalize(env1)
                env = TfEnv(env2)

                # Instantiate the garage objects
                policy = GoExplorePolicy(env_spec=env.spec)

                baseline = LinearFeatureBaseline(env_spec=env.spec)

                algo = GoExplore(
                    db_filename=db_filename,
                    max_db_size=max_db_size,
                    env=env,
                    env_spec=env.spec,
                    policy=policy,
                    baseline=baseline,
                    max_path_length=max_path_length,
                    discount=discount,
                    # whole_paths=whole_paths
                )

                sampler_cls = BatchSampler
                sampler_args = {'n_envs': n_parallel}

                runner.setup(algo=algo,
                             env=env,
                             sampler_cls=sampler_cls,
                             sampler_args=sampler_args)
                # runner.setup(
                #     algo=algo,
                #     env=env,
                #     sampler_cls=sampler_cls,
                #     sampler_args={"sim": sim,
                #                   "reward_function": reward_function})

                # Run the experiment
                paths = runner.train(n_epochs=n_itr,
                                     batch_size=batch_size,
                                     plot=False)
                print(paths)

                best_traj = paths.trajectory * np.array([
                    1, 1 / 1000, 1 / 1000, 1 / 1000, 1 / 1000, 1 / 1000,
                    1 / 1000
                ])

                peds = sim._peds
                car = np.expand_dims(sim._car, axis=0)
                car_obs = sim._car_obs
                for step in range(best_traj.shape[0]):
                    sim.step(action=best_traj[step, 1:], open_loop=False)
                    peds = np.concatenate((peds, sim._peds), axis=0)
                    car = np.concatenate(
                        (car, np.expand_dims(sim._car, axis=0)), axis=0)
                    car_obs = np.concatenate((car_obs, sim._car_obs), axis=0)

                import matplotlib.pyplot as plt
                plt.scatter(car[:, 2], car[:, 3])
                plt.scatter(peds[:, 2], peds[:, 3])
                plt.scatter(car_obs[:, 2], car_obs[:, 3])
                pdb.set_trace()
                print('done!')
def run_task(snapshot_config, *_):
    config = tf.ConfigProto(device_count={'GPU': 0})
    # config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        with tf.variable_scope('AST', reuse=tf.AUTO_REUSE):
            # Instantiate the example classes
            sim = ExampleAVSimulator(**sim_args)
            reward_function = ExampleAVReward(**reward_args)
            spaces = ExampleAVSpaces(**spaces_args)

            # Create the environment
            # env1 = GoExploreASTEnv(open_loop=False,
            #                        blackbox_sim_state=True,
            #                        fixed_init_state=True,
            #                        s_0=[-0.5, -4.0, 1.0, 11.17, -35.0],
            #                        simulator=sim,
            #                        reward_function=reward_function,
            #                        spaces=spaces)
            env1 = gym.make(id=env_args.pop('id'),
                            simulator=sim,
                            reward_function=reward_function,
                            spaces=spaces,
                            **env_args)
            env2 = normalize(env1)
            env = TfEnv(env2)

            sampler_cls = BatchSampler
            # sampler_args = {'n_envs': n_parallel}
            sampler_args = {}

            # expert_trajectory_file = log_dir + '/expert_trajectory.p'
            # with open(expert_trajectory_file, 'rb') as f:
            #     expert_trajectory = pickle.load(f)

            # Run backwards algorithm to robustify
            with LocalTFRunner(snapshot_config=snapshot_config,
                               sess=sess) as local_runner:

                policy = GaussianLSTMPolicy(env_spec=env.spec, **policy_args)
                # name='lstm_policy',
                # env_spec=env.spec,
                # hidden_dim=64,
                # use_peepholes=True)

                baseline = LinearFeatureBaseline(env_spec=env.spec,
                                                 **baseline_args)

                optimizer = ConjugateGradientOptimizer
                optimizer_args = {
                    'hvp_approach': FiniteDifferenceHvp(base_eps=1e-5)
                }

                algo = BackwardAlgorithm(env=env,
                                         env_spec=env.spec,
                                         policy=policy,
                                         baseline=baseline,
                                         optimizer=optimizer,
                                         optimizer_args=optimizer_args,
                                         **algo_args)
                # expert_trajectory=expert_trajectory[-1],
                # epochs_per_step=10,
                # scope=None,
                # max_path_length=max_path_length,
                # discount=discount,
                # gae_lambda=1,
                # center_adv=True,
                # positive_adv=False,
                # fixed_horizon=False,
                # pg_loss='surrogate_clip',
                # lr_clip_range=1.0,
                # max_kl_step=1.0,
                # policy_ent_coeff=0.0,
                # use_softplus_entropy=False,
                # use_neg_logli_entropy=False,
                # stop_entropy_gradient=False,
                # entropy_method='no_entropy',
                # name='PPO')

                local_runner.setup(algo=algo,
                                   env=env,
                                   sampler_cls=sampler_cls,
                                   sampler_args=sampler_args)

                results = local_runner.train(**runner_args)
                # pdb.set_trace()
                print('done')

                log_dir = run_experiment_args['log_dir']
                with open(log_dir + '/paths.gz', 'wb') as f:
                    try:
                        compress_pickle.dump(results,
                                             f,
                                             compression="gzip",
                                             set_default_extension=False)
                    except MemoryError:
                        # Fall back to saving each result separately if the
                        # combined results do not fit in memory.
                        print('1')
                        # pdb.set_trace()
                        for idx, result in enumerate(results):
                            with open(log_dir + '/path_' + str(idx) + '.gz',
                                      'wb') as ff:
                                try:
                                    compress_pickle.dump(
                                        result,
                                        ff,
                                        compression="gzip",
                                        set_default_extension=False)
                                except MemoryError:
                                    print('2')
def run_task(snapshot_config, *_):
    config = tf.ConfigProto(device_count={'GPU': 0})
    # config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        with tf.variable_scope('AST', reuse=tf.AUTO_REUSE):
            # Instantiate the example classes
            sim = ExampleAVSimulator(**sim_args)
            # blackbox_sim_state=True,
            # open_loop=False,
            # fixed_initial_state=True,
            # max_path_length=max_path_length)
            reward_function = ExampleAVReward(**reward_args)
            spaces = ExampleAVSpaces(**spaces_args)

            # Create the environment
            # env1 = GoExploreASTEnv(open_loop=False,
            #                        blackbox_sim_state=True,
            #                        fixed_init_state=True,
            #                        s_0=[-0.5, -4.0, 1.0, 11.17, -35.0],
            #                        simulator=sim,
            #                        reward_function=reward_function,
            #                        spaces=spaces)
            # env1 = gym.make('ast_toolbox:GoExploreAST-v1',
            #                 blackbox_sim_state=True,
            #                 open_loop=False,
            #                 fixed_init_state=True,
            #                 s_0=s_0,
            #                 simulator=sim,
            #                 reward_function=reward_function,
            #                 spaces=spaces)
            env1 = gym.make(id=env_args.pop('id'),
                            simulator=sim,
                            reward_function=reward_function,
                            spaces=spaces,
                            **env_args)
            env2 = normalize(env1)
            env = TfEnv(env2)

            # Instantiate the garage objects
            policy = GoExplorePolicy(env_spec=env.spec)

            baseline = LinearFeatureBaseline(env_spec=env.spec,
                                             **baseline_args)

            algo = GoExplore(env_spec=env.spec,
                             env=env,
                             policy=policy,
                             baseline=baseline,
                             **algo_args)
            # db_filename=db_filename,
            # max_db_size=max_db_size,
            # robust_policy=robust_policy,
            # robust_baseline=robust_baseline,
            # max_path_length=max_path_length,
            # discount=discount,
            # save_paths_gap=1,
            # save_paths_path=log_dir,
            # whole_paths=whole_paths)

            sampler_cls = BatchSampler
            # sampler_args = {'n_envs': n_parallel}
            sampler_args = {}

            with LocalTFRunner(snapshot_config=snapshot_config,
                               sess=sess) as local_runner:
                local_runner.setup(algo=algo,
                                   env=env,
                                   sampler_cls=sampler_cls,
                                   sampler_args=sampler_args)
                # local_runner.setup(
                #     algo=algo,
                #     env=env,
                #     sampler_cls=sampler_cls,
                #     sampler_args={"sim": sim,
                #                   "reward_function": reward_function})

                # Run the experiment
                best_cell = local_runner.train(**runner_args)
                # n_epochs=n_itr, batch_size=batch_size, plot=False

                log_dir = run_experiment_args['log_dir']
                db_filename = algo_args['db_filename']
                s_0 = env_args['s_0']

                # Re-open the Go-Explore cell pool and walk back from the best
                # cell to the root to recover the expert trajectory.
                pool_DB = db.DB()
                pool_DB.open(db_filename + '_pool.dat',
                             dbname=None,
                             dbtype=db.DB_HASH,
                             flags=db.DB_CREATE)
                d_pool = shelve.Shelf(pool_DB, protocol=pickle.HIGHEST_PROTOCOL)
                # pdb.set_trace()
                print(best_cell)
                temp = best_cell
                paths = []
                while temp.parent is not None:
                    print(temp.observation)
                    action = temp.observation[1:].astype(np.float32) / 1000
                    paths.append({'state': temp.state,
                                  'reward': temp.reward,
                                  'action': action,
                                  'observation': np.array(s_0)})
                    temp = d_pool[temp.parent]
                print(temp.observation)
                paths.append({'state': temp.state,
                              'reward': temp.reward,
                              'action': action,
                              'observation': np.array(s_0)})
                # pdb.set_trace()
                d_pool.close()

                with open(log_dir + '/expert_trajectory.p', 'wb') as f:
                    pickle.dump([paths], f)
                print('done!')