def main():
    if LooseVersion(torch.__version__) < LooseVersion("1.5.0"):
        raise Exception("This script requires a PyTorch version >= 1.5.0")

    parser = argparse.ArgumentParser()
    parser.add_argument('-w', '--weight_dir', type=str, default='', help='path to trained weights directory')
    parser.add_argument('-s', '--step_to_load', type=int, default=0, help='step checkpoint to load')
    parser.add_argument('--gpu', type=int, default=0, help='gpu id (-1 for cpu)')
    args = parser.parse_args()
    weight_dir = args.weight_dir
    step_to_load = args.step_to_load

    task_path = os.path.dirname(os.path.realpath(__file__))
    rsc_path = task_path + "/../rsc"

    save_path = os.path.join(
        weight_dir, 'testing_' + str(step_to_load),
        datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
    os.makedirs(save_path)

    # use the config that was saved next to the checkpoint
    for file in os.listdir(weight_dir):
        if file.startswith('cfg_sac'):
            cfg_abs_path = weight_dir + '/' + file

    # config
    cfg = YAML().load(open(cfg_abs_path, 'r'))
    cfg['environment']['num_envs'] = 1
    cfg['environment']['num_threads'] = 1
    cfg['environment']['control_dt'] = cfg['testing']['control_dt']
    cfg['environment']['render'] = cfg['testing']['render']

    impl = anymal_example_env(rsc_path, dump(cfg['environment'], Dumper=RoundTripDumper))
    env = VecEnvPython(impl)

    obs_space = env.observation_space
    action_space = env.action_space
    print("Observation space:", obs_space)
    print("Action space:", action_space)

    # seeding
    seed = cfg['environment']['seed']
    torch.manual_seed(seed)
    utils.set_random_seed(seed)  # Set a random seed used in PFRL

    obs_size = obs_space.low.size
    action_size = action_space.low.size

    def squashed_diagonal_gaussian_head(x):
        assert x.shape[-1] == action_size * 2
        mean, log_scale = torch.chunk(x, 2, dim=1)
        log_scale = torch.clamp(log_scale, -20.0, 2.0)
        var = torch.exp(log_scale * 2)
        base_distribution = distributions.Independent(
            distributions.Normal(loc=mean, scale=torch.sqrt(var)), 1)
        # cache_size=1 is required for numerical stability
        return distributions.transformed_distribution.TransformedDistribution(
            base_distribution,
            [distributions.transforms.TanhTransform(cache_size=1)])

    policy = nn.Sequential(
        nn.Linear(obs_size, 256),
        nn.ReLU(),
        nn.Linear(256, 256),
        nn.ReLU(),
        nn.Linear(256, action_size * 2),
        Lambda(squashed_diagonal_gaussian_head),
    )
    policy_optimizer = torch.optim.Adam(policy.parameters(),
                                        lr=cfg['algorithm']['learning_rate'])

    def make_q_func_with_optimizer():
        q_func = nn.Sequential(
            pfrl.nn.ConcatObsAndAction(),
            nn.Linear(obs_size + action_size, 256),
            nn.ReLU(),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Linear(256, 1),
        )
        torch.nn.init.xavier_uniform_(q_func[1].weight)
        torch.nn.init.xavier_uniform_(q_func[3].weight)
        torch.nn.init.xavier_uniform_(q_func[5].weight)
        q_func_optimizer = torch.optim.Adam(
            q_func.parameters(), lr=cfg['algorithm']['learning_rate'])
        return q_func, q_func_optimizer

    q_func1, q_func1_optimizer = make_q_func_with_optimizer()
    q_func2, q_func2_optimizer = make_q_func_with_optimizer()

    rbuf = replay_buffers.ReplayBuffer(cfg['algorithm']['replay_buffer_size'])

    def burnin_action_func():
        """Select random actions until model is updated one or more times."""
        return np.random.uniform(action_space.low, action_space.high).astype(np.float32)

    agent = pfrl.agents.SoftActorCritic(
        policy,
        q_func1,
        q_func2,
        policy_optimizer,
        q_func1_optimizer,
        q_func2_optimizer,
        rbuf,
        gamma=cfg['algorithm']['discount_factor'],
        replay_start_size=cfg['algorithm']['replay_start_size'],
        gpu=args.gpu,
        minibatch_size=cfg['algorithm']['minibatch_size'],
        burnin_action_func=burnin_action_func,
        entropy_target=-action_size,
        temperature_optimizer_lr=cfg['algorithm']['temperature_optimizer_lr'],
    )

    agent.load(weight_dir + '/' + str(step_to_load) + '_checkpoint')

    if cfg['testing']['render']:
        env.wrapper.showWindow()
    if cfg['testing']['record_video']:
        env.start_recording_video(save_path + '/test.mp4')

    test_steps = int(cfg['testing']['seconds'] / cfg['testing']['control_dt'])

    torch.manual_seed(cfg['environment']['seed'])

    # dummy zero action for the warm-up step before the first reset
    act = np.zeros(shape=(1, env.wrapper.getActionDim()), dtype=np.float32)
    _, _, _, new_info = env.step(act, visualize=cfg['testing']['render'])
    ob = env.reset()

    try:
        for i in range(test_steps):
            if i % 100 == 0:
                env.reset()
            with agent.eval_mode():
                agent.act_deterministically = True
                act = agent.batch_act(ob)
                ob, rew, done, info = env.step(act, visualize=cfg['testing']['render'])
    except KeyboardInterrupt:
        pass
    finally:
        if cfg['testing']['record_video']:
            env.stop_recording_video()
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 32)")
    parser.add_argument('--cfg_name', type=str, default='/cfg_trpo.yaml', help='configuration file')
    parser.add_argument('--gpu', type=int, default=0, help='gpu id (-1 for cpu)')
    args = parser.parse_args()
    cfg_name = args.cfg_name
    device = args.gpu if args.gpu >= 0 else 'cpu'  # -1 selects the cpu

    task_path = os.path.dirname(os.path.realpath(__file__))
    rsc_path = task_path + "/../rsc"
    env_path = task_path + "/.."
    cfg_abs_path = task_path + "/.." + cfg_name
    log_dir = os.path.join(task_path, 'runs/rsl_trpo')

    save_items = [env_path + '/Environment.hpp', cfg_abs_path, os.path.realpath(__file__)]
    cfg_saver = ConfigurationSaver(log_dir, save_items, args)

    # config
    cfg = YAML().load(open(cfg_abs_path, 'r'))

    impl = anymal_example_env(rsc_path, dump(cfg['environment'], Dumper=RoundTripDumper))
    env = VecEnvPython(impl)

    n_steps = math.floor(cfg['environment']['max_time'] / cfg['environment']['control_dt'])
    total_steps_per_episode = n_steps * cfg['environment']['num_envs']

    torch.manual_seed(args.seed)

    actor_net = rslgym_module.MLP([256, 128],
                                  nn.Tanh,
                                  env.observation_space.shape[0],
                                  env.action_space.shape[0],
                                  init_scale=1.4)
    critic_net = rslgym_module.MLP([256, 128],
                                   nn.Tanh,
                                   env.observation_space.shape[0],
                                   1,
                                   init_scale=1.4)

    actor = rslgym_module.Actor(actor_net,
                                rslgym_module.MultivariateGaussianDiagonalCovariance(env.action_space.shape[0], 1.0),
                                env.observation_space.shape[0],
                                env.action_space.shape[0],
                                device)
    critic = rslgym_module.Critic(critic_net,
                                  env.observation_space.shape[0],
                                  device)

    agent = TRPO(
        actor=actor,
        critic=critic,
        num_envs=cfg['environment']['num_envs'],
        num_transitions_per_env=n_steps,
        critic_learning_epochs=cfg['algorithm']['critic_learning']['epochs'],
        critic_learning_rate=cfg['algorithm']['critic_learning']['learning_rate'],
        critic_mini_batches=cfg['algorithm']['critic_learning']['num_mini_batches'],
        max_d_kl=cfg['algorithm']['max_kld'],
        gamma=cfg['algorithm']['discount_factor'],
        lam=cfg['algorithm']['gae_lam'],
        entropy_coef=cfg['algorithm']['entropy_coef'],
        device=device,
        log_dir=cfg_saver.data_dir,
        mini_batch_sampling="in_order"
    )

    avg_rewards = []

    for update in range(cfg['algorithm']['total_algorithm_updates']):
        start = time.time()
        obs = env.reset()
        reward_ll_sum = 0
        ep_len = np.zeros(shape=env.num_envs)
        ep_len_collected = []

        # visualize, record and checkpoint every eval_every_n updates
        if update % cfg['environment']['eval_every_n'] == 0:
            env.show_window()
            if cfg['environment']['record_video']:
                env.start_recording_video(cfg_saver.data_dir + "/" + str(update) + ".mp4")
            for step in range(n_steps):
                action_ll, _ = actor.sample(torch.from_numpy(obs).to(agent.device))
                obs, reward_ll, dones, info = env.step(action_ll.cpu().detach().numpy(), True)
            agent.save_training(cfg_saver.data_dir, update, update)
            obs = env.reset()
            if cfg['environment']['record_video']:
                env.stop_recording_video()
            env.hide_window()

        # collect transitions
        for step in range(n_steps):
            actor_obs = obs
            critic_obs = obs
            action = agent.observe(actor_obs)
            obs, reward, dones, info = env.step(action, False)
            agent.step(value_obs=critic_obs, rews=reward, dones=dones, infos=[])
            reward_ll_sum = reward_ll_sum + sum(reward)
            ep_len += 1
            if any(dones):
                ep_len_collected += list(ep_len[dones])
                ep_len[dones] = 0
            if step == n_steps - 1:
                for length in list(ep_len):
                    if length == n_steps:
                        ep_len_collected.append(length)

        agent.update(actor_obs=obs,
                     value_obs=obs,
                     log_this_iteration=update % 10 == 0,
                     update=update)
        end = time.time()

        # keep exploration noise from collapsing (12 = ANYmal action dimension)
        actor.distribution.enforce_minimum_std((torch.ones(12) * 0.2).to(device))

        average_ll_performance = reward_ll_sum / total_steps_per_episode
        avg_rewards.append(average_ll_performance)

        if len(ep_len_collected) > 0:
            # approximate statistic: episodes still running at the end of the iteration are ignored
            avg_ep_leng = sum(ep_len_collected) / len(ep_len_collected)
        else:
            avg_ep_leng = n_steps

        agent.writer.add_scalar('Policy/average_reward', average_ll_performance, update)
        agent.writer.add_scalar('Training/elapsed_time_episode', end - start, update)
        agent.writer.add_scalar('Training/fps', total_steps_per_episode / (end - start), update)
        agent.writer.add_scalar('Policy/avg_ep_len', avg_ep_leng, update)

        print('----------------------------------------------------')
        print('{:>6}th iteration'.format(update))
        print('{:<40} {:>6}'.format("average ll reward: ", '{:0.10f}'.format(average_ll_performance)))
        print('{:<40} {:>6}'.format("avg_ep_len: ", '{:0.6f}'.format(avg_ep_leng)))
        print('{:<40} {:>6}'.format("time elapsed in this iteration: ", '{:6.4f}'.format(end - start)))
        print('{:<40} {:>6}'.format("fps: ", '{:6.0f}'.format(total_steps_per_episode / (end - start))))
        print('{:<40} {:>6}'.format("std: ", '{}'.format(actor.distribution.log_std.exp())))
        print('----------------------------------------------------\n')
def main():
    if LooseVersion(torch.__version__) < LooseVersion("1.5.0"):
        raise Exception("This script requires a PyTorch version >= 1.5.0")

    # config file arg
    parser = argparse.ArgumentParser()
    parser.add_argument('--cfg_name', type=str, default='cfg_sac.yaml', help='configuration file')
    parser.add_argument("--demo", action="store_true", help="Just run evaluation, not training.")
    parser.add_argument("--demo-record", action="store_true", help="Save video of demo.")
    parser.add_argument("--load", type=str, default="", help="Directory to load agent from.")
    parser.add_argument(
        "--log-interval",
        type=int,
        default=1000,
        help="Interval in timesteps between outputting log messages during training",
    )
    parser.add_argument(
        "--eval-interval",
        type=int,
        default=5000,
        help="Interval in timesteps between evaluations.",
    )
    parser.add_argument(
        "--checkpoint-interval",
        type=int,
        default=5000,
        help="Interval in timesteps between saving checkpoint",
    )
    parser.add_argument(
        "--eval-n-runs",
        type=int,
        default=10,
        help="Number of episodes run for each evaluation.",
    )
    parser.add_argument('--gpu', type=int, default=0, help='gpu id (-1 for cpu)')
    args = parser.parse_args()
    cfg_name = args.cfg_name

    # folder config & logdir
    task_path = os.path.dirname(os.path.realpath(__file__))
    rsc_path = task_path + "/../rsc"
    env_path = task_path + "/.."
    cfg_abs_path = task_path + "/../" + cfg_name
    log_dir = os.path.join(task_path, 'runs/pfrl_sac')

    save_items = [env_path + '/Environment.hpp', cfg_abs_path, __file__]
    if not args.demo:
        cfg_saver = ConfigurationSaver(log_dir, save_items, args)

    # environment
    cfg = YAML().load(open(cfg_abs_path, 'r'))
    impl = anymal_example_env(rsc_path, dump(cfg['environment'], Dumper=RoundTripDumper))
    env = VecEnvPython(impl)

    steps_per_episode = math.floor(cfg['environment']['max_time'] / cfg['environment']['control_dt'])
    total_steps_per_iteration = steps_per_episode * cfg['environment']['num_envs']
    total_training_steps = cfg['algorithm']['total_algorithm_updates'] * total_steps_per_iteration

    obs_space = env.observation_space
    action_space = env.action_space
    print("Observation space:", obs_space)
    print("Action space:", action_space)

    # seeding
    seed = cfg['environment']['seed']
    torch.manual_seed(seed)
    utils.set_random_seed(seed)  # Set a random seed used in PFRL

    obs_size = obs_space.low.size
    action_size = action_space.low.size

    def squashed_diagonal_gaussian_head(x):
        assert x.shape[-1] == action_size * 2
        mean, log_scale = torch.chunk(x, 2, dim=1)
        log_scale = torch.clamp(log_scale, -20.0, 2.0)
        var = torch.exp(log_scale * 2)
        base_distribution = distributions.Independent(
            distributions.Normal(loc=mean, scale=torch.sqrt(var)), 1)
        # cache_size=1 is required for numerical stability
        return distributions.transformed_distribution.TransformedDistribution(
            base_distribution,
            [distributions.transforms.TanhTransform(cache_size=1)])

    policy = nn.Sequential(
        nn.Linear(obs_size, 256),
        nn.ReLU(),
        nn.Linear(256, 256),
        nn.ReLU(),
        nn.Linear(256, action_size * 2),
        Lambda(squashed_diagonal_gaussian_head),
    )
    torch.nn.init.xavier_uniform_(policy[0].weight)
    torch.nn.init.xavier_uniform_(policy[2].weight)
    torch.nn.init.xavier_uniform_(policy[4].weight, gain=1.0)
    policy_optimizer = torch.optim.Adam(policy.parameters(),
                                        lr=cfg['algorithm']['learning_rate'])

    def make_q_func_with_optimizer():
        q_func = nn.Sequential(
            pfrl.nn.ConcatObsAndAction(),
            nn.Linear(obs_size + action_size, 256),
            nn.ReLU(),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Linear(256, 1),
        )
        torch.nn.init.xavier_uniform_(q_func[1].weight)
        torch.nn.init.xavier_uniform_(q_func[3].weight)
        torch.nn.init.xavier_uniform_(q_func[5].weight)
        q_func_optimizer = torch.optim.Adam(
            q_func.parameters(), lr=cfg['algorithm']['learning_rate'])
        return q_func, q_func_optimizer

    q_func1, q_func1_optimizer = make_q_func_with_optimizer()
    q_func2, q_func2_optimizer = make_q_func_with_optimizer()

    rbuf = replay_buffers.ReplayBuffer(cfg['algorithm']['replay_buffer_size'])

    def burnin_action_func():
        """Select random actions until model is updated one or more times."""
        return np.random.uniform(action_space.low, action_space.high).astype(np.float32)

    # Hyperparameters in http://arxiv.org/abs/1802.09477
    agent = pfrl.agents.SoftActorCritic(
        policy,
        q_func1,
        q_func2,
        policy_optimizer,
        q_func1_optimizer,
        q_func2_optimizer,
        rbuf,
        gamma=cfg['algorithm']['discount_factor'],
        replay_start_size=cfg['algorithm']['replay_start_size'],
        gpu=args.gpu,
        minibatch_size=cfg['algorithm']['minibatch_size'],
        burnin_action_func=burnin_action_func,
        entropy_target=-action_size,
        temperature_optimizer_lr=cfg['algorithm']['temperature_optimizer_lr'],
    )

    # logger settings
    logging.basicConfig(level=logging.INFO, stream=sys.stdout, format='')
    logger = logging.getLogger(__name__)

    if len(args.load) > 0:
        agent.load(args.load)

    if args.demo:
        if cfg['environment']['render']:
            env.show_window()
            if args.demo_record:
                env.start_recording_video(args.load + "/../demo_" + os.path.basename(args.load) + ".mp4")
        eval_stats = eval_performance_pfrl(
            env=env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=steps_per_episode,
            visualize=cfg['environment']['render'],
        )
        if cfg['environment']['render']:
            if args.demo_record:
                env.stop_recording_video()
            env.hide_window()
        print("n_runs: {} mean: {} median: {} stdev {}".format(
            args.eval_n_runs,
            eval_stats["mean"],
            eval_stats["median"],
            eval_stats["stdev"],
        ))
    else:
        train_agent_batch_with_evaluation_pfrl(
            agent=agent,
            env=env,
            outdir=cfg_saver.data_dir,
            steps=total_training_steps,
            eval_n_steps=steps_per_episode,
            eval_n_episodes=None,
            eval_interval=args.eval_interval,
            log_interval=args.log_interval,
            max_episode_len=steps_per_episode,
            visualize=cfg['environment']['render'],
            use_tensorboard=True,
            checkpoint_freq=args.checkpoint_interval,
            logger=logger)
#!/usr/bin/env python3
import os
import numpy as np
import ruamel.yaml

from rslgym.wrapper import VecEnvPython                 # python wrapper interface
from rslgym_wrapper_anymal import anymal_example_env    # your compiled environment

task_path = os.path.dirname(os.path.realpath(__file__))
rsc_path = task_path + "/../rsc"
cfg_abs_path = task_path + "/../cfg_ppo.yaml"

cfg = ruamel.yaml.YAML().load(open(cfg_abs_path, 'r'))
dumped_cfg = ruamel.yaml.dump(cfg['environment'], Dumper=ruamel.yaml.RoundTripDumper)

env = VecEnvPython(anymal_example_env(rsc_path, dumped_cfg))

print('action_space ', env.action_space)
print('obs_space ', env.observation_space)
print('num_envs ', env.num_envs)

render = cfg['environment']['render']
if render:
    env.show_window()

obs = env.reset()
info = env.get_info()

# environment loop: apply small random actions
for step in range(10000):
    # action = np.zeros((env.num_envs, env.action_space.shape[0])).astype(np.float32)
    action = np.random.randn(env.num_envs, env.action_space.shape[0]).astype(np.float32) * 0.1
    obs, reward, dones, info = env.step(action, visualize=render)
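# Optional cleanup sketch once the random-action loop is done: hide the
# visualization window again, using the same VecEnvPython hide_window() call
# that the training scripts in this section use after their evaluation rollouts.
if render:
    env.hide_window()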
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 32)")
    parser.add_argument('-w', '--weight_dir', type=str, default='', help='path to trained weights directory')
    parser.add_argument('-i', '--iteration', type=int, default=0, help='algorithm iteration to load')
    parser.add_argument('-s', '--seconds', type=int, default=10, help='testing duration in seconds')
    args = parser.parse_args()
    weight_dir = args.weight_dir
    iteration = args.iteration

    task_path = os.path.dirname(os.path.realpath(__file__))
    rsc_path = task_path + "/../rsc"
    env_path = task_path + "/.."

    save_path = os.path.join(
        weight_dir, 'testing_' + str(iteration),
        datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
    os.makedirs(save_path)

    # use the config that was saved next to the snapshot
    for file in os.listdir(weight_dir):
        if file.startswith('cfg_ppo'):
            cfg_abs_path = weight_dir + '/' + file

    # config
    cfg = YAML().load(open(cfg_abs_path, 'r'))
    cfg['environment']['num_envs'] = 1

    impl = anymal_example_env(rsc_path, dump(cfg['environment'], Dumper=RoundTripDumper))
    env = VecEnvPython(impl)

    obs_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]

    actor_net = nn.Sequential(
        rslgym_module.EmpiricalNormalization([obs_size]),
        rslgym_module.MLP([256, 128], nn.Tanh, obs_size, action_size, init_scale=1.4))

    actor = rslgym_module.Actor(
        actor_net,
        rslgym_module.MultivariateGaussianDiagonalCovariance(env.action_space.shape[0], 1.0),
        env.observation_space.shape[0],
        env.action_space.shape[0],
        'cpu')

    snapshot = torch.load(weight_dir + '/snapshot' + str(iteration) + '.pt')
    actor.load_state_dict(snapshot['actor_state_dict'])

    if cfg['environment']['render']:
        env.wrapper.showWindow()
    if cfg['environment']['record_video']:
        env.start_recording_video(save_path + '/test.mp4')

    test_steps = int(args.seconds / cfg['environment']['control_dt'])

    torch.manual_seed(args.seed)

    # dummy zero action for the warm-up step before the first reset
    act = np.zeros(shape=(1, env.wrapper.getActionDim()), dtype=np.float32)
    _, _, _, new_info = env.step(act, visualize=cfg['environment']['render'])
    ob = env.reset()

    try:
        for i in range(test_steps):
            if i % 100 == 0:
                env.reset()
            act = actor.noiseless_action(ob).cpu().detach().numpy()
            ob, rew, done, info = env.step(act, visualize=cfg['environment']['render'])
    except KeyboardInterrupt:
        pass
    finally:
        if cfg['environment']['record_video']:
            env.stop_recording_video()