Example #1
def main():
    if LooseVersion(torch.__version__) < LooseVersion("1.5.0"):
        raise Exception("This script requires a PyTorch version >= 1.5.0")

    parser = argparse.ArgumentParser()
    parser.add_argument('-w',
                        '--weight_dir',
                        type=str,
                        default='',
                        help='path to trained weights')
    parser.add_argument('-s',
                        '--step_to_load',
                        type=int,
                        default=0,
                        help='step checkpoint to load')
    parser.add_argument('--gpu',
                        type=int,
                        default=0,
                        help='gpu id (-1 for cpu)')
    args = parser.parse_args()
    weight_dir = args.weight_dir
    step_to_load = args.step_to_load

    task_path = os.path.dirname(os.path.realpath(__file__))
    rsc_path = task_path + "/../rsc"
    save_path = os.path.join(
        weight_dir, 'testing_' + str(step_to_load),
        datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
    os.makedirs(save_path)

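    # The training run copies its config next to the checkpoints; pick up that
    # saved cfg_sac*.yaml so testing uses the same settings as training.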
    for file in os.listdir(weight_dir):
        if file.startswith('cfg_sac'):
            cfg_abs_path = weight_dir + '/' + file

    # config
    cfg = YAML().load(open(cfg_abs_path, 'r'))
    cfg['environment']['num_envs'] = 1
    cfg['environment']['num_threads'] = 1
    cfg['environment']['control_dt'] = cfg['testing']['control_dt']
    cfg['environment']['render'] = cfg['testing']['render']

    impl = anymal_example_env(rsc_path,
                              dump(cfg['environment'], Dumper=RoundTripDumper))
    env = VecEnvPython(impl)

    obs_space = env.observation_space
    action_space = env.action_space
    print("Observation space:", obs_space)
    print("Action space:", action_space)

    # seeding
    seed = cfg['environment']['seed']
    torch.manual_seed(seed)
    utils.set_random_seed(seed)  # Set a random seed used in PFRL

    obs_size = obs_space.low.size
    action_size = action_space.low.size

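    # SAC policy head: a diagonal Gaussian whose samples are squashed through
    # tanh, so actions are bounded in (-1, 1).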
    def squashed_diagonal_gaussian_head(x):
        assert x.shape[-1] == action_size * 2
        mean, log_scale = torch.chunk(x, 2, dim=1)
        log_scale = torch.clamp(log_scale, -20.0, 2.0)
        var = torch.exp(log_scale * 2)
        base_distribution = distributions.Independent(
            distributions.Normal(loc=mean, scale=torch.sqrt(var)), 1)
        # cache_size=1 is required for numerical stability
        return distributions.transformed_distribution.TransformedDistribution(
            base_distribution,
            [distributions.transforms.TanhTransform(cache_size=1)])

    policy = nn.Sequential(
        nn.Linear(obs_size, 256),
        nn.ReLU(),
        nn.Linear(256, 256),
        nn.ReLU(),
        nn.Linear(256, action_size * 2),
        Lambda(squashed_diagonal_gaussian_head),
    )

    policy_optimizer = torch.optim.Adam(policy.parameters(),
                                        lr=cfg['algorithm']['learning_rate'])

    def make_q_func_with_optimizer():
        q_func = nn.Sequential(
            pfrl.nn.ConcatObsAndAction(),
            nn.Linear(obs_size + action_size, 256),
            nn.ReLU(),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Linear(256, 1),
        )
        torch.nn.init.xavier_uniform_(q_func[1].weight)
        torch.nn.init.xavier_uniform_(q_func[3].weight)
        torch.nn.init.xavier_uniform_(q_func[5].weight)
        q_func_optimizer = torch.optim.Adam(
            q_func.parameters(), lr=cfg['algorithm']['learning_rate'])
        return q_func, q_func_optimizer

    q_func1, q_func1_optimizer = make_q_func_with_optimizer()
    q_func2, q_func2_optimizer = make_q_func_with_optimizer()

    rbuf = replay_buffers.ReplayBuffer(cfg['algorithm']['replay_buffer_size'])

    def burnin_action_func():
        """Select random actions until model is updated one or more times."""
        return np.random.uniform(action_space.low,
                                 action_space.high).astype(np.float32)

    agent = pfrl.agents.SoftActorCritic(
        policy,
        q_func1,
        q_func2,
        policy_optimizer,
        q_func1_optimizer,
        q_func2_optimizer,
        rbuf,
        gamma=cfg['algorithm']['discount_factor'],
        replay_start_size=cfg['algorithm']['replay_start_size'],
        gpu=args.gpu,
        minibatch_size=cfg['algorithm']['minibatch_size'],
        burnin_action_func=burnin_action_func,
        entropy_target=-action_size,
        temperature_optimizer_lr=cfg['algorithm']['temperature_optimizer_lr'],
    )

    agent.load(weight_dir + '/' + str(step_to_load) + '_checkpoint')

    if cfg['testing']['render']:
        env.wrapper.showWindow()

    if cfg['testing']['record_video']:
        env.start_recording_video(save_path + '/test.mp4')

    test_steps = int(cfg['testing']['seconds'] / cfg['testing']['control_dt'])

    torch.manual_seed(cfg['environment']['seed'])

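    # One throwaway step with a zero action before the first reset, presumably so
    # the wrapper initializes its internal buffers; the returned values are unused.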
    act = np.zeros(shape=(1, env.wrapper.getActionDim()), dtype=np.float32)
    _, _, _, new_info = env.step(act, visualize=cfg['testing']['render'])

    ob = env.reset()
    try:
        for i in range(test_steps):
            if i % 100 == 0:
                ob = env.reset()
            with agent.eval_mode():
                agent.act_deterministically = True
                act = agent.batch_act(ob)

            ob, rew, done, info = env.step(act,
                                           visualize=cfg['testing']['render'])

    except KeyboardInterrupt:
        pass

    finally:
        if cfg['testing']['record_video']:
            env.stop_recording_video()
Example #2
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 32)")
    parser.add_argument('--cfg_name', type=str, default='/cfg_trpo.yaml', help='configuration file')
    parser.add_argument('--gpu', type=int, default=0, help='gpu id (-1 for cpu)')
    args = parser.parse_args()
    cfg_name = args.cfg_name

    device = args.gpu if args.gpu >= 0 else 'cpu'  # -1 falls back to cpu, otherwise the gpu id is used

    task_path = os.path.dirname(os.path.realpath(__file__))
    rsc_path = task_path + "/../rsc"
    env_path = task_path + "/.."
    cfg_abs_path = task_path + "/.." + cfg_name
    log_dir = os.path.join(task_path, 'runs/rsl_trpo')

    save_items = [env_path+'/Environment.hpp',
                  cfg_abs_path,
                  os.path.realpath(__file__)]

    cfg_saver = ConfigurationSaver(log_dir, save_items, args)

    # config
    cfg = YAML().load(open(cfg_abs_path, 'r'))
    impl = anymal_example_env(rsc_path, dump(cfg['environment'], Dumper=RoundTripDumper))
    env = VecEnvPython(impl)
    n_steps = math.floor(cfg['environment']['max_time'] / cfg['environment']['control_dt'])

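    # Each update collects n_steps transitions from every parallel environment.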
    total_steps_per_episode = n_steps * cfg['environment']['num_envs']

    torch.manual_seed(args.seed)

    actor_net = rslgym_module.MLP([256, 128],
                                  nn.Tanh,
                                  env.observation_space.shape[0],
                                  env.action_space.shape[0],
                                  init_scale=1.4)

    critic_net = rslgym_module.MLP([256, 128],
                                   nn.Tanh,
                                   env.observation_space.shape[0],
                                   1,
                                   init_scale=1.4)

    actor = rslgym_module.Actor(actor_net,
                  rslgym_module.MultivariateGaussianDiagonalCovariance(env.action_space.shape[0], 1.0),
                  env.observation_space.shape[0], env.action_space.shape[0],
                  device)

    critic = rslgym_module.Critic(critic_net,
                    env.observation_space.shape[0],
                    device)

    agent = TRPO(
        actor=actor,
        critic=critic,
        num_envs=cfg['environment']['num_envs'],
        num_transitions_per_env=n_steps,
        critic_learning_epochs=cfg['algorithm']['critic_learning']['epochs'],
        critic_learning_rate=cfg['algorithm']['critic_learning']['learning_rate'],
        critic_mini_batches=cfg['algorithm']['critic_learning']['num_mini_batches'],
        max_d_kl=cfg['algorithm']['max_kld'],
        gamma=cfg['algorithm']['discount_factor'],
        lam=cfg['algorithm']['gae_lam'],
        entropy_coef=cfg['algorithm']['entropy_coef'],
        device=device,
        log_dir=cfg_saver.data_dir,
        mini_batch_sampling="in_order"
    )

    avg_rewards = []
    for update in range(cfg['algorithm']['total_algorithm_updates']):

        start = time.time()
        obs = env.reset()

        reward_ll_sum = 0
        ep_len = np.zeros(shape=env.num_envs)
        ep_len_collected = []

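        # Periodic evaluation: render (and optionally record) a rollout with the
        # current policy, then save a training checkpoint.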
        if update % cfg['environment']['eval_every_n'] == 0:
            env.show_window()
            if cfg['environment']['record_video']:
                env.start_recording_video(cfg_saver.data_dir + "/" + str(update) + ".mp4")
            for step in range(n_steps):
                action_ll, _ = actor.sample(torch.from_numpy(obs).to(agent.device))
                obs, reward_ll, dones, info = env.step(action_ll.cpu().detach().numpy(), True)

            agent.save_training(cfg_saver.data_dir, update, update)
            obs = env.reset()
            if cfg['environment']['record_video']:
                env.stop_recording_video()
            env.hide_window()

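        # Collect one on-policy rollout; transitions are buffered inside the agent
        # and consumed by agent.update() below.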
        for step in range(n_steps):
            actor_obs = obs
            critic_obs = obs
            action = agent.observe(actor_obs)
            obs, reward, dones, info = env.step(action, False)
            agent.step(value_obs=critic_obs, rews=reward, dones=dones, infos=[])
            reward_ll_sum = reward_ll_sum + sum(reward)

            ep_len += 1
            if any(dones):
                ep_len_collected += list(ep_len[dones])
                ep_len[dones] = 0
            if step == n_steps - 1:
                for length in list(ep_len):
                    if length == n_steps:
                        ep_len_collected.append(length)

        agent.update(actor_obs=obs,
                     value_obs=obs,
                     log_this_iteration=update % 10 == 0,
                     update=update)
        end = time.time()
        # enforce a minimum policy std so exploration does not collapse
        actor.distribution.enforce_minimum_std((torch.ones(env.action_space.shape[0]) * 0.2).to(device))

        average_ll_performance = reward_ll_sum / total_steps_per_episode
        avg_rewards.append(average_ll_performance)
        if len(ep_len_collected) > 0:
            # mean over episodes that finished (or ran the full n_steps) this iteration
            avg_ep_leng = sum(ep_len_collected) / len(ep_len_collected)
        else:
            avg_ep_leng = n_steps

        agent.writer.add_scalar('Policy/average_reward', average_ll_performance, update)
        agent.writer.add_scalar('Training/elapsed_time_episode', end - start, update)
        agent.writer.add_scalar('Training/fps', total_steps_per_episode / (end - start), update)
        agent.writer.add_scalar('Policy/avg_ep_len', avg_ep_leng, update)

        print('----------------------------------------------------')
        print('{:>6}th iteration'.format(update))
        print('{:<40} {:>6}'.format("average ll reward: ", '{:0.10f}'.format(average_ll_performance)))
        print('{:<40} {:>6}'.format("avg_ep_len: ", '{:0.6f}'.format(avg_ep_leng)))
        print('{:<40} {:>6}'.format("time elapsed in this iteration: ", '{:6.4f}'.format(end - start)))
        print('{:<40} {:>6}'.format("fps: ", '{:6.0f}'.format(total_steps_per_episode / (end - start))))
        print('{:<40} {:>6}'.format("std: ", '{}'.format(actor.distribution.log_std.exp())))
        print('----------------------------------------------------\n')
Example #3
def main():
    if LooseVersion(torch.__version__) < LooseVersion("1.5.0"):
        raise Exception("This script requires a PyTorch version >= 1.5.0")

    # config file arg
    parser = argparse.ArgumentParser()
    parser.add_argument('--cfg_name',
                        type=str,
                        default='cfg_sac.yaml',
                        help='configuration file')
    parser.add_argument("--demo",
                        action="store_true",
                        help="Just run evaluation, not training.")
    parser.add_argument("--demo-record",
                        action="store_true",
                        help="Save video of demo.")
    parser.add_argument("--load",
                        type=str,
                        default="",
                        help="Directory to load agent from.")
    parser.add_argument(
        "--log-interval",
        type=int,
        default=1000,
        help=
        "Interval in timesteps between outputting log messages during training",
    )
    parser.add_argument(
        "--eval-interval",
        type=int,
        default=5000,
        help="Interval in timesteps between evaluations.",
    )
    parser.add_argument(
        "--checkpoint-interval",
        type=int,
        default=5000,
        help="Interval in timesteps between saving checkpoint",
    )
    parser.add_argument(
        "--eval-n-runs",
        type=int,
        default=10,
        help="Number of episodes run for each evaluation.",
    )
    parser.add_argument('--gpu',
                        type=int,
                        default=0,
                        help='gpu id (-1 for cpu)')
    args = parser.parse_args()
    cfg_name = args.cfg_name

    # folder config & logdir
    task_path = os.path.dirname(os.path.realpath(__file__))
    rsc_path = task_path + "/../rsc"
    env_path = task_path + "/.."
    cfg_abs_path = task_path + "/../" + cfg_name
    log_dir = os.path.join(task_path, 'runs/pfrl_sac')

    save_items = [env_path + '/Environment.hpp', cfg_abs_path, __file__]
    if not args.demo:
        cfg_saver = ConfigurationSaver(log_dir, save_items, args)

    # environment
    cfg = YAML().load(open(cfg_abs_path, 'r'))
    impl = anymal_example_env(rsc_path,
                              dump(cfg['environment'], Dumper=RoundTripDumper))
    env = VecEnvPython(impl)
    steps_per_episode = math.floor(cfg['environment']['max_time'] /
                                   cfg['environment']['control_dt'])
    total_steps_per_iteration = steps_per_episode * cfg['environment'][
        'num_envs']

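    # Total training steps = total_algorithm_updates * steps_per_episode * num_envs.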
    total_training_steps = cfg['algorithm'][
        'total_algorithm_updates'] * total_steps_per_iteration

    obs_space = env.observation_space
    action_space = env.action_space
    print("Observation space:", obs_space)
    print("Action space:", action_space)

    # seeding
    seed = cfg['environment']['seed']
    torch.manual_seed(seed)
    utils.set_random_seed(seed)  # Set a random seed used in PFRL

    obs_size = obs_space.low.size
    action_size = action_space.low.size

    def squashed_diagonal_gaussian_head(x):
        assert x.shape[-1] == action_size * 2
        mean, log_scale = torch.chunk(x, 2, dim=1)
        log_scale = torch.clamp(log_scale, -20.0, 2.0)
        var = torch.exp(log_scale * 2)
        base_distribution = distributions.Independent(
            distributions.Normal(loc=mean, scale=torch.sqrt(var)), 1)
        # cache_size=1 is required for numerical stability
        return distributions.transformed_distribution.TransformedDistribution(
            base_distribution,
            [distributions.transforms.TanhTransform(cache_size=1)])

    policy = nn.Sequential(
        nn.Linear(obs_size, 256),
        nn.ReLU(),
        nn.Linear(256, 256),
        nn.ReLU(),
        nn.Linear(256, action_size * 2),
        Lambda(squashed_diagonal_gaussian_head),
    )
    torch.nn.init.xavier_uniform_(policy[0].weight)
    torch.nn.init.xavier_uniform_(policy[2].weight)
    torch.nn.init.xavier_uniform_(policy[4].weight, gain=1.0)
    policy_optimizer = torch.optim.Adam(policy.parameters(),
                                        lr=cfg['algorithm']['learning_rate'])

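    # SAC uses two Q-functions (clipped double-Q); each gets its own Adam optimizer.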
    def make_q_func_with_optimizer():
        q_func = nn.Sequential(
            pfrl.nn.ConcatObsAndAction(),
            nn.Linear(obs_size + action_size, 256),
            nn.ReLU(),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Linear(256, 1),
        )
        torch.nn.init.xavier_uniform_(q_func[1].weight)
        torch.nn.init.xavier_uniform_(q_func[3].weight)
        torch.nn.init.xavier_uniform_(q_func[5].weight)
        q_func_optimizer = torch.optim.Adam(
            q_func.parameters(), lr=cfg['algorithm']['learning_rate'])
        return q_func, q_func_optimizer

    q_func1, q_func1_optimizer = make_q_func_with_optimizer()
    q_func2, q_func2_optimizer = make_q_func_with_optimizer()

    rbuf = replay_buffers.ReplayBuffer(cfg['algorithm']['replay_buffer_size'])

    def burnin_action_func():
        """Select random actions until model is updated one or more times."""
        return np.random.uniform(action_space.low,
                                 action_space.high).astype(np.float32)

    # Hyperparameters follow the SAC paper: https://arxiv.org/abs/1812.05905
    agent = pfrl.agents.SoftActorCritic(
        policy,
        q_func1,
        q_func2,
        policy_optimizer,
        q_func1_optimizer,
        q_func2_optimizer,
        rbuf,
        gamma=cfg['algorithm']['discount_factor'],
        replay_start_size=cfg['algorithm']['replay_start_size'],
        gpu=args.gpu,
        minibatch_size=cfg['algorithm']['minibatch_size'],
        burnin_action_func=burnin_action_func,
        entropy_target=-action_size,
        temperature_optimizer_lr=cfg['algorithm']['temperature_optimizer_lr'],
    )

    # logger settings
    logging.basicConfig(level=logging.INFO, stream=sys.stdout, format='')
    logger = logging.getLogger(__name__)

    if len(args.load) > 0:
        agent.load(args.load)

    if args.demo:
        if cfg['environment']['render']:
            env.show_window()
            if args.demo_record:
                env.start_recording_video(args.load + "/../demo_" +
                                          os.path.basename(args.load) + ".mp4")
        eval_stats = eval_performance_pfrl(
            env=env,
            agent=agent,
            n_steps=None,
            n_episodes=args.eval_n_runs,
            max_episode_len=steps_per_episode,
            visualize=cfg['environment']['render'],
        )
        if cfg['environment']['render']:
            if args.demo_record:
                env.stop_recording_video()
            env.hide_window()
        print("n_runs: {} mean: {} median: {} stdev {}".format(
            args.eval_n_runs,
            eval_stats["mean"],
            eval_stats["median"],
            eval_stats["stdev"],
        ))
    else:
        train_agent_batch_with_evaluation_pfrl(
            agent=agent,
            env=env,
            outdir=cfg_saver.data_dir,
            steps=total_training_steps,
            eval_n_steps=steps_per_episode,
            eval_n_episodes=None,
            eval_interval=args.eval_interval,
            log_interval=args.log_interval,
            max_episode_len=steps_per_episode,
            visualize=cfg['environment']['render'],
            use_tensorboard=True,
            checkpoint_freq=args.checkpoint_interval,
            logger=logger)
Example #4
#!/usr/bin/env python3
import os
import numpy as np
import ruamel.yaml

from rslgym.wrapper import VecEnvPython  # import python wrapper interface
from rslgym_wrapper_anymal import anymal_example_env  # import your environment

task_path = os.path.dirname(os.path.realpath(__file__))
rsc_path = task_path + "/../rsc"
cfg_abs_path = task_path + "/../cfg_ppo.yaml"
cfg = ruamel.yaml.YAML().load(open(cfg_abs_path, 'r'))

dumped_cfg = ruamel.yaml.dump(cfg['environment'], Dumper=ruamel.yaml.RoundTripDumper)
env = VecEnvPython(anymal_example_env(rsc_path, dumped_cfg))

print('action_space ', env.action_space)
print('obs_space ', env.observation_space)
print('num_envs ', env.num_envs)

render = cfg['environment']['render']
if render:
    env.show_window()

obs = env.reset()
info = env.get_info()
# step loop: drive every parallel environment with small random actions
for step in range(10000):
    # action = np.zeros((env.num_envs, env.action_space.shape[0])).astype(np.float32)
    action = np.random.randn(env.num_envs, env.action_space.shape[0]).astype(np.float32) * 0.1
    obs, reward, dones, info = env.step(action, visualize=render)
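
# Sketch (assumption): if the wrapper does not reset finished episodes on its own,
# a manual reset could be added inside the loop, e.g.:
#     if any(dones):
#         obs = env.reset()

if render:
    env.hide_window()  # close the visualization window, as the training examples above do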
Example #5
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--seed",
                        type=int,
                        default=0,
                        help="Random seed [0, 2 ** 32)")
    parser.add_argument('-w',
                        '--weight_dir',
                        type=str,
                        default='',
                        help='path to trained weights')
    parser.add_argument('-i',
                        '--iteration',
                        type=int,
                        default=0,
                        help='algo iteration')
    parser.add_argument('-s',
                        '--seconds',
                        type=int,
                        default=10,
                        help='testing duration')
    args = parser.parse_args()
    weight_dir = args.weight_dir
    iteration = args.iteration

    task_path = os.path.dirname(os.path.realpath(__file__))
    rsc_path = task_path + "/../rsc"
    env_path = task_path + "/.."
    save_path = os.path.join(
        weight_dir, 'testing_' + str(iteration),
        datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
    os.makedirs(save_path)

    for file in os.listdir(weight_dir):
        if file.startswith('cfg_ppo'):
            cfg_abs_path = weight_dir + '/' + file

    # config
    cfg = YAML().load(open(cfg_abs_path, 'r'))
    cfg['environment']['num_envs'] = 1

    impl = anymal_example_env(rsc_path,
                              dump(cfg['environment'], Dumper=RoundTripDumper))
    env = VecEnvPython(impl)

    obs_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]
    actor_net = nn.Sequential(
        rslgym_module.EmpiricalNormalization([obs_size]),
        rslgym_module.MLP([256, 128],
                          nn.Tanh,
                          obs_size,
                          action_size,
                          init_scale=1.4))
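    # Note: the EmpiricalNormalization statistics are assumed to be part of
    # actor_state_dict, so they are restored below together with the MLP weights.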

    actor = rslgym_module.Actor(
        actor_net,
        rslgym_module.MultivariateGaussianDiagonalCovariance(
            env.action_space.shape[0], 1.0), env.observation_space.shape[0],
        env.action_space.shape[0], 'cpu')

    snapshot = torch.load(weight_dir + '/snapshot' + str(iteration) + '.pt')
    actor.load_state_dict(snapshot['actor_state_dict'])

    if cfg['environment']['render']:
        env.wrapper.showWindow()

    if cfg['environment']['record_video']:
        env.start_recording_video(save_path + '/test.mp4')

    test_steps = int(args.seconds / cfg['environment']['control_dt'])

    torch.manual_seed(args.seed)

    act = np.zeros(shape=(1, env.wrapper.getActionDim()), dtype=np.float32)  # throwaway zero action for the warm-up step
    _, _, _, new_info = env.step(act, visualize=cfg['environment']['render'])

    ob = env.reset()
    try:
        for i in range(test_steps):
            if i % 100 == 0:
                ob = env.reset()
            act = actor.noiseless_action(ob).cpu().detach().numpy()
            ob, rew, done, info = env.step(
                act, visualize=cfg['environment']['render'])

    except KeyboardInterrupt:
        pass

    finally:
        if cfg['environment']['record_video']:
            env.stop_recording_video()