Example #1
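This example loads a trained cart-pole policy from a weight directory, replays it for a fixed number of seconds, and plots the applied actions and observations. The snippet omits its import header; a plausible one is sketched below, with assumptions noted: the standard imports follow from the names used in the code, while rslgym_module, VecEnvPython and cart_pole_example_env are project-specific and their exact import paths depend on the local rslgym example build.

import argparse
import datetime
import os

import numpy as np
import torch
import torch.nn as nn
from ruamel.yaml import YAML, dump, RoundTripDumper  # legacy ruamel.yaml API used below

# plus the project-specific imports (paths depend on the local rslgym setup):
#   rslgym_module, VecEnvPython, cart_pole_example_env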
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 32)")
    parser.add_argument('-w', '--weight_dir', type=str, default='', help='path to trained')
    parser.add_argument('-i', '--iteration', type=int, default=0, help='algo iteration')
    parser.add_argument('-s', '--seconds', type=int, default=10, help='testing duration')
    args = parser.parse_args()
    weight_dir = args.weight_dir
    iteration = args.iteration

    task_path = os.path.dirname(os.path.realpath(__file__))
    rsc_path = task_path + "/../rsc"
    env_path = task_path + "/.."
    save_path = os.path.join(weight_dir, 'testing_' + str(iteration), datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
    os.makedirs(save_path)

    for file in os.listdir(weight_dir):
        if file.startswith('cfg'):
            cfg_abs_path = weight_dir + '/' + file

    # config
    cfg = YAML().load(open(cfg_abs_path, 'r'))
    cfg['environment']['num_envs'] = 1

    impl = cart_pole_example_env(rsc_path, dump(cfg['environment'], Dumper=RoundTripDumper))
    env = VecEnvPython(impl)

    actor_net = rslgym_module.MLP([32, 32],
                               nn.Tanh,
                               env.observation_space.shape[0],
                               env.action_space.shape[0])

    actor = rslgym_module.Actor(actor_net,
                             rslgym_module.MultivariateGaussianDiagonalCovariance(env.action_space.shape[0], 1.0),
                             env.observation_space.shape[0], env.action_space.shape[0],
                             'cpu')

    snapshot = torch.load(weight_dir + '/snapshot' + str(iteration) + '.pt')
    actor.load_state_dict(snapshot['actor_state_dict'])

    if cfg['environment']['render']:
        env.wrapper.showWindow()

    if cfg['environment']['record_video']:
        env.start_recording_video(save_path + '/test.mp4')

    test_steps = int(args.seconds/cfg['environment']['control_dt'])

    torch.manual_seed(args.seed)

    # dummy step with a zero-initialized action so the wrapper is set up before the first reset
    act = np.zeros(shape=(1, env.wrapper.getActionDim()), dtype=np.float32)
    _, _, _, new_info = env.step(act, visualize=cfg['environment']['render'])

    # containers for analysis
    actions = np.zeros(shape=(2, test_steps), dtype=np.float32)
    obs = np.zeros(shape=(4, test_steps), dtype=np.float32)

    ob = env.reset()
    try:
        for i in range(test_steps):
            if i % 100 == 0:
                # periodic reset; keep the returned observation in sync
                ob = env.reset()
            act = actor.noiseless_action(ob).cpu().detach().numpy()
            ob, rew, done, info = env.step(act, visualize=cfg['environment']['render'])
            obs[:, i] = ob.flatten()
            actions[0, i] = info['action']
            actions[1, i] = act

    except KeyboardInterrupt:
        pass

    finally:
        if cfg['environment']['record_video']:
            env.stop_recording_video()

        if cfg['environment']['render']:
            env.wrapper.hideWindow()

        import matplotlib
        matplotlib.use('TKAgg')
        import matplotlib.pyplot as plt

        plt.figure()
        plt.plot(actions[0, :], label='applied action')
        plt.plot(actions[1, :], label='nn action')
        plt.grid()
        plt.legend()

        plt.figure()
        plt.plot(obs[0, :], label='cart pos')
        plt.plot(obs[2, :], label='cart vel')
        plt.grid()
        plt.legend()

        plt.figure()
        plt.plot(obs[1, :], label='pend pos')
        plt.plot(obs[3, :], label='pend vel')
        plt.grid()
        plt.legend()

        plt.show(block=False)
        input('press [ENTER] to exit')
Example #2
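This example trains a cart-pole policy with the framework's PPO trainer, periodically recording evaluation videos and logging scalars through the trainer's writer. A plausible import header follows; the project-specific names are only listed in a comment, since their import paths are setup-dependent.

import argparse
import math
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
from ruamel.yaml import YAML, dump, RoundTripDumper
from scipy.signal import savgol_filter

# plus the project-specific imports (paths depend on the local rslgym setup):
#   rslgym_module, PPO, ConfigurationSaver, VecEnvPython, cart_pole_example_env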
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--cfg_name',
                        type=str,
                        default='/cfg.yaml',
                        help='configuration file')
    parser.add_argument('--gpu',
                        type=int,
                        default=0,
                        help='gpu id (-1 for cpu)')
    args = parser.parse_args()
    cfg_name = args.cfg_name

    device = args.gpu if args.gpu >= 0 else 'cpu'

    task_path = os.path.dirname(os.path.realpath(__file__))
    rsc_path = task_path + "/../rsc"
    env_path = task_path + "/.."
    cfg_abs_path = task_path + "/../" + cfg_name

    log_dir = os.path.join(task_path, 'runs/rsl_ppo')

    save_items = [env_path + '/Environment.hpp', cfg_abs_path]
    cfg_saver = ConfigurationSaver(log_dir, save_items, args)

    # config
    cfg = YAML().load(open(cfg_abs_path, 'r'))
    impl = cart_pole_example_env(
        rsc_path, dump(cfg['environment'], Dumper=RoundTripDumper))
    env = VecEnvPython(impl)
    n_steps = math.floor(cfg['environment']['max_time'] /
                         cfg['environment']['control_dt'])

    total_steps_per_episode = n_steps * cfg['environment']['num_envs']

    torch.manual_seed(cfg['environment']['seed'])

    obs_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]

    actor_net = rslgym_module.MLP([32, 32], nn.Tanh, obs_size, action_size,
                                  0.5)
    critic_net = rslgym_module.MLP([32, 32], nn.Tanh, obs_size, 1, 0.5)
    actor_dist = rslgym_module.MultivariateGaussianDiagonalCovariance(
        action_size, 1.0)

    actor = rslgym_module.Actor(actor_net, actor_dist, obs_size, action_size,
                                device)
    critic = rslgym_module.Critic(critic_net, obs_size, device)

    ppo_training = PPO(actor=actor,
                       critic=critic,
                       num_envs=cfg['environment']['num_envs'],
                       num_transitions_per_env=n_steps,
                       num_learning_epochs=cfg['algorithm']['num_epochs'],
                       gamma=cfg['algorithm']['discount_factor'],
                       lam=cfg['algorithm']['gae_lam'],
                       entropy_coef=cfg['algorithm']['ent_coef'],
                       num_mini_batches=cfg['algorithm']['num_mini_batches'],
                       device=device,
                       log_dir=cfg_saver.data_dir,
                       mini_batch_sampling="in_order",
                       learning_rate=cfg['algorithm']['learning_rate'])

    avg_rewards = []
    fig, ax = plt.subplots()
    for update in range(cfg['algorithm']['total_algo_updates']):

        start = time.time()
        obs = env.reset()
        reward_ll_sum = 0
        done_sum = 0
        # track the number of consecutive steps since the latest "done" for each env;
        # an env can terminate several times per rollout, so the count is reset whenever a done is received
        ep_len = np.zeros(shape=env.num_envs)

        if update % 20 == 0:
            env.show_window()
            env.start_recording_video(cfg_saver.data_dir + "/" + str(update) +
                                      ".mp4")
            for step in range(n_steps):
                action_ll, _ = actor.sample(
                    torch.from_numpy(obs).to(ppo_training.device))
                obs, reward_ll, dones, _ = env.step(
                    action_ll.cpu().detach().numpy(), True)

            ppo_training.save_training(cfg_saver.data_dir, update, update)
            obs = env.reset()
            env.stop_recording_video()
            env.hide_window()

        for step in range(n_steps):
            actor_obs = obs
            critic_obs = obs
            action = ppo_training.observe(actor_obs)
            obs, reward, dones, _ = env.step(action, False)
            ep_len[~dones] += 1
            ep_len[dones] = 0
            ppo_training.step(value_obs=critic_obs,
                              rews=reward,
                              dones=dones,
                              infos=[])
            done_sum = done_sum + sum(dones)
            reward_ll_sum = reward_ll_sum + sum(reward)

        ppo_training.update(actor_obs=obs,
                            value_obs=obs,
                            log_this_iteration=update % 10 == 0,
                            update=update)
        end = time.time()

        average_ll_performance = reward_ll_sum / total_steps_per_episode
        average_dones = done_sum / total_steps_per_episode
        avg_rewards.append(average_ll_performance)
        avg_ep_leng = ep_len.mean()

        ppo_training.writer.add_scalar('Policy/average_reward',
                                       average_ll_performance, update)
        ppo_training.writer.add_scalar('Policy/average_dones', average_dones,
                                       update)
        ppo_training.writer.add_scalar('Training/elapsed_time_episode',
                                       end - start, update)
        ppo_training.writer.add_scalar('Training/fps',
                                       total_steps_per_episode / (end - start),
                                       update)
        ppo_training.writer.add_scalar('Policy/avg_ep_len', avg_ep_leng,
                                       update)

        print('----------------------------------------------------')
        print('{:>6}th iteration'.format(update))
        print('{:<40} {:>6}'.format("average ll reward: ",
                                    '{:0.10f}'.format(average_ll_performance)))
        print('{:<40} {:>6}'.format("dones: ",
                                    '{:0.6f}'.format(average_dones)))
        print('{:<40} {:>6}'.format("avg_ep_len: ",
                                    '{:0.6f}'.format(avg_ep_leng)))
        print('{:<40} {:>6}'.format("time elapsed in this iteration: ",
                                    '{:6.4f}'.format(end - start)))
        print('{:<40} {:>6}'.format(
            "fps: ",
            '{:6.0f}'.format(total_steps_per_episode / (end - start))))
        print('----------------------------------------------------\n')

        if update > 100 and len(avg_rewards) > 100:
            ax.plot(range(len(avg_rewards)), savgol_filter(avg_rewards, 51, 3))
        else:
            ax.plot(range(len(avg_rewards)), avg_rewards)
        fig.savefig(cfg_saver.data_dir + '/demo.png', bbox_inches='tight')

        ax.clear()
Example #3
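This example evaluates a trained actor on a standard Gym environment, optionally recording video with gym's Monitor wrapper, and plots per-step actions, observations and rewards. A plausible import header under the same caveats as above:

import argparse
import datetime
import os

import gym
from gym import wrappers
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
from ruamel.yaml import YAML

# plus the project-specific import of rslgym_module (path depends on the local setup)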
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--seed",
                        type=int,
                        default=0,
                        help="Random seed [0, 2 ** 32)")
    parser.add_argument('-w',
                        '--weight_dir',
                        type=str,
                        default='',
                        help='path to trained')
    parser.add_argument('-i',
                        '--iteration',
                        type=int,
                        default=0,
                        help='algo iteration')
    parser.add_argument('-s',
                        '--seconds',
                        type=int,
                        default=10,
                        help='testing duration')

    args = parser.parse_args()
    weight_dir = args.weight_dir
    iteration = args.iteration

    save_path = os.path.join(
        weight_dir, 'testing_' + str(iteration),
        datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
    os.makedirs(save_path)

    for file in os.listdir(weight_dir):
        if file.startswith('cfg'):
            cfg_abs_path = weight_dir + '/' + file

    # config
    cfg = YAML().load(open(cfg_abs_path, 'r'))

    # single env for testing
    test_env = gym.make(cfg['environment']['env_name'])
    test_env.seed(cfg['environment']['seed'])

    # https://github.com/openai/gym/issues/1925
    if cfg['environment']['record_video']:
        test_env = wrappers.Monitor(test_env,
                                    save_path,
                                    force=True,
                                    video_callable=lambda episode: True)

    obs_space = test_env.observation_space
    action_space = test_env.action_space

    actor_architecture = [64, 64]

    # observation normalization is handled by the EmpiricalNormalization layer
    # inside actor_net, so no separate RunningMeanStd normalizer is used here

    torch.manual_seed(cfg['environment']['seed'])

    actor_net = nn.Sequential(
        rslgym_module.EmpiricalNormalization([obs_space.low.size]),
        rslgym_module.MLP(actor_architecture, nn.LeakyReLU, obs_space.low.size,
                          action_space.low.size))

    actor = rslgym_module.Actor(
        actor_net,
        rslgym_module.MultivariateGaussianDiagonalCovariance(
            action_space.low.size, 1.0), obs_space.low.size,
        action_space.low.size, 'cpu')

    # load actor weights

    snapshot = torch.load(weight_dir + '/snapshot' + str(iteration) + '.pt')
    actor.load_state_dict(snapshot['actor_state_dict'])

    test_steps = test_env.spec.max_episode_steps

    torch.manual_seed(args.seed)

    # containers for analysis
    actions = np.zeros(shape=(action_space.low.size, test_steps),
                       dtype=np.float32)
    obs = np.zeros(shape=(obs_space.low.size, test_steps), dtype=np.float32)
    rews = np.zeros(shape=(1, test_steps), dtype=np.float32)

    ob = test_env.reset()
    ob = np.array(ob).reshape(1, -1).astype(np.float32)

    try:
        for i in range(test_steps):
            act = actor.noiseless_action(ob).cpu().detach().numpy()
            ob, r, done, info = test_env.step(act[0])
            ob = np.array(ob).reshape(1, -1).astype(np.float32)
            if cfg['environment']['render']:
                test_env.render()

            obs[:, i] = ob.flatten()
            actions[:, i] = act.flatten()
            rews[:, i] = r

            if done:
                break

    except KeyboardInterrupt:
        pass

    finally:
        if cfg['environment']['record_video']:
            # close video recording wrapper
            test_env.close()

        plt.figure()
        for i in range(action_space.low.size):
            plt.plot(actions[i, :], label='ac_' + str(i))
        plt.grid()
        plt.legend()

        plt.figure()
        for i in range(obs_space.low.size):
            plt.plot(obs[i, :], label='ob_' + str(i))
        plt.grid()
        plt.legend()

        plt.figure()
        plt.plot(rews[0, :], label='reward')
        plt.grid()
        plt.legend()

        plt.show(block=False)
        input('press [ENTER] to exit')
Example #4
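This example trains an ANYmal locomotion policy with the framework's TRPO agent. A plausible import header, with the project-specific names again hedged as a comment:

import argparse
import math
import os
import time

import numpy as np
import torch
import torch.nn as nn
from ruamel.yaml import YAML, dump, RoundTripDumper

# plus the project-specific imports (paths depend on the local rslgym setup):
#   rslgym_module, TRPO, ConfigurationSaver, VecEnvPython, anymal_example_env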
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--seed", type=int, default=0, help="Random seed [0, 2 ** 32)")
    parser.add_argument('--cfg_name', type=str, default='/cfg_trpo.yaml', help='configuration file')
    parser.add_argument('--gpu', type=int, default=0, help='gpu id (-1 for cpu)')
    args = parser.parse_args()
    cfg_name = args.cfg_name

    device = args.gpu if args.gpu >= 0 else 'cpu'

    task_path = os.path.dirname(os.path.realpath(__file__))
    rsc_path = task_path + "/../rsc"
    env_path = task_path + "/.."
    cfg_abs_path = task_path + "/.." + cfg_name
    log_dir = os.path.join(task_path, 'runs/rsl_trpo')

    save_items = [env_path+'/Environment.hpp',
                  cfg_abs_path,
                  os.path.realpath(__file__)]

    cfg_saver = ConfigurationSaver(log_dir, save_items, args)

    # config
    cfg = YAML().load(open(cfg_abs_path, 'r'))
    impl = anymal_example_env(rsc_path, dump(cfg['environment'], Dumper=RoundTripDumper))
    env = VecEnvPython(impl)
    n_steps = math.floor(cfg['environment']['max_time'] / cfg['environment']['control_dt'])

    total_steps_per_episode = n_steps * cfg['environment']['num_envs']

    torch.manual_seed(args.seed)

    actor_net = rslgym_module.MLP([256, 128],
                                  nn.Tanh,
                                  env.observation_space.shape[0],
                                  env.action_space.shape[0],
                                  init_scale=1.4)

    critic_net = rslgym_module.MLP([256, 128],
                                   nn.Tanh,
                                   env.observation_space.shape[0],
                                   1,
                                   init_scale=1.4)

    actor = rslgym_module.Actor(actor_net,
                  rslgym_module.MultivariateGaussianDiagonalCovariance(env.action_space.shape[0], 1.0),
                  env.observation_space.shape[0], env.action_space.shape[0],
                  device)

    critic = rslgym_module.Critic(critic_net,
                    env.observation_space.shape[0],
                    device)

    agent = TRPO(
        actor=actor,
        critic=critic,
        num_envs=cfg['environment']['num_envs'],
        num_transitions_per_env=n_steps,
        critic_learning_epochs=cfg['algorithm']['critic_learning']['epochs'],
        critic_learning_rate=cfg['algorithm']['critic_learning']['learning_rate'],
        critic_mini_batches=cfg['algorithm']['critic_learning']['num_mini_batches'],
        max_d_kl=cfg['algorithm']['max_kld'],
        gamma=cfg['algorithm']['discount_factor'],
        lam=cfg['algorithm']['gae_lam'],
        entropy_coef=cfg['algorithm']['entropy_coef'],
        device=device,
        log_dir=cfg_saver.data_dir,
        mini_batch_sampling="in_order"
    )

    avg_rewards = []
    for update in range(cfg['algorithm']['total_algorithm_updates']):

        start = time.time()
        obs = env.reset()

        reward_ll_sum = 0
        ep_len = np.zeros(shape=env.num_envs)
        ep_len_collected = []

        if update % cfg['environment']['eval_every_n'] == 0:
            env.show_window()
            if cfg['environment']['record_video']:
                env.start_recording_video(cfg_saver.data_dir + "/" + str(update) + ".mp4")
            for step in range(n_steps):
                action_ll, _ = actor.sample(torch.from_numpy(obs).to(agent.device))
                obs, reward_ll, dones, info = env.step(action_ll.cpu().detach().numpy(), True)

            agent.save_training(cfg_saver.data_dir, update, update)
            obs = env.reset()
            if cfg['environment']['record_video']:
                env.stop_recording_video()
            env.hide_window()

        for step in range(n_steps):
            actor_obs = obs
            critic_obs = obs
            action = agent.observe(actor_obs)
            obs, reward, dones, info = env.step(action, False)
            agent.step(value_obs=critic_obs, rews=reward, dones=dones, infos=[])
            reward_ll_sum = reward_ll_sum + sum(reward)

            ep_len += 1
            if any(dones):
                ep_len_collected += list(ep_len[dones])
                ep_len[dones] = 0
            if step == n_steps - 1:
                for length in list(ep_len):
                    if length == n_steps:
                        ep_len_collected.append(length)

        agent.update(actor_obs=obs,
                     value_obs=obs,
                     log_this_iteration=update % 10 == 0,
                     update=update)
        end = time.time()
        # keep the policy std from collapsing (one entry per actuated joint, 12 for ANYmal)
        actor.distribution.enforce_minimum_std((torch.ones(env.action_space.shape[0]) * 0.2).to(device))

        average_ll_performance = reward_ll_sum / total_steps_per_episode
        avg_rewards.append(average_ll_performance)
        if len(ep_len_collected) > 0:
            # approximate: episodes still running at the end of the rollout are
            # only counted if they span the full n_steps
            avg_ep_leng = sum(ep_len_collected) / len(ep_len_collected)
        else:
            avg_ep_leng = n_steps

        agent.writer.add_scalar('Policy/average_reward', average_ll_performance, update)
        agent.writer.add_scalar('Training/elapsed_time_episode', end - start, update)
        agent.writer.add_scalar('Training/fps', total_steps_per_episode / (end - start), update)
        agent.writer.add_scalar('Policy/avg_ep_len', avg_ep_leng, update)

        print('----------------------------------------------------')
        print('{:>6}th iteration'.format(update))
        print('{:<40} {:>6}'.format("average ll reward: ", '{:0.10f}'.format(average_ll_performance)))
        print('{:<40} {:>6}'.format("avg_ep_len: ", '{:0.6f}'.format(avg_ep_leng)))
        print('{:<40} {:>6}'.format("time elapsed in this iteration: ", '{:6.4f}'.format(end - start)))
        print('{:<40} {:>6}'.format("fps: ", '{:6.0f}'.format(total_steps_per_episode / (end - start))))
        print('{:<40} {:>6}'.format("std: ", '{}'.format(actor.distribution.log_std.exp())))
        print('----------------------------------------------------\n')
Example #5
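This example trains a PPO agent on a Gym environment, collecting rollouts with a multiprocess vector environment and evaluating on a single environment. A plausible import header; MultiprocessVectorEnv is listed with the project-specific names because its origin (the framework itself or an adapted pfrl-style wrapper) is an assumption:

import argparse
import functools
import os
import time

import gym
from gym import wrappers
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
from ruamel.yaml import YAML
from scipy.signal import savgol_filter

# plus the project-specific imports (paths depend on the local setup):
#   rslgym_module, PPO, ConfigurationSaver, MultiprocessVectorEnv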
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--cfg_name', type=str, default='/cfg.yaml', help='configuration file')
    parser.add_argument('--gpu', type=int, default=0, help='gpu id (-1 for cpu)')
    args = parser.parse_args()
    cfg_name = args.cfg_name

    device = args.gpu if args.gpu >= 0 else 'cpu'

    task_path = os.path.dirname(os.path.realpath(__file__))
    cfg_abs_path = task_path + "/../" + cfg_name
    log_dir = os.path.join(task_path, 'runs/rsl_ppo')

    save_items = [cfg_abs_path]
    cfg_saver = ConfigurationSaver(log_dir, save_items, args)

    # config
    cfg = YAML().load(open(cfg_abs_path, 'r'))

    num_envs = cfg['environment']['num_envs']
    process_seeds = np.arange(num_envs) + cfg['environment']['seed'] * num_envs
    assert process_seeds.max() < 2 ** 32

    def make_env(process_idx, test):
        env = gym.make(cfg['environment']['env_name'])
        process_seed = int(process_seeds[process_idx])
        env.seed(process_seed)
        return env

    def make_batch_env(test, n_envs):
        return MultiprocessVectorEnv(
            [
                functools.partial(make_env, idx, test)
                for idx in range(n_envs)
            ]
        )

    # batch env for training
    env = make_batch_env(False, num_envs)

    # single env for testing
    test_env = gym.make(cfg['environment']['env_name'])
    test_env.seed(cfg['environment']['seed'])

    # https://github.com/openai/gym/issues/1925#issuecomment-753465510
    if cfg['environment']['record_video']:
        test_env = wrappers.Monitor(test_env, cfg_saver.data_dir, force=True, video_callable=lambda episode: True)

    max_episode_steps = test_env.spec.max_episode_steps
    obs_space = test_env.observation_space
    action_space = test_env.action_space
    total_steps = max_episode_steps * num_envs

    actor_architecture = [64, 64]
    value_net_architecture = [64, 64]

    torch.manual_seed(cfg['environment']['seed'])

    actor_net = nn.Sequential(
                    rslgym_module.EmpiricalNormalization([obs_space.low.size]),
                    rslgym_module.MLP(actor_architecture,
                        nn.LeakyReLU,
                        obs_space.low.size,
                        action_space.low.size)
                    )
    critic_net = nn.Sequential(
                    rslgym_module.EmpiricalNormalization([obs_space.low.size]),
                    rslgym_module.MLP(value_net_architecture,
                        nn.LeakyReLU,
                        obs_space.low.size,
                        1)
                    )

    actor = rslgym_module.Actor(actor_net,
                  rslgym_module.MultivariateGaussianDiagonalCovariance(action_space.low.size, 1.0),
                  obs_space.low.size,
                  action_space.low.size,
                  device)

    critic = rslgym_module.Critic(critic_net, obs_space.low.size, device)

    agent = PPO(actor=actor,
                critic=critic,
                num_envs=num_envs,
                num_transitions_per_env=max_episode_steps,
                num_learning_epochs=cfg['algorithm']['num_epochs'],
                learning_rate=cfg['algorithm']['learning_rate'],
                gamma=cfg['algorithm']['discount_factor'],
                lam=cfg['algorithm']['gae_lam'],
                entropy_coef=cfg['algorithm']['ent_coef'],
                num_mini_batches=cfg['algorithm']['num_mini_batches'],
                device=device,
                log_dir=cfg_saver.data_dir,
                mini_batch_sampling='in_order',
                )

    def obs_to_numpy(obs):
        o = np.array(obs).reshape(len(obs), -1).astype(np.float32)
        return o

    avg_rewards = []
    fig, ax = plt.subplots()
    env.reset()
    obs = env.get_observation()
    obs = obs_to_numpy(obs)
    episode_len = np.zeros(num_envs, dtype="i")

    for update in range(cfg['algorithm']['total_algo_updates']):
        ax.set(xlabel='iteration', ylabel='avg performance', title='average performance')
        ax.grid()
        start = time.time()
        reward_ll_sum = 0
        done_sum = 0
        average_dones = 0.

        # evaluate
        if update % 50 == 0:
            obs_sample = test_env.reset()
            obs_sample = np.array(obs_sample).reshape(1, -1).astype(np.float32)
            for step in range(max_episode_steps):
                action = agent.observe(obs_sample)
                obs_sample, r, dones, _ = test_env.step(action[0])
                obs_sample = np.array(obs_sample).reshape(1, -1).astype(np.float32)
                if cfg['environment']['render']:
                    test_env.render()
                # reset and stop once the evaluation episode terminates
                if dones:
                    obs_sample = test_env.reset()
                    obs_sample = np.array(obs_sample).reshape(1, -1).astype(np.float32)
                    break

            agent.save_training(cfg_saver.data_dir, update, update)

        for step in range(cfg['environment']['steps_per_env_and_episode']):
            episode_len += 1
            actor_obs = obs
            critic_obs = obs
            action = agent.observe(actor_obs)
            reward, dones, infos = env.step(action)
            obs = env.get_observation()
            obs = obs_to_numpy(obs)
            reward = np.array(reward)
            dones = np.array(dones)
            resets = episode_len == max_episode_steps
            end = np.logical_or(resets, dones)
            not_end = np.logical_not(end)
            episode_len[end] = 0

            agent.step(value_obs=critic_obs, rews=reward, dones=dones, infos=[])
            done_sum = done_sum + sum(dones)
            reward_ll_sum = reward_ll_sum + sum(reward)
            env.reset(not_end)
            obs = env.get_observation()
            obs = obs_to_numpy(obs)
        mid = time.time()
        agent.update(actor_obs=obs,
                     value_obs=obs,
                     log_this_iteration=update % 10 == 0,
                     update=update)

        end = time.time()
        average_ll_performance = reward_ll_sum / total_steps
        average_dones = done_sum / total_steps
        avg_rewards.append(average_ll_performance)

        actor.distribution.enforce_minimum_std((torch.ones(action_space.low.size)*0.2).to(device))

        if update > 100 and len(avg_rewards) > 100:
            ax.plot(range(len(avg_rewards)), savgol_filter(avg_rewards, 51, 3))
        else:
            ax.plot(range(len(avg_rewards)), avg_rewards)
        fig.savefig(cfg_saver.data_dir + '/demo.png', bbox_inches='tight')

        ax.clear()

        print('----------------------------------------------------')
        print('{:>6}th iteration'.format(update))
        print('{:<40} {:>6}'.format("average ll reward: ", '{:0.10f}'.format(average_ll_performance)))
        print('{:<40} {:>6}'.format("dones: ", '{:0.6f}'.format(average_dones)))
        print('{:<40} {:>6}'.format("time elapsed in this iteration: ", '{:6.4f}'.format(end - start)))
        print('{:<40} {:>6}'.format("fps: ", '{:6.0f}'.format(total_steps / (end - start))))
        print('std: ')
        print(np.exp(actor.distribution.log_std.cpu().detach().numpy()))
        print('----------------------------------------------------\n')
Example #6
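This example replays a trained ANYmal policy (an EmpiricalNormalization layer in front of the MLP) and optionally records a video. A plausible import header under the same caveats as the earlier examples:

import argparse
import datetime
import os

import numpy as np
import torch
import torch.nn as nn
from ruamel.yaml import YAML, dump, RoundTripDumper

# plus the project-specific imports (paths depend on the local rslgym setup):
#   rslgym_module, VecEnvPython, anymal_example_env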
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--seed",
                        type=int,
                        default=0,
                        help="Random seed [0, 2 ** 32)")
    parser.add_argument('-w',
                        '--weight_dir',
                        type=str,
                        default='',
                        help='path to trained')
    parser.add_argument('-i',
                        '--iteration',
                        type=int,
                        default=0,
                        help='algo iteration')
    parser.add_argument('-s',
                        '--seconds',
                        type=int,
                        default=10,
                        help='testing duration')
    args = parser.parse_args()
    weight_dir = args.weight_dir
    iteration = args.iteration

    task_path = os.path.dirname(os.path.realpath(__file__))
    rsc_path = task_path + "/../rsc"
    env_path = task_path + "/.."
    save_path = os.path.join(
        weight_dir, 'testing_' + str(iteration),
        datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
    os.makedirs(save_path)

    for file in os.listdir(weight_dir):
        if file.startswith('cfg_ppo'):
            cfg_abs_path = weight_dir + '/' + file

    # config
    cfg = YAML().load(open(cfg_abs_path, 'r'))
    cfg['environment']['num_envs'] = 1

    impl = anymal_example_env(rsc_path,
                              dump(cfg['environment'], Dumper=RoundTripDumper))
    env = VecEnvPython(impl)

    obs_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]
    actor_net = nn.Sequential(
        rslgym_module.EmpiricalNormalization([obs_size]),
        rslgym_module.MLP([256, 128],
                          nn.Tanh,
                          obs_size,
                          action_size,
                          init_scale=1.4))

    actor = rslgym_module.Actor(
        actor_net,
        rslgym_module.MultivariateGaussianDiagonalCovariance(
            env.action_space.shape[0], 1.0), env.observation_space.shape[0],
        env.action_space.shape[0], 'cpu')

    snapshot = torch.load(weight_dir + '/snapshot' + str(iteration) + '.pt')
    actor.load_state_dict(snapshot['actor_state_dict'])

    if cfg['environment']['render']:
        env.wrapper.showWindow()

    if cfg['environment']['record_video']:
        env.start_recording_video(save_path + '/test.mp4')

    test_steps = int(args.seconds / cfg['environment']['control_dt'])

    torch.manual_seed(args.seed)

    # dummy step with a zero-initialized action so the wrapper is set up before the first reset
    act = np.zeros(shape=(1, env.wrapper.getActionDim()), dtype=np.float32)
    _, _, _, new_info = env.step(act, visualize=cfg['environment']['render'])

    ob = env.reset()
    try:
        for i in range(test_steps):
            if i % 100 == 0:
                # periodic reset; keep the returned observation in sync
                ob = env.reset()
            act = actor.noiseless_action(ob).cpu().detach().numpy()
            ob, rew, done, info = env.step(
                act, visualize=cfg['environment']['render'])

    except KeyboardInterrupt:
        pass

    finally:
        if cfg['environment']['record_video']:
            env.stop_recording_video()