Example #1
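Loads the training configuration and checkpoint for the environment selected by FLAGS.env, restores a SimplePPOPolicy inside a TF1 session, and replays a single episode in a rendered environment, logging the accumulated reward once the episode ends.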
def main(_):
    policy_dir = os.path.join(gym_dir_path, POLICIES[FLAGS.env][0])
    config = utility.load_config(policy_dir)
    policy_layers = config.policy_layers
    value_layers = config.value_layers
    env = config.env(render=True)
    network = config.network

    with tf.Session() as sess:
        agent = simple_ppo_agent.SimplePPOPolicy(sess,
                                                 env,
                                                 network,
                                                 policy_layers=policy_layers,
                                                 value_layers=value_layers,
                                                 checkpoint=os.path.join(
                                                     policy_dir,
                                                     POLICIES[FLAGS.env][1]))

        sum_reward = 0
        observation = env.reset()
        while True:
            action = agent.get_action([observation])
            observation, reward, done, _ = env.step(action[0])
            time.sleep(0.002)
            sum_reward += reward
            if done:
                break
        tf.logging.info("reward: %s", sum_reward)
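The snippets in this listing assume module-level imports and a policy lookup table roughly like the sketch below. The module paths are assumptions that vary between projects (for example pybullet's minitaur agents package versus rex_gym.agents), and the POLICIES entries are placeholders rather than real directories or checkpoint names:

import logging  # Examples #2 and later log per-step rewards (stdlib or absl logger)
import os
import time

import tensorflow.compat.v1 as tf  # TF1-style Session API is used throughout

# Assumed helper locations; adjust to the actual package layout
# (e.g. rex_gym.agents.scripts / rex_gym.agents.ppo).
from agents.scripts import utility
from agents.ppo import simple_ppo_agent

# Hypothetical lookup: env name -> (policy log directory, checkpoint file name).
POLICIES = {
    'walking': ('policies/walking', 'model.ckpt-0'),      # placeholder values
    'galloping': ('policies/galloping', 'model.ckpt-0'),  # placeholder values
}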
Example #2
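A class-method variant of the same rollout: the policy is resolved through action_mapper.ENV_ID_TO_POLICY, extra environment arguments are forwarded via **self.args, and the accumulated reward is logged on every step instead of only at the end.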
    def play(self):
        policy_dir = os.path.join(
            self.gym_dir_path, action_mapper.ENV_ID_TO_POLICY[self.env_id][0])
        config = utility.load_config(policy_dir)
        policy_layers = config.policy_layers
        value_layers = config.value_layers
        env = config.env(render=True, **self.args)
        network = config.network
        checkpoint = os.path.join(
            policy_dir, action_mapper.ENV_ID_TO_POLICY[self.env_id][1])
        with tf.Session() as sess:
            agent = simple_ppo_agent.SimplePPOPolicy(
                sess,
                env,
                network,
                policy_layers=policy_layers,
                value_layers=value_layers,
                checkpoint=checkpoint)
            sum_reward = 0
            observation = env.reset()
            while True:
                action = agent.get_action([observation])
                observation, reward, done, _ = env.step(action[0])
                time.sleep(0.002)
                sum_reward += reward
                logging.info(f"Reward={sum_reward}")
                if done:
                    break
Example #3
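Launches (or resumes) PPO training rather than playback: the log directory name is built from the timestamp and configuration flags, a saved configuration is reused if one already exists there, otherwise a fresh one is created and saved, and the score of each training iteration is logged.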
def main(_):
    """Create or load configuration and launch the trainer."""
    utility.set_up_logging()
    if not FLAGS.config:
        raise KeyError('You must specify a configuration.')
    logdir = FLAGS.logdir and os.path.expanduser(
        os.path.join(FLAGS.logdir, '{}-{}'.format(FLAGS.timestamp,
                                                  FLAGS.config)))
    try:
        config = utility.load_config(logdir)
    except IOError:
        config = AttrDict(getattr(configs, FLAGS.config)())
        config = utility.save_config(config, logdir)
    for score in train(config, FLAGS.env_processes):
        tf.logging.info('Score {}.'.format(score))
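The main(_) above assumes command-line flags named config, logdir, timestamp and env_processes (taken from the FLAGS.* references), plus a train() generator and a configs module. A minimal sketch of matching flag definitions and entry point, with placeholder defaults and help strings:

import tensorflow.compat.v1 as tf

flags = tf.app.flags
flags.DEFINE_string('logdir', None, 'Base directory for checkpoints and summaries.')
flags.DEFINE_string('config', None, 'Name of the configuration function to train.')
flags.DEFINE_string('timestamp', '', 'Timestamp prefix for the run directory.')
flags.DEFINE_boolean('env_processes', True, 'Step environments in separate processes.')
FLAGS = flags.FLAGS

if __name__ == '__main__':
    tf.app.run()  # parses the flags and calls main(_)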
Example #4
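Similar to Example #2, but the policy is keyed by both environment id and signal type, the policy directory is temporarily hardcoded to 'rex-gym/' (the gym_dir_path line is commented out), and every action is additionally converted to motor commands with env._transform_action_to_motor_command() and printed.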
    def play(self):
        if self.signal_type:
            self.args['signal_type'] = self.signal_type
        else:
            self.signal_type = flag_mapper.DEFAULT_SIGNAL[self.env_id]
        policy_id = f"{self.env_id}_{self.signal_type}"
        policy_path = flag_mapper.ENV_ID_TO_POLICY[policy_id][0]
        # policy_dir = os.path.join(self.gym_dir_path, policy_path)
        policy_dir = os.path.join('rex-gym/', policy_path)
        print(policy_dir)
        config = utility.load_config(policy_dir)
        policy_layers = config.policy_layers
        value_layers = config.value_layers
        env = config.env(render=True, **self.args)
        network = config.network
        checkpoint = os.path.join(policy_dir, flag_mapper.ENV_ID_TO_POLICY[policy_id][1])
        with tf.Session() as sess:
            agent = simple_ppo_agent.SimplePPOPolicy(
                sess,
                env,
                network,
                policy_layers=policy_layers,
                value_layers=value_layers,
                checkpoint=checkpoint)
            sum_reward = 0
            observation = env.reset()
            while True:
                action = agent.get_action([observation])
                # print('Action', action)
                motor = env._transform_action_to_motor_command(action[0])
                print(' '.join(map(str, motor)))
                observation, reward, done, _ = env.step(action[0])
                # print('Reward', reward)
                # print('Observation', observation)
                time.sleep(0.002)
                sum_reward += reward
                logging.info(f"Reward={sum_reward}")
                if done:
                    break
Example #5
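Replays a pretrained static gait outside the usual playback loop: the policy is resolved through action_mapper.STATIC_ACTIONS_MAP, observations come from an external observer rather than the (non-rendered) simulation, and the resulting actions are forwarded to an actuator interface in an endless loop. The site-packages path is hardcoded and flagged as a TODO.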
    def set_gait(self, action_id, args=None, simulation=False):
        # @TODO Handle parametric policies
        # policy_dir, policy_check = action_mapper.DYNAMIC_ACTIONS_MAP[action_id]
        # with tempfile.TemporaryDirectory() as workpath:
        #     j2_env = Environment(loader=FileSystemLoader(workpath),
        #                          trim_blocks=True)
        #     lib_dir_path = os.path.join(str(site.getsitepackages()[0]), 'core')
        #     args['env_name'] = action_mapper.ACTIONS_TO_ENV_NAMES[action_id]
        #     config = j2_env.get_template(lib_dir_path + '/util/templates/config.j2').render(args)
        #     agent = self._load_agents(config, policy_dir + "/" + policy_check, False)
        # ------------------------------------------------------------------
        if action_id in self.STATIC_AGENTS:
            # this is a very poor hardcoded path
            # @TODO
            gym_dir_path = '/usr/local/lib/python3.6/dist-packages/'
            # --------------------------------------------------------------
            log_dir, checkpoint = action_mapper.STATIC_ACTIONS_MAP[action_id]
            log_dir = os.path.join(gym_dir_path, log_dir)
            config = utility.load_config(log_dir)
            policy_layers = config.policy_layers
            value_layers = config.value_layers
            env = config.env(render=False)
            network = config.network
            with tf.Session() as sess:
                agent = simple_ppo_agent.SimplePPOPolicy(
                    sess,
                    env,
                    network,
                    policy_layers=policy_layers,
                    value_layers=value_layers,
                    checkpoint=os.path.join(log_dir, checkpoint))
                logging.info(f'start gait_id={action_id}.')
                env.reset()
                while True:
                    obs = self._observer.get_observations(simulation)
                    action = agent.get_action([obs])
                    _, _, done, info = env.step(action[0])
                    self._actuator.set(info['action'], 0.003)
Example #6
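A playback variant set up for video capture: it creates a videos/ directory and a gym VideoRecorder, renders every step while printing the achieved frame rate, and stops when the episode ends or after 500 steps.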
    def play(self):
        if self.signal_type:
            self.args['signal_type'] = self.signal_type
        else:
            self.signal_type = flag_mapper.DEFAULT_SIGNAL[self.env_id]
        policy_id = f"{self.env_id}_{self.signal_type}"
        policy_path = flag_mapper.ENV_ID_TO_POLICY[policy_id][0]
        policy_dir = os.path.join(self.gym_dir_path, policy_path)
        config = utility.load_config(policy_dir)
        policy_layers = config.policy_layers
        value_layers = config.value_layers
        env = config.env(render=False, **self.args)
        video_path = self.gym_dir_path + "/videos/"
        if not os.path.exists(video_path):
            os.mkdir(video_path)
        time_str = datetime.now()
        recorder = VideoRecorder(
            env, self.gym_dir_path + "/videos/{}.mp4".format(time_str))
        network = config.network
        checkpoint = os.path.join(policy_dir,
                                  flag_mapper.ENV_ID_TO_POLICY[policy_id][1])
        with tf.Session() as sess:
            agent = simple_ppo_agent.SimplePPOPolicy(
                sess,
                env,
                network,
                policy_layers=policy_layers,
                value_layers=value_layers,
                checkpoint=checkpoint)
            sum_reward = 0
            observation = env.reset()
            cnt = 0
            start = time.time()
            while True:
                action = agent.get_action([observation])
                observation, reward, done, _ = env.step(action[0])
                # if cnt > 100 and cnt % 1 == 0:
                cap_start = time.time()
                # recorder.capture_frame()
                env.render()
                cap_end = time.time()
                # print("cap time: {}".format(cap_end - cap_start))

                # img = env.render()
                # img_path = "images/img_{}.png".format(time_str)
                # img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
                # cv2.imwrite(img_path, img)
                # depth_path = "images/depth_{}.png".format(time_str)
                # cv2.imwrite(depth_path, depth*125)

                # depth = open3d.geometry.Image(depth)
                # pcd = open3d.geometry.PointCloud.create_from_depth_image(depth, intrin)
                # np.savetxt("images/pcd.xyz", pcd.points)
                # exit()
                cnt += 1
                time.sleep(0.002)
                sum_reward += reward
                logging.info(f"Reward={sum_reward}")
                time_passed = time.time() - start
                # print("\r step: {}; reward: {}; rate: {}".format(cnt, sum_reward, sum_reward/cnt))
                print("\r step: {}; time: {}; fps: {}".format(
                    cnt, time_passed, cnt / time_passed))
                if done or cnt > 500:
                    break

            recorder.close()
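Note that recorder.capture_frame() is commented out inside the loop above, so as written the recorder produces no frames before recorder.close() is called. A minimal sketch of the intended recording pattern, assuming VideoRecorder comes from gym.wrappers.monitoring.video_recorder and that env and agent are created as in the example:

from gym.wrappers.monitoring.video_recorder import VideoRecorder

recorder = VideoRecorder(env, "videos/rollout.mp4")  # env created as above
observation = env.reset()
for _ in range(500):
    action = agent.get_action([observation])
    observation, reward, done, _ = env.step(action[0])
    recorder.capture_frame()  # capture each step; without this no frames are written
    if done:
        break
recorder.close()  # finalizes the .mp4 file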