Example #1
import os
import time

from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import SubprocVecEnv, VecNormalize

# Project-specific names (TEST_OR_TRAIN, TIME_STEP, COUNT, NUM_CPUS,
# QuadrupedRobot, MotorControlMode, env_change_input) are assumed to be
# defined or imported elsewhere in this module.


def main():
    test_or_train = TEST_OR_TRAIN
    assert test_or_train in ["train", "test"]

    env_params = {
        'time_step': TIME_STEP,
        'robot_class': QuadrupedRobot,
        'on_rack': False,
        'enable_self_collision': True,
        'motor_control_mode': MotorControlMode.HYBRID_COMPUTED_POS_TROT,
        'train_or_test': test_or_train
    }

    policy_save_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                   'data/policies')
    policy_save_filename = 'ppo_' + str(COUNT) + '_' + time.strftime(
        "%d-%m-%Y_%H-%M-%S")
    policy_save_path = os.path.join(policy_save_dir, policy_save_filename)

    policy_kwargs = {"net_arch": [{"pi": [512, 256], "vf": [512, 256]}]}

    if test_or_train == "train":
        env = make_vec_env(env_change_input,
                           n_envs=NUM_CPUS,
                           seed=0,
                           env_kwargs=env_params,
                           vec_env_cls=SubprocVecEnv)
        env = VecNormalize(env)
        os.makedirs(policy_save_dir, exist_ok=True)
        model = PPO('MlpPolicy', env, policy_kwargs=policy_kwargs, verbose=1)
        model.learn(total_timesteps=100000000)
        model.save(policy_save_path)
    else:
        # env = env_change_input(time_step=env_params['time_step'],
        #                        robot_class=env_params['robot_class'],
        #                        on_rack=env_params['on_rack'],
        #                        enable_self_collision=env_params['enable_self_collision'],
        #                        motor_control_mode=env_params['motor_control_mode'],
        #                        train_or_test=env_params['train_or_test'])
        env = env_change_input(**env_params)
        model_load_path = os.path.join(policy_save_dir,
                                       'ppo_3_17-03-2021_15-39-42')
        model = PPO.load(model_load_path)
        obs = env.reset()
        while True:
            action, _state = model.predict(obs, deterministic=True)
            obs, reward, done, info = env.step(action)
            env.render()
            if done:
                obs = env.reset()
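
Note that the training branch above wraps the vectorized env in VecNormalize but only saves the PPO weights, so the observation/reward statistics are lost for later evaluation. Below is a minimal sketch of persisting and restoring them; the `_vecnormalize.pkl` filename and the reuse of `env_change_input`/`env_params` are assumptions about how this script could be extended, not part of the original code.

# Hypothetical extension of the training branch: persist the VecNormalize
# statistics next to the policy so evaluation can reuse the same scaling.
stats_path = policy_save_path + '_vecnormalize.pkl'
env.save(stats_path)  # VecNormalize.save()

# Later, in a test script (filenames are placeholders):
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
eval_env = DummyVecEnv([lambda: env_change_input(**env_params)])
eval_env = VecNormalize.load(stats_path, eval_env)
eval_env.training = False     # do not update statistics at test time
eval_env.norm_reward = False  # report raw rewards during evaluation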
Example #2
import torch
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize

# Project-specific classes (MultiExtractor, CustomizedPPO, CustomizedPolicy,
# make_env, and the callbacks used below) are assumed to be imported from this
# project's own modules.


class MultiModuleExp:
    """ 
    A whole experiment.
    It should contain: (1) environments, (2) policies, (3) training, (4) testing.
    The results should be able to compare with other experiments.

    The Multi-RNN experiment.
    """
    def __init__(
        self,
        args,
        env_id="HopperBulletEnv-v0",
        features_extractor_class=MultiExtractor,
        features_extractor_kwargs=None,
    ) -> None:
        """ Init with parameters to control the training process. """
        print("Starting MultiModuleExp")
        # Avoid a shared mutable default argument: this dict is modified below.
        if features_extractor_kwargs is None:
            features_extractor_kwargs = {}
        self.args = args
        self.env_id = env_id
        self.use_cuda = torch.cuda.is_available() and args.cuda
        self.device = torch.device("cuda" if self.use_cuda else "cpu")

        # Make Environments
        print("Making train environments...")
        venv = DummyVecEnv([
            make_env(env_id=env_id, rank=i, seed=args.seed, render=args.render)
            for i in range(args.num_envs)
        ])
        self.eval_env = DummyVecEnv(
            [make_env(env_id=env_id, rank=99, seed=args.seed, render=False)])
        if args.vec_normalize:
            venv = VecNormalize(venv)
            self.eval_env = VecNormalize(self.eval_env, norm_reward=False)

        features_extractor_kwargs["num_envs"] = args.num_envs
        policy_kwargs = {
            "features_extractor_class": features_extractor_class,
            "features_extractor_kwargs": features_extractor_kwargs,
            # Note: net_arch must be specified, because sb3 won't set the default network architecture if we change the features_extractor.
            # pi: Actor (policy-function); vf: Critic (value-function)
            "net_arch": [dict(pi=[64, 64], vf=[64, 64])],
        }

        self.model = CustomizedPPO(
            CustomizedPolicy,
            venv,
            n_steps=args.rollout_n_steps,
            tensorboard_log="tb",
            policy_kwargs=policy_kwargs,
            device=self.device,
            verbose=1,
            rnn_move_window_step=args.rnn_move_window_step,
            rnn_sequence_length=args.rnn_sequence_length,
            use_sde=args.sde,
            n_epochs=args.n_epochs)

    def train(self) -> None:
        """ Start training """
        print(f"train using {self.model.device.type}")

        callback = [
            DebugCallback("Customized"),
            AdjustCameraCallback(),
            WandbCallback(self.args),
            CustomizedEvalCallback(
                self.eval_env,
                best_model_save_path=None,
                log_path=None,
                eval_freq=self.args.eval_freq,
                n_eval_episodes=3,
                verbose=0,
            )
        ]
        self.model.learn(self.args.total_timesteps, callback=callback)

    def test(self, model_filename, vnorm_filename):
        # PPO.load is a classmethod that returns a new model, so reassign it.
        self.model = self.model.load(model_filename, device=self.device)
        self.eval_env = VecNormalize.load(vnorm_filename, self.eval_env)
        self.eval_env.render()
        obs = self.eval_env.reset()
        with self.model.policy.features_extractor.start_testing():
            for i in range(1000):
                # predict() returns (action, state); unpack before stepping.
                action, _state = self.model.predict(obs, deterministic=True)
                obs, reward, done, info = self.eval_env.step(action)

        self.eval_env.close()
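
For reference, a minimal driver for this class is sketched below. The argparse field names are taken from the attributes referenced in __init__ and train() above; the concrete values and the checkpoint filenames are placeholders, not part of the original project.

# Hypothetical driver, assuming the args fields used by MultiModuleExp above.
from argparse import Namespace

args = Namespace(
    cuda=True, seed=0, render=False, num_envs=4, vec_normalize=True,
    rollout_n_steps=2048, rnn_move_window_step=1, rnn_sequence_length=16,
    sde=False, n_epochs=10, eval_freq=10000, total_timesteps=1_000_000)

exp = MultiModuleExp(args, env_id="HopperBulletEnv-v0")
exp.train()
# exp.test("model.zip", "vecnormalize.pkl")  # placeholder filenames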