Example #1
    def learn(self,
              total_timesteps: int,
              callback: MaybeCallback = None,
              log_interval: int = 4,
              eval_env: Optional[GymEnv] = None,
              eval_freq: int = -1,
              n_eval_episodes: int = 5,
              tb_log_name: str = "SAC",
              eval_log_path: Optional[str] = None,
              reset_num_timesteps: bool = True) -> OffPolicyRLModel:

        callback = self._setup_learn(eval_env, callback, eval_freq,
                                     n_eval_episodes, eval_log_path, reset_num_timesteps)
        callback.on_training_start(locals(), globals())

        while self.num_timesteps < total_timesteps:
            rollout = self.collect_rollouts(self.env, n_episodes=self.n_episodes_rollout,
                                            n_steps=self.train_freq, action_noise=self.action_noise,
                                            callback=callback,
                                            learning_starts=self.learning_starts,
                                            replay_buffer=self.replay_buffer,
                                            log_interval=log_interval)

            if rollout.continue_training is False:
                break

            self._update_current_progress(self.num_timesteps, total_timesteps)

            if self.num_timesteps > 0 and self.num_timesteps > self.learning_starts:
                gradient_steps = self.gradient_steps if self.gradient_steps > 0 else rollout.episode_timesteps
                self.train(gradient_steps, batch_size=self.batch_size)

        callback.on_training_end()
        return self
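The examples in this listing are all variants of the same Stable-Baselines3-style learn() loop: set up, collect rollouts, optionally train, repeat until total_timesteps is reached. For context, a minimal usage sketch of how such a method is typically invoked, assuming stable_baselines3 is installed and "Pendulum-v1" is available as a Gym environment (both are assumptions, not taken from the examples):

from stable_baselines3 import SAC

# Build an SAC model on a continuous-control task and run its learn() loop,
# which internally alternates collect_rollouts() and train() as shown above.
model = SAC("MlpPolicy", "Pendulum-v1", verbose=1)
model.learn(total_timesteps=10_000, log_interval=4)
model.save("sac_pendulum")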
Example #2
    def learn(
        self,
        total_timesteps: int,
        callback: MaybeCallback = None,
        log_interval: int = 1,
        eval_env: Optional[GymEnv] = None,
        eval_freq: int = -1,
        n_eval_episodes: int = 5,
        tb_log_name: str = "OnPolicyAlgorithm",
        eval_log_path: Optional[str] = None,
        reset_num_timesteps: bool = True,
    ) -> "OnPolicyAlgorithm":
        iteration = 0
        
        total_timesteps, callback = self._setup_learn(
            total_timesteps, eval_env, callback, eval_freq, n_eval_episodes, eval_log_path, reset_num_timesteps, tb_log_name
        )

        callback.on_training_start(locals(), globals())
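        # NOTE: `mode` below is not defined in this snippet; the source project
        # appears to rely on a module-level debug flag.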
        
        # debug ===============================================================
        if mode == 'debug':
            print(['OPA.learn started, ready to loop (OPA.collect_rollouts + OPA.train)'])
            
        
        while self.num_timesteps < total_timesteps:

            continue_training = self.collect_rollouts(self.env, callback, self.rollout_buffer, n_rollout_steps=self.n_steps)

            if continue_training is False:
                break

            iteration += 1
            self._update_current_progress_remaining(self.num_timesteps, total_timesteps)
            
            # debug ===========================================================
            if mode == 'debug':
                print(['OPA.learn', 'num_timesteps:', self.num_timesteps, 'total_timesteps:', total_timesteps])

            # Display training infos
            if log_interval is not None and iteration % log_interval == 0:
                fps = int(self.num_timesteps / (time.time() - self.start_time))
                logger.record("time/iterations", iteration, exclude="tensorboard")
                if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
                    logger.record("rollout/ep_rew_mean", safe_mean([ep_info["r"] for ep_info in self.ep_info_buffer]))
                    logger.record("rollout/ep_len_mean", safe_mean([ep_info["l"] for ep_info in self.ep_info_buffer]))
                logger.record("time/fps", fps)
                logger.record("time/time_elapsed", int(time.time() - self.start_time), exclude="tensorboard")
                logger.record("time/total_timesteps", self.num_timesteps, exclude="tensorboard")
                logger.dump(step=self.num_timesteps)
            # debug ===============================================================
            if mode == 'debug':
                print(['OPA.learn finished, ready to OPA.train'])
            self.train()

        callback.on_training_end()

        return self
Example #3
    def learn(
        self,
        total_timesteps: int,
        callback: MaybeCallback = None,
        log_interval: int = 1,
        eval_env: Optional[GymEnv] = None,
        eval_freq: int = -1,
        n_eval_episodes: int = 5,
        tb_log_name: str = "OnPolicyAlgorithm",
        eval_log_path: Optional[str] = None,
        reset_num_timesteps: bool = True,
    ) -> "OnPolicyAlgorithm":
        iteration = 0

        total_timesteps, callback = self._setup_learn(
            total_timesteps, eval_env, callback, eval_freq, n_eval_episodes, eval_log_path, reset_num_timesteps, tb_log_name
        )

        callback.on_training_start(locals(), globals())

        while self.num_timesteps < total_timesteps:

            continue_training = self.collect_rollouts(self.env, callback, self.rollout_buffer, n_rollout_steps=self.n_steps)

            if continue_training is False:
                break

            iteration += 1
            self._update_current_progress_remaining(self.num_timesteps, total_timesteps)

            # Display training infos
            if log_interval is not None and iteration % log_interval == 0:
                self.fps = int(self.num_timesteps / (time.time() - self.start_time))
                logger.record("time/iterations", iteration, exclude="tensorboard")
                if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
                    # logger.record("rollout/ep_rew_mean", safe_mean([ep_info["r"] for ep_info in self.ep_info_buffer]))
                    # logger.record("rollout/ep_len_mean", safe_mean([ep_info["l"] for ep_info in self.ep_info_buffer]))
                    logger.record("rollout/ep_reward_mean", safe_mean([ep_info['r'] for ep_info in self.ep_info_buffer]))
                    logger.record("rollout/ep_len_mean", safe_mean([ep_info['l'] for ep_info in self.ep_info_buffer]))
                if len(self.specific_reward_info_buffer) > 0 and len(self.specific_reward_info_buffer[0]) > 0:
                    logger.record('rollout/mimic_qpos_reward', safe_mean([specific_reward_info['mimic_qpos_reward'] for specific_reward_info in self.specific_reward_info_buffer]))
                    logger.record('rollout/mimic_qvel_reward', safe_mean([specific_reward_info['mimic_qvel_reward'] for specific_reward_info in self.specific_reward_info_buffer]))
                    #logger.record('rollout/mimic_ee_reward', safe_mean([specific_reward_info['mimic_ee_reward'] for specific_reward_info in self.specific_reward_info_buffer]))
                    logger.record('rollout/mimic_body_orientation_reward', safe_mean([specific_reward_info['mimic_body_orientation_reward'] for specific_reward_info in self.specific_reward_info_buffer]))
                    logger.record('rollout/mimic_body_reward', safe_mean([specific_reward_info['mimic_body_reward'] for specific_reward_info in self.specific_reward_info_buffer]))
                    logger.record('rollout/mimic_body_vel_reward', safe_mean([specific_reward_info['mimic_body_vel_reward'] for specific_reward_info in self.specific_reward_info_buffer]))
                    logger.record('rollout/mimic_contact_reward', safe_mean([specific_reward_info['mimic_contact_reward'] for specific_reward_info in self.specific_reward_info_buffer]))
                logger.record("time/fps", self.fps)
                logger.record("time/time_elapsed", int(time.time() - self.start_time), exclude="tensorboard")
                logger.record("time/total_timesteps", self.num_timesteps, exclude="tensorboard")
                logger.dump(step=self.num_timesteps)

            self.train()

        callback.on_training_end()

        return self
Example #4
    def learn(
        self,
        total_timesteps: int,
        callback: MaybeCallback = None,
        log_interval: int = 4,
        eval_env: Optional[GymEnv] = None,
        eval_freq: int = -1,
        n_eval_episodes: int = 100,
        tb_log_name: str = "run",
        eval_log_path: Optional[str] = None,
        reset_num_timesteps: bool = True,
        use_trajectory_buffer: bool=False
    ) -> "OffPolicyAlgorithm":

        total_timesteps, callback = self._setup_learn(
            total_timesteps, eval_env, callback, eval_freq, n_eval_episodes, eval_log_path, reset_num_timesteps, tb_log_name
        )

        callback.on_training_start(locals(), globals())
        if use_trajectory_buffer:
            buffer = self.trajectory_buffer
        else:
            buffer = self.replay_buffer
        while self.num_timesteps < total_timesteps:
            if use_trajectory_buffer:
                buffer = self.trajectory_buffer
            else:
                buffer = self.replay_buffer
                
            ms = get_ms()
            rollout = self.collect_rollouts(
                self.env,
                train_freq=self.train_freq,
                action_noise=self.action_noise,
                callback=callback,
                learning_starts=self.learning_starts,
                buffer=buffer,
                log_interval=log_interval,
            )
            # print("collect_time: ", get_ms()-ms)


            if rollout.continue_training is False:
                break
            
            if self.num_timesteps > 0 and self.num_timesteps > self.learning_starts:
                # ms = get_ms()
                # If no `gradient_steps` is specified,
                # do as many gradients steps as steps performed during the rollout
                gradient_steps = self.gradient_steps if self.gradient_steps > 0 else rollout.episode_timesteps
                self.train(batch_size=self.batch_size, gradient_steps=gradient_steps)
                # print('train_time: ', get_ms() - ms)
                # exit()
        callback.on_training_end()

        return self
Example #5
    def learn(self,
              total_timesteps: int,
              callback: MaybeCallback = None,
              log_interval: int = 1,
              eval_env: Optional[GymEnv] = None,
              eval_freq: int = -1,
              n_eval_episodes: int = 5,
              tb_log_name: str = "PPO",
              eval_log_path: Optional[str] = None,
              reset_num_timesteps: bool = True) -> 'PPO':

        iteration = 0
        callback = self._setup_learn(eval_env, callback, eval_freq,
                                     n_eval_episodes, eval_log_path, reset_num_timesteps)

        # if self.tensorboard_log is not None and SummaryWriter is not None:
        #     self.tb_writer = SummaryWriter(log_dir=os.path.join(self.tensorboard_log, tb_log_name))

        callback.on_training_start(locals(), globals())

        while self.num_timesteps < total_timesteps:

            continue_training = self.collect_rollouts(self.env, callback,
                                                      self.rollout_buffer,
                                                      n_rollout_steps=self.n_steps)

            if continue_training is False:
                break

            iteration += 1
            self._update_current_progress(self.num_timesteps, total_timesteps)

            # Display training infos
            if self.verbose >= 1 and log_interval is not None and iteration % log_interval == 0:
                fps = int(self.num_timesteps / (time.time() - self.start_time))
                logger.logkv("iterations", iteration)
                if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
                    logger.logkv('ep_rew_mean', self.safe_mean([ep_info['r'] for ep_info in self.ep_info_buffer]))
                    logger.logkv('ep_len_mean', self.safe_mean([ep_info['l'] for ep_info in self.ep_info_buffer]))
                logger.logkv("fps", fps)
                logger.logkv('time_elapsed', int(time.time() - self.start_time))
                logger.logkv("total timesteps", self.num_timesteps)
                logger.dumpkvs()

            self.train(self.n_epochs, batch_size=self.batch_size)

            # For tensorboard integration
            # if self.tb_writer is not None:
            #     self.tb_writer.add_scalar('Eval/reward', mean_reward, self.num_timesteps)

        callback.on_training_end()

        return self
Example #6
    def learn(
        self,
        total_timesteps: int,
        callback: MaybeCallback = None,
        log_interval: int = 4,
        eval_env: Optional[GymEnv] = None,
        eval_freq: int = -1,
        n_eval_episodes: int = 5,
        tb_log_name: str = "run",
        eval_log_path: Optional[str] = None,
        reset_num_timesteps: bool = True,
    ) -> "OffPolicyAlgorithm":

        total_timesteps, callback = self._setup_learn(
            total_timesteps,
            eval_env,
            callback,
            eval_freq,
            n_eval_episodes,
            eval_log_path,
            reset_num_timesteps,
            tb_log_name,
        )

        callback.on_training_start(locals(), globals())

        while self.num_timesteps < total_timesteps:
            rollout = self.collect_rollouts(
                self.env,
                train_freq=self.train_freq,
                action_noise=self.action_noise,
                callback=callback,
                learning_starts=self.learning_starts,
                replay_buffer=self.replay_buffer,
                log_interval=log_interval,
            )

            if rollout.continue_training is False:
                break

            if self.num_timesteps > 0 and self.num_timesteps > self.learning_starts:
                # If no `gradient_steps` is specified,
                # do as many gradients steps as steps performed during the rollout
                gradient_steps = self.gradient_steps if self.gradient_steps >= 0 else rollout.episode_timesteps
                # Special case when the user passes `gradient_steps=0`
                if gradient_steps > 0:
                    self.train(batch_size=self.batch_size,
                               gradient_steps=gradient_steps)

        callback.on_training_end()

        return self
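Example #6 resolves gradient_steps with >= 0 rather than > 0, so that 0 explicitly disables training for the iteration while a negative value means "as many gradient steps as environment steps collected". A minimal standalone sketch of that convention (function and variable names are illustrative, not taken from any project above):

def resolve_gradient_steps(configured: int, rollout_steps: int) -> int:
    # configured >  0 -> run a fixed number of gradient steps
    # configured == 0 -> skip training this iteration
    # configured <  0 -> match the number of steps collected in the rollout
    return configured if configured >= 0 else rollout_steps

# resolve_gradient_steps(-1, 64) == 64; resolve_gradient_steps(0, 64) == 0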
Example #7
    def learn(
        self,
        total_timesteps: int,
        callback: MaybeCallback = None,
        log_interval: int = 4,
        eval_env: Optional[GymEnv] = None,
        eval_freq: int = -1,
        n_eval_episodes: int = 5,
        tb_log_name: str = "HER",
        eval_log_path: Optional[str] = None,
        reset_num_timesteps: bool = True,
    ) -> BaseAlgorithm:

        total_timesteps, callback = self._setup_learn(
            total_timesteps, eval_env, callback, eval_freq, n_eval_episodes,
            eval_log_path, reset_num_timesteps, tb_log_name)
        self.model.start_time = self.start_time
        self.model.ep_info_buffer = self.ep_info_buffer
        self.model.ep_success_buffer = self.ep_success_buffer
        self.model.num_timesteps = self.num_timesteps
        self.model._episode_num = self._episode_num
        self.model._last_obs = self._last_obs
        self.model._total_timesteps = self._total_timesteps

        callback.on_training_start(locals(), globals())

        while self.num_timesteps < total_timesteps:

            rollout = self.collect_rollouts(
                self.env,
                n_episodes=self.n_episodes_rollout,
                n_steps=self.train_freq,
                action_noise=self.action_noise,
                callback=callback,
                learning_starts=self.learning_starts,
                log_interval=log_interval,
            )

            if rollout.continue_training is False:
                break

            if self.num_timesteps > 0 and self.num_timesteps > self.learning_starts and self.replay_buffer.size() > 0:
                # If no `gradient_steps` is specified,
                # do as many gradients steps as steps performed during the rollout
                gradient_steps = self.gradient_steps if self.gradient_steps > 0 else rollout.episode_timesteps
                self.train(batch_size=self.batch_size,
                           gradient_steps=gradient_steps)

        callback.on_training_end()

        return self
Example #8
File: dqn.py  Project: atulad/AFRL_Copy
    def learn(
        self,
        total_timesteps: int,
        callback: MaybeCallback = None,
        log_interval: int = 4,
        eval_env: Optional[GymEnv] = None,
        eval_freq: int = -1,
        n_eval_episodes: int = 5,
        tb_log_name: str = "run",
        eval_log_path: Optional[str] = None,
        reset_num_timesteps: bool = True,
    ) -> "OffPolicyAlgorithm":

        total_timesteps, callback = self._setup_learn(
            total_timesteps, eval_env, callback, eval_freq, n_eval_episodes,
            eval_log_path, reset_num_timesteps, tb_log_name)

        callback.on_training_start(locals(), globals())

        # AF (could put this in callback.on_training_start)
        self.episode_forecast = []
        self.episode_forecasts = []
        self.plan, self.forecasts = self._replan(
            self._last_obs[0], self.empty_plan(),
            self.zero_forecasts)  # VecEnv resets automatically
        # AF END

        while self.num_timesteps < total_timesteps:
            rollout = self.collect_rollouts(
                self.env,
                train_freq=self.train_freq,
                action_noise=self.action_noise,
                callback=callback,
                learning_starts=self.learning_starts,
                replay_buffer=self.replay_buffer,
                log_interval=log_interval,
            )

            if rollout.continue_training is False:
                break

            if self.num_timesteps > 0 and self.num_timesteps > self.learning_starts:
                # If no `gradient_steps` is specified,
                # do as many gradients steps as steps performed during the rollout
                gradient_steps = self.gradient_steps if self.gradient_steps > 0 else rollout.episode_timesteps
                self.train(batch_size=self.batch_size,
                           gradient_steps=gradient_steps)

        callback.on_training_end()

        return self
Example #9
    def learn(
        self,
        total_timesteps: int,
        callback: MaybeCallback = None,
        log_interval: int = 4,
        eval_env: Optional[GymEnv] = None,
        eval_freq: int = -1,
        n_eval_episodes: int = 5,
        tb_log_name: str = "run",
        eval_log_path: Optional[str] = None,
        reset_num_timesteps: bool = True,
    ) -> "OffPolicyAlgorithm":

        total_timesteps, callback = self._setup_learn(
            total_timesteps, eval_env, callback, eval_freq, n_eval_episodes, eval_log_path, reset_num_timesteps, tb_log_name
        )

        callback.on_training_start(locals(), globals())
        last_tested = 0

        while self.num_timesteps < total_timesteps:
            rollout = self.collect_rollouts(
                self.env,
                train_freq=self.train_freq,
                action_noise=self.action_noise,
                callback=callback,
                learning_starts=self.learning_starts,
                replay_buffer=self.replay_buffer,
                log_interval=log_interval,
            )
            for e in self.env.envs:
                e.env.train_return = rollout.episode_reward

            if rollout.continue_training is False:
                break
            if self.num_timesteps > 0 and self.num_timesteps > self.learning_starts:
                # If no `gradient_steps` is specified,
                # do as many gradients steps as steps performed during the rollout
                last_tested += 1
                gradient_steps = self.gradient_steps if self.gradient_steps > 0 else rollout.episode_timesteps
                self.train(batch_size=self.batch_size, gradient_steps=gradient_steps)
                if last_tested > 5:
                    last_tested = 0
                    test_return = self.test(num_episodes=3)
                    logger.record("rollout/test_rew_mean", test_return)

        callback.on_training_end()

        return self
Example #10
    def learn(self,
              total_timesteps: int,
              callback: MaybeCallback = None,
              log_interval: int = 4,
              eval_env: Optional[GymEnv] = None,
              eval_freq: int = -1,
              n_eval_episodes: int = 5,
              tb_log_name: str = "AWAC",
              eval_log_path: Optional[str] = None,
              reset_num_timesteps: bool = True) -> OffPolicyAlgorithm:

        total_timesteps, callback = self._setup_learn(
            total_timesteps, eval_env, callback, eval_freq, n_eval_episodes,
            eval_log_path, reset_num_timesteps, tb_log_name)
        callback.on_training_start(locals(), globals())

        self.pretrain_bc(int(1e3), batch_size=self.batch_size)
        observations, actions, next_observations, rewards, dones = self.bc_buffer.observations, self.bc_buffer.actions, self.bc_buffer.next_observations, self.bc_buffer.rewards, self.bc_buffer.dones
        for data in zip(observations, next_observations, actions, rewards,
                        dones):
            self.replay_buffer.add(*data)
        self.pretrain_rl(int(1e4), batch_size=self.batch_size)

        while self.num_timesteps < total_timesteps:
            rollout = self.collect_rollouts(
                self.env,
                n_episodes=self.n_episodes_rollout,
                n_steps=self.train_freq,
                action_noise=self.action_noise,
                callback=callback,
                learning_starts=self.learning_starts,
                replay_buffer=self.replay_buffer,
                log_interval=log_interval)

            if rollout.continue_training is False:
                break

            self._update_current_progress_remaining(self.num_timesteps,
                                                    total_timesteps)

            if self.num_timesteps > 0 and self.num_timesteps > self.learning_starts:
                gradient_steps = self.gradient_steps if self.gradient_steps > 0 else rollout.episode_timesteps
                self.train(gradient_steps, batch_size=self.batch_size)

        callback.on_training_end()
        return self
Example #11
    def learn(
        self,
        total_timesteps: int,
        callback: MaybeCallback = None,
        log_interval: int = 1,
        eval_env: Optional[GymEnv] = None,
        eval_freq: int = -1,
        n_eval_episodes: int = 5,
        tb_log_name: str = "OnPolicyAlgorithm",
        eval_log_path: Optional[str] = None,
        reset_num_timesteps: bool = True,
    ) -> "OnPolicyAlgorithm":
        iteration = 0

        total_timesteps, callback = self._setup_learn(
            total_timesteps, eval_env, callback, eval_freq, n_eval_episodes,
            eval_log_path, reset_num_timesteps, tb_log_name)

        callback.on_training_start(locals(), globals())

        while self.num_timesteps < total_timesteps:

            continue_training = self.collect_rollouts(
                self.env,
                callback,
                self.rollout_buffer,
                n_rollout_steps=self.n_steps)

            if continue_training is False:
                break

            iteration += 1
            self._update_current_progress_remaining(self.num_timesteps,
                                                    total_timesteps)

            # Display training infos
            if log_interval is not None and iteration % log_interval == 0:
                fps = int(self.num_timesteps / (time.time() - self.start_time))
                logger.record("time/iterations",
                              iteration,
                              exclude="tensorboard")
                if len(self.ep_info_buffer) > 0 and len(
                        self.ep_info_buffer[0]) > 0:
                    logger.record(
                        "rollout/ep_rew_mean",
                        safe_mean(
                            [ep_info["r"] for ep_info in self.ep_info_buffer]))
                    logger.record(
                        "rollout/ep_len_mean",
                        safe_mean(
                            [ep_info["l"] for ep_info in self.ep_info_buffer]))

                    for k in self.ep_info_buffer[0].keys():
                        if k not in "lrt":
                            logger.record(
                                f"progress/{k}",
                                safe_mean([
                                    ep_info[k]
                                    for ep_info in self.ep_info_buffer
                                ]))

                logger.record("time/fps", fps)
                logger.record("time/time_elapsed",
                              int(time.time() - self.start_time),
                              exclude="tensorboard")
                logger.record("time/total_timesteps",
                              self.num_timesteps,
                              exclude="tensorboard")
                logger.dump(step=self.num_timesteps)

                if iteration % (log_interval *
                                10) == 0:  #save parameters every 10 log steps
                    self.save('./interim_trained_models/')

            self.train()

        callback.on_training_end()

        return self
Example #12
    def learn(
        self,
        total_timesteps: int,
        callback: MaybeCallback = None,
        log_interval: int = 1,
        eval_env: Optional[GymEnv] = None,
        eval_freq: int = -1,
        n_eval_episodes: int = 5,
        tb_log_name: str = "PPO",
        eval_log_path: Optional[str] = None,
        reset_num_timesteps: bool = True,
    ) -> "OnPolicyAlgorithm":
        iteration = 0

        total_timesteps, callback = self._setup_learn(
            total_timesteps, eval_env, callback, eval_freq, n_eval_episodes,
            eval_log_path, reset_num_timesteps, tb_log_name)

        callback.on_training_start(locals(), globals())

        while self.num_timesteps < total_timesteps:
            """Replay buffer size"""
            ### No need to use a larger buffer, because that doesn't solve the catastrophic forgetting problem.
            ### For this experiment, just counting the best score is enough.
            # Determine buffer size using safe_mean([ep_info["r"] for ep_info in self.ep_info_buffer])
            # I want it to be stable when learned walking.
            # Start with small buffer, once
            # ep_len_mean = safe_mean([ep_info["l"] for ep_info in self.ep_info_buffer])
            # if ep_len_mean>=1000:
            #     self.use_small_buffer = False
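            # NOTE: `args` and `output` below are not defined in this snippet; they
            # appear to be a module-level config object and logging helper in the source project.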

            if not args.single and self.use_small_buffer:
                output(
                    f"Collect rollouts for {self.n_steps//self.env.num_envs} steps.",
                    2)
                continue_training = self.collect_rollouts(
                    self.env,
                    callback,
                    self.rollout_buffer_small,
                    n_rollout_steps=self.n_steps // self.env.num_envs)
            else:
                output(f"Collect rollouts for {self.n_steps} steps.", 2)
                continue_training = self.collect_rollouts(
                    self.env,
                    callback,
                    self.rollout_buffer,
                    n_rollout_steps=self.n_steps)

            if continue_training is False:
                break

            iteration += 1
            self._update_current_progress_remaining(self.num_timesteps,
                                                    total_timesteps)

            # Display training infos
            if log_interval is not None and iteration % log_interval == 0:
                fps = int(self.num_timesteps / (time.time() - self.start_time))
                logger.record("time/iterations",
                              iteration,
                              exclude="tensorboard")
                if len(self.ep_info_buffer) > 0 and len(
                        self.ep_info_buffer[0]) > 0:
                    logger.record(
                        "rollout/ep_rew_mean",
                        safe_mean(
                            [ep_info["r"] for ep_info in self.ep_info_buffer]))
                    logger.record(
                        "rollout/ep_len_mean",
                        safe_mean(
                            [ep_info["l"] for ep_info in self.ep_info_buffer]))
                logger.record("time/fps", fps)
                logger.record("time/time_elapsed",
                              int(time.time() - self.start_time),
                              exclude="tensorboard")
                logger.record("time/total_timesteps",
                              self.num_timesteps,
                              exclude="tensorboard")
                logger.dump(step=self.num_timesteps)

            self.train()

        callback.on_training_end()

        return self
Example #13
    def learn(
        self,
        total_timesteps: int,
        callback: MaybeCallback = None,
        log_interval: int = 4,
        eval_env: Optional[GymEnv] = None,
        eval_freq: int = -1,
        n_eval_episodes: int = 5,
        tb_log_name: str = "run",
        eval_log_path: Optional[str] = None,
        reset_num_timesteps: bool = True,
    ) -> "OffPolicyAlgorithm":

        total_timesteps, callback = self._setup_learn(
            total_timesteps, eval_env, callback, eval_freq, n_eval_episodes,
            eval_log_path, reset_num_timesteps, tb_log_name)

        callback.on_training_start(locals(), globals())

        # train vae
        print("Train VAE...")
        while self.num_timesteps < total_timesteps:
            rollout = self.collect_rollouts(
                self.env,
                train_freq=self.train_freq,
                action_noise=self.action_noise,
                callback=callback,
                learning_starts=self.learning_starts,
                replay_buffer=self.replay_buffer,
                log_interval=log_interval,
            )

            if rollout.continue_training is False:
                break

            if self.num_timesteps > 0 and self.num_timesteps > self.learning_starts:
                # If no `gradient_steps` is specified,
                # do as many gradients steps as steps performed during the rollout
                print("T VAE")
                gradient_steps = self.gradient_steps if self.gradient_steps > 0 else rollout.episode_timesteps
                self.train_vae(batch_size=self.batch_size,
                               gradient_steps=gradient_steps)
            """
            gradient_steps = self.gradient_steps if self.gradient_steps > 0 else rollout.episode_timesteps
            self.train_vae(batch_size=self.batch_size, gradient_steps=gradient_steps)
            """
        # train mdnrnn
        print("Train MDNRNN...")
        self.replay_buffer = ReplayBufferAD(
            self.buffer_size,
            self.observation_space,
            self.action_space,
            self.device,
            optimize_memory_usage=self.optimize_memory_usage,
        )

        total_timesteps = 30
        while self.num_timesteps < total_timesteps:
            rollout = self.collect_rollouts(
                self.env,
                train_freq=self.train_freq,
                action_noise=self.action_noise,
                callback=callback,
                learning_starts=self.learning_starts,
                replay_buffer=self.replay_buffer,
                log_interval=log_interval,
            )

            if rollout.continue_training is False:
                break

            if self.num_timesteps > 0 and self.num_timesteps > self.learning_starts:
                # If no `gradient_steps` is specified,
                # do as many gradients steps as steps performed during the rollout
                print("T MDNRNN")
                gradient_steps = self.gradient_steps if self.gradient_steps > 0 else rollout.episode_timesteps
                self.train_mdnrnn(batch_size=self.batch_size,
                                  gradient_steps=gradient_steps)
            """
            gradient_steps = self.gradient_steps if self.gradient_steps > 0 else rollout.episode_timesteps
            self.train_mdnrnn(batch_size=self.batch_size, gradient_steps=gradient_steps)
            """
        # train controller
        print("Train Controller...")
        p_queue = Queue()
        r_queue = Queue()
        e_queue = Queue()
        num_workers = 16

        for p_index in range(num_workers):
            Process(target=self.slave_routine,
                    args=(p_queue, r_queue, e_queue, p_index)).start()

        cur_best = None

        parameters = self.controller.parameters()
        es = cma.CMAEvolutionStrategy(flatten_parameters(parameters), 0.1,
                                      {'popsize': 4})

        epoch = 0
        log_step = 3
        while not es.stop():
            if cur_best is not None and -cur_best > 950:
                print("Already better than target, breaking...")
                break

            r_list = [0] * 4  # result list
            solutions = es.ask()
            # push parameters to queue
            i = 0
            for s_id, s in enumerate(solutions):
                for _ in range(4):
                    i += 1
                    p_queue.put((s_id, s))

            # retrieve results
            for _ in range(16):
                while r_queue.empty():
                    sleep(.1)
                r_s_id, r = r_queue.get()
                r_list[r_s_id] += r / 4

            es.tell(solutions, r_list)
            es.disp()
            # evaluation and saving
            if epoch % log_step == log_step - 1:
                best_params, best, std_best = self.evaluate(
                    p_queue, r_queue, solutions, r_list)
                print("Current evaluation: {}".format(best))
                if not cur_best or cur_best > best:
                    cur_best = best
                    print("Saving new best with value {}+-{}...".format(
                        -cur_best, std_best))
                    load_parameters(best_params, self.controller)
                if -best > 950:
                    print("Terminating controller training with value {}...".
                          format(best))
                    break

            epoch += 1

        es.result_pretty()
        e_queue.put('EOP')

        callback.on_training_end()

        return self
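The controller stage of Example #13 is driven by the ask/tell interface of the cma package. A self-contained sketch of that interface on a toy quadratic objective (the objective, dimension, and population size here are placeholders; Example #13 instead scores each candidate parameter vector with rollout returns from worker processes):

import cma

# Minimize a toy sum-of-squares objective with CMA-ES.
es = cma.CMAEvolutionStrategy(8 * [0.0], 0.5, {"popsize": 4})
while not es.stop():
    solutions = es.ask()                        # sample candidate solutions
    fitness = [float(sum(x * x for x in s)) for s in solutions]
    es.tell(solutions, fitness)                 # CMA-ES minimizes fitness
    es.disp()
es.result_pretty()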
Example #14
    def learn(
        self,
        total_timesteps: int,
        callback: MaybeCallback = None,
        log_interval: int = 1,
        eval_env: Optional[GymEnv] = None,
        eval_freq: int = -1,
        n_eval_episodes: int = 5,
        tb_log_name: str = "OnPolicyAlgorithm",
        eval_log_path: Optional[str] = None,
        reset_num_timesteps: bool = True,
    ) -> "OnPolicyAlgorithm":
        iteration = 0

        print('setup training')

        total_timesteps, callback = self._setup_learn(
            total_timesteps, eval_env, callback, eval_freq, n_eval_episodes,
            eval_log_path, reset_num_timesteps, tb_log_name)

        callback.on_training_start(locals(), globals())

        print(f'start training, total timesteps is {total_timesteps}')

        while self.num_timesteps < total_timesteps:

            print(f'num timesteps: {self.num_timesteps}/{total_timesteps}')
            print(f'collect rollouts, rollout steps = {self.n_steps}')

            continue_training = self.collect_rollouts(
                self.env,
                callback,
                self.rollout_buffer,
                n_rollout_steps=self.n_steps)

            if continue_training is False:
                print(
                    'stop training (only happens if callback on_step returns false)'
                )
                break

            iteration += 1
            self._update_current_progress_remaining(self.num_timesteps,
                                                    total_timesteps)

            # Display training infos
            print('display training infos')
            # print(f'len(self.ep_info_buffer)={len(self.ep_info_buffer)}, len(self.ep_info_buffer[0])={len(self.ep_info_buffer[0])}')

            if log_interval is not None and iteration % log_interval == 0:
                fps = int(self.num_timesteps / (time.time() - self.start_time))
                logger.record("time/iterations",
                              iteration,
                              exclude="tensorboard")
                if len(self.ep_info_buffer) > 0 and len(
                        self.ep_info_buffer[0]) > 0:
                    logger.record(
                        "rollout/ep_rew_mean",
                        safe_mean(
                            [ep_info["r"] for ep_info in self.ep_info_buffer]))
                    logger.record(
                        "rollout/ep_len_mean",
                        safe_mean(
                            [ep_info["l"] for ep_info in self.ep_info_buffer]))
                logger.record("time/fps", fps)
                logger.record("time/time_elapsed",
                              int(time.time() - self.start_time),
                              exclude="tensorboard")
                logger.record("time/total_timesteps",
                              self.num_timesteps,
                              exclude="tensorboard")
                logger.dump(step=self.num_timesteps)

            print('train')
            self.train()

        callback.on_training_end()

        return self
Example #15
    def learn(
        self,
        total_timesteps: int,
        callback: MaybeCallback = None,
        log_interval: int = 1,
        eval_env: Optional[GymEnv] = None,
        eval_freq: int = -1,
        n_eval_episodes: int = 5,
        tb_log_name: str = "OnPolicyAlgorithm",
        eval_log_path: Optional[str] = None,
        reset_num_timesteps: bool = True,
    ) -> "OnPolicyAlgorithm":
        iteration = 0

        total_timesteps, callback = self._setup_learn(
            total_timesteps, eval_env, callback, eval_freq, n_eval_episodes,
            eval_log_path, reset_num_timesteps, tb_log_name)

        callback.on_training_start(locals(), globals())

        while self.num_timesteps < total_timesteps:

            for partner_idx in range(self.policy.num_partners):
                try:
                    self.env.envs[0].switch_to_env(partner_idx)
                except:
                    pass
                continue_training = self.collect_rollouts(
                    self.env,
                    callback,
                    self.rollout_buffer[partner_idx],
                    n_rollout_steps=self.n_steps,
                    partner_idx=partner_idx)
            #continue_training = self.collect_rollouts(self.env, callback, self.rollout_buffer[partner_idx], n_rollout_steps=self.n_steps)

            if continue_training is False:
                break

            iteration += 1
            self._update_current_progress_remaining(self.num_timesteps,
                                                    total_timesteps)

            # Display training infos
            if log_interval is not None and iteration % log_interval == 0:
                fps = int(self.num_timesteps / (time.time() - self.start_time))
                logger.record("time/iterations",
                              iteration,
                              exclude="tensorboard")
                if len(self.ep_info_buffer) > 0 and len(
                        self.ep_info_buffer[0]) > 0:
                    logger.record(
                        "rollout/ep_rew_mean",
                        safe_mean(
                            [ep_info["r"] for ep_info in self.ep_info_buffer]))
                    logger.record(
                        "rollout/ep_len_mean",
                        safe_mean(
                            [ep_info["l"] for ep_info in self.ep_info_buffer]))
                logger.record("time/fps", fps)
                logger.record("time/time_elapsed",
                              int(time.time() - self.start_time),
                              exclude="tensorboard")
                logger.record("time/total_timesteps",
                              self.num_timesteps,
                              exclude="tensorboard")
                logger.dump(step=self.num_timesteps)

            self.train()

        callback.on_training_end()

        return self
Example #16
    def learn(
        self,
        total_timesteps: int,
        callback: MaybeCallback = None,
        log_interval: int = 1,
        eval_env: Optional[GymEnv] = None,
        eval_freq: int = -1,
        n_eval_episodes: int = 5,
        tb_log_name: str = "OnPolicyAlgorithm",
        eval_log_path: Optional[str] = None,
        reset_num_timesteps: bool = True,
    ) -> "OnPolicyAlgorithm":
        iteration = 0

        total_timesteps, callback = self._setup_learn(
            total_timesteps, eval_env, callback, eval_freq, n_eval_episodes,
            eval_log_path, reset_num_timesteps, tb_log_name)

        callback.on_training_start(locals(), globals())

        while self.num_timesteps < total_timesteps:

            # Collect n_steps (e.g. 512) number of steps. Total timesteps = n_steps * num_envs (e.g. 512 * 8 = 4096)
            # Hence each rollout has a total of 4096 timesteps
            continue_training = self.collect_rollouts(
                self.env,
                callback,
                self.rollout_buffer,
                n_rollout_steps=self.n_steps)

            if continue_training is False:
                break

            iteration += 1
            self._update_current_progress_remaining(self.num_timesteps,
                                                    total_timesteps)

            # Display training infos
            if log_interval is not None and iteration % log_interval == 0:
                fps = int(self.num_timesteps / (time.time() - self.start_time))
                logger.record("time/iterations",
                              iteration,
                              exclude="tensorboard")
                #logger.record("rollout/ep_rew_mean", safe_mean([goal_diff for goal_diff in self.ep_info_buffer]))
                #if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
                #    logger.record("rollout/ep_rew_mean", safe_mean([ep_info["r"] for ep_info in self.ep_info_buffer]))
                #    logger.record("rollout/ep_len_mean", safe_mean([ep_info["l"] for ep_info in self.ep_info_buffer]))
                #logger.record("time/fps", fps)
                logger.record("time/time_elapsed",
                              int(time.time() - self.start_time),
                              exclude="tensorboard")
                logger.record("time/total_timesteps",
                              self.num_timesteps,
                              exclude="tensorboard")
                logger.dump(step=self.num_timesteps)

            # Save model every 50 iterations
            if iteration > 0 and iteration % 50 == 0:
                # Save Pytorch Model locally
                if self.model_checkpoints_path is not None:
                    th.save(
                        self.policy.state_dict(),
                        self.model_checkpoints_path + f"/model_v{iteration}")

                    # Save Pytorch model to wandb local dir and upload to wandb cloud dashboard
                    if self.log_handler is not None:
                        self.log_handler.save(
                            self.model_checkpoints_path +
                            f"/model_v{iteration}",
                            base_path=self.model_checkpoints_path)

            # Save the best model if achieve a new high score
            if self.save_best_model:
                print(
                    f"Model achieved best score: {self.best_score} at iteration {iteration}"
                )

                # Save Pytorch Model locally
                if self.model_checkpoints_path is not None:
                    th.save(self.policy.state_dict(),
                            self.model_checkpoints_path + "/model_bestscore")

                    # Save Pytorch model to wandb local dir and upload to wandb cloud dashboard
                    if self.log_handler is not None:
                        self.log_handler.save(
                            self.model_checkpoints_path + "/model_bestscore",
                            base_path=self.model_checkpoints_path)

                self.save_best_model = False

            # PPO Training
            self.train()

        callback.on_training_end()

        return self
Example #17
    def learn(
        self,
        total_timesteps: int,
        callback: MaybeCallback = None,
        log_interval: int = 1,
        eval_env: Optional[GymEnv] = None,
        eval_freq: int = -1,
        n_eval_episodes: int = 5,
        tb_log_name: str = "OnPolicyAlgorithm",
        eval_log_path: Optional[str] = None,
        reset_num_timesteps: bool = True,
    ) -> "OnPolicyAlgorithm":
        iteration = 0

        total_timesteps, callback = self._setup_learn(
            total_timesteps, eval_env, callback, eval_freq, n_eval_episodes,
            eval_log_path, reset_num_timesteps, tb_log_name)

        callback.on_training_start(locals(), globals())

        from stable_baselines3.common.utils import obs_as_tensor, safe_mean
        import time
        while self.num_timesteps < total_timesteps:

            continue_training = self.collect_rollouts(
                self.env,
                callback,
                self.rollout_buffer,
                n_rollout_steps=self.n_steps)

            if continue_training is False:
                break

            iteration += 1
            self._update_current_progress_remaining(self.num_timesteps,
                                                    total_timesteps)

            # Display training infos
            if log_interval is not None and iteration % log_interval == 0:
                fps = int((self.num_timesteps - self._num_timesteps_at_start) /
                          (time.time() - self.start_time))
                self.logger.record("time/iterations",
                                   iteration,
                                   exclude="tensorboard")
                if len(self.ep_info_buffer) > 0 and len(
                        self.ep_info_buffer[0]) > 0:
                    self.logger.record(
                        "rollout/ep_rew_mean",
                        safe_mean(
                            [ep_info["r"] for ep_info in self.ep_info_buffer]))
                    self.logger.record(
                        "rollout/ep_len_mean",
                        safe_mean(
                            [ep_info["l"] for ep_info in self.ep_info_buffer]))
                self.logger.record("time/fps", fps)
                self.logger.record("time/time_elapsed",
                                   int(time.time() - self.start_time),
                                   exclude="tensorboard")
                self.logger.record("time/total_timesteps",
                                   self.num_timesteps,
                                   exclude="tensorboard")
                # [RLA] set timesteps
                time_step_holder.set_time(self.num_timesteps)
                self.logger.dump()

            self.train()

        callback.on_training_end()

        return self
Example #18
    def learn(
        self,
        total_timesteps: int,
        callback: MaybeCallback = None,
        log_interval: int = 1,
        eval_env: Optional[GymEnv] = None,
        eval_freq: int = -1,
        n_eval_episodes: int = 5,
        tb_log_name: str = "OnPolicyAlgorithm",
        eval_log_path: Optional[str] = None,
        reset_num_timesteps: bool = True,
    ) -> "OnPolicyAlgorithm":
        iteration = 0

        total_timesteps, callback = self._setup_learn(
            total_timesteps, eval_env, callback, eval_freq, n_eval_episodes,
            eval_log_path, reset_num_timesteps, tb_log_name)

        callback.on_training_start(locals(), globals())

        while self.num_timesteps < total_timesteps:

            rollout = self.collect_rollouts(
                self.env,
                n_episodes=-1,
                n_steps=1,
                action_noise=self.action_noise,
                callback=callback,
                learning_starts=self.learning_starts,
                replay_buffer=self.replay_buffer,
                log_interval=log_interval,
            )

            if rollout.continue_training is False:
                break

            iteration += 1
            self._update_current_progress_remaining(self.num_timesteps,
                                                    total_timesteps)

            # Display training infos
            if log_interval is not None and iteration % log_interval == 0:
                fps = int(self.num_timesteps / (time.time() - self.start_time))
                logger.record("time/iterations",
                              iteration,
                              exclude="tensorboard")
                if len(self.ep_info_buffer) > 0 and len(
                        self.ep_info_buffer[0]) > 0:
                    logger.record(
                        "rollout/ep_rew_mean",
                        safe_mean(
                            [ep_info["r"] for ep_info in self.ep_info_buffer]))
                    logger.record(
                        "rollout/ep_len_mean",
                        safe_mean(
                            [ep_info["l"] for ep_info in self.ep_info_buffer]))
                logger.record("time/fps", fps)
                logger.record("time/time_elapsed",
                              int(time.time() - self.start_time),
                              exclude="tensorboard")
                logger.record("time/total_timesteps",
                              self.num_timesteps,
                              exclude="tensorboard")
                logger.dump(step=self.num_timesteps)

            if self.num_timesteps > 0 and self.num_timesteps > self.learning_starts:
                # If no `gradient_steps` is specified,
                # do as many gradients steps as steps performed during the rollout
                self.train(gradient_steps=1, batch_size=self.batch_size)

        callback.on_training_end()

        return self
Example #19
    def learn(
        self,
        total_timesteps: int,
        callback: MaybeCallback = None,
        log_interval: int = 1,
        eval_env: Optional[GymEnv] = None,
        eval_freq: int = -1,
        n_eval_episodes: int = 5,
        tb_log_name: str = "OnPolicyAlgorithm",
        eval_log_path: Optional[str] = None,
        reset_num_timesteps: bool = True,
        param_noise: bool = False,
        sigma: float = 0.1,
    ) -> "OnPolicyAlgorithm":
        iteration = 0

        total_timesteps, callback = self._setup_learn(
            total_timesteps, eval_env, callback, eval_freq, n_eval_episodes,
            eval_log_path, reset_num_timesteps, tb_log_name)

        callback.on_training_start(locals(), globals())

        while self.num_timesteps < total_timesteps:

            # during rollout we collect batches of states and rewards
            continue_training = self.collect_rollouts(
                self.env,
                callback,
                self.rollout_buffer,
                n_rollout_steps=self.n_steps,
                param_noise=param_noise,
                sigma=sigma)

            if continue_training is False:
                break

            iteration += 1
            self._update_current_progress_remaining(self.num_timesteps,
                                                    total_timesteps)

            # Display training infos
            if log_interval is not None and iteration % log_interval == 0:
                fps = int(self.num_timesteps / (time.time() - self.start_time))
                logger.record("time/iterations",
                              iteration,
                              exclude="tensorboard")
                if len(self.ep_info_buffer) > 0 and len(
                        self.ep_info_buffer[0]) > 0:
                    logger.record(
                        "rollout/ep_rew_mean",
                        safe_mean(
                            [ep_info["r"] for ep_info in self.ep_info_buffer]))
                    logger.record(
                        "rollout/ep_len_mean",
                        safe_mean(
                            [ep_info["l"] for ep_info in self.ep_info_buffer]))
                logger.record("time/fps", fps)
                logger.record("time/time_elapsed",
                              int(time.time() - self.start_time),
                              exclude="tensorboard")
                logger.record("time/total_timesteps",
                              self.num_timesteps,
                              exclude="tensorboard")
                logger.dump(step=self.num_timesteps)

            # during training gradient descent is done
            self.train(param_noise, sigma)

            if param_noise:
                sigma = self.update_sigma(sigma)
                # print("current_sigma")
                # print(sigma)

        callback.on_training_end()

        return self
Example #20
    def learn(
        self,
        total_timesteps: int,
        callback: MaybeCallback = None,
        log_interval: int = 1,
        eval_env: Optional[GymEnv] = None,
        eval_freq: int = -1,
        n_eval_episodes: int = 5,
        tb_log_name: str = "OnPolicyAlgorithm",
        eval_log_path: Optional[str] = None,
        reset_num_timesteps: bool = True,
        parameter_noise: bool = False,
    ) -> "OnPolicyAlgorithm":
        iteration = 0

        total_timesteps, callback = self._setup_learn(
            total_timesteps, eval_env, callback, eval_freq, n_eval_episodes,
            eval_log_path, reset_num_timesteps, tb_log_name)

        callback.on_training_start(locals(), globals())

        # initializing value of noise std
        current_sigma = 1.0
        while self.num_timesteps < total_timesteps:

            continue_training = self.collect_rollouts(
                self.env,
                callback,
                self.rollout_buffer,
                n_rollout_steps=self.n_steps,
                parameter_noise=parameter_noise,
                sigma=0.5)

            if continue_training is False:
                break

            iteration += 1
            self._update_current_progress_remaining(self.num_timesteps,
                                                    total_timesteps)

            # Display training infos
            if log_interval is not None and iteration % log_interval == 0:
                fps = int(self.num_timesteps / (time.time() - self.start_time))
                logger.record("time/iterations",
                              iteration,
                              exclude="tensorboard")
                if len(self.ep_info_buffer) > 0 and len(
                        self.ep_info_buffer[0]) > 0:
                    logger.record(
                        "rollout/ep_rew_mean",
                        safe_mean(
                            [ep_info["r"] for ep_info in self.ep_info_buffer]))
                    logger.record(
                        "rollout/ep_len_mean",
                        safe_mean(
                            [ep_info["l"] for ep_info in self.ep_info_buffer]))
                logger.record("time/fps", fps)
                logger.record("time/time_elapsed",
                              int(time.time() - self.start_time),
                              exclude="tensorboard")
                logger.record("time/total_timesteps",
                              self.num_timesteps,
                              exclude="tensorboard")
                logger.dump(step=self.num_timesteps)

            self.train()

            if parameter_noise:

                states = self.rollout_buffer.observations
                states = th.tensor(states)

                actions_unnoisy, values_unnoisy, log_prob_unnoisy = self.policy(
                    states, parameter_noise=False)
                actions_noisy, values_noisy, log_prob_noisy = self.policy(
                    states, parameter_noise=True, sigma=current_sigma)

                distance = th.sum((actions_unnoisy - actions_noisy)**2)**0.5

                distance_threshold = 1
                sigma_scalefactor = 1.01
                if distance > distance_threshold:
                    current_sigma /= sigma_scalefactor
                else:
                    current_sigma *= sigma_scalefactor

        callback.on_training_end()

        return self
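The sigma update at the end of Example #20 is the usual adaptive parameter-space-noise rule: shrink the perturbation scale when the noisy and non-noisy policies act too differently, grow it otherwise. A minimal sketch of just that update (the function name is illustrative; the threshold and scale factor mirror the constants in the example):

def adapt_sigma(sigma: float, distance: float,
                threshold: float = 1.0, scale: float = 1.01) -> float:
    # If the action distance exceeds the threshold, the perturbation is too
    # large, so shrink sigma; otherwise grow it.
    return sigma / scale if distance > threshold else sigma * scale

# adapt_sigma(1.0, distance=2.3) -> ~0.990; adapt_sigma(1.0, distance=0.4) -> 1.01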