Example #1
 def load_index(self, index, load_path=None):
     """Load the checkpoint at position `index` in self.def_path_pre, counting from the newest file."""
     # Sort checkpoint files by modification time, newest first
     file_list = os.listdir(self.def_path_pre)
     file_list.sort(key=lambda x: os.path.getmtime(os.path.join(self.def_path_pre, x)), reverse=True)
     if load_path is None:
         load_path = os.path.join(self.def_path_pre, file_list[index])
     load_variables(load_path=load_path, sess=self.sess)
     print('load_path: ', load_path)
Example #2
 def load_newest(self, load_path=None):
     """Load the most recently modified checkpoint in self.def_path_pre."""
     # Sort checkpoint files by modification time, oldest first; the last entry is the newest
     file_list = os.listdir(self.def_path_pre)
     file_list.sort(key=lambda x: os.path.getmtime(os.path.join(self.def_path_pre, x)))
     if load_path is None:
         load_path = os.path.join(self.def_path_pre, file_list[-1])
     load_variables(load_path=load_path, sess=self.sess)
     print('load_path: ', load_path)
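
Both loaders above pick a checkpoint purely by file modification time: load_index sorts newest-first and takes position `index`, while load_newest sorts oldest-first and takes the last entry, so load_index(0) and load_newest() resolve to the same file. The selection logic, stripped of the TensorFlow session, can be sketched as a standalone helper; the function and directory names below are illustrative, not part of the examples above.

    import os

    def newest_checkpoint(checkpoint_dir, index=0):
        """Return the path of the index-th newest file in checkpoint_dir (0 = newest)."""
        files = os.listdir(checkpoint_dir)
        files.sort(key=lambda f: os.path.getmtime(os.path.join(checkpoint_dir, f)),
                   reverse=True)
        return os.path.join(checkpoint_dir, files[index])

    # load_index(i) corresponds to newest_checkpoint(dir, i); load_newest() to newest_checkpoint(dir, 0)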
Example #3
    def learn(self,
              total_timesteps=int(1e6),
              log_interval=200,
              pretrain_load_path=None):
        """
        Parameters:
        -----------

        log_interval: int, specifies how frequently the logs are printed out (default: 100)

        pretrain_load_path: pre-train model load path

        """

        set_global_seeds(self.seed)
        # pretrain_load_path="/home/huangjp/CoppeliaSim_Edu_V4_0_0_Ubuntu18_04/programming/RL_Snake_Robot/algorithm/RL_algorithm/a2c/tmp/Y2020M06D29_h19m43s27"
        # pretrain_load_path="/home/huangjp/CoppeliaSim_Edu_V4_0_0_Ubuntu18_04/programming/RL_Snake_Robot/algorithm/RL_algorithm/a2c/tmp/Y2020M06D29_h20m18s25"
        # pretrain_load_path="/home/huangjp/CoppeliaSim_Edu_V4_0_0_Ubuntu18_04/programming/RL_Snake_Robot/algorithm/RL_algorithm/a2c/tmp/Y2020M07D11_h18m12s00"
        if pretrain_load_path is not None:
            load_variables(load_path=pretrain_load_path, sess=self.sess)

        # Instantiate the runner object
        runner = Runner(self.env, self, nsteps=self.nsteps, gamma=self.gamma)
        epinfobuf = deque(maxlen=100)
        # Calculate the batch_size
        nbatch = self.nenvs * self.nsteps

        # Start total timer
        tstart = time.time()

        with TensorboardWriter(self.graph, self.tb_log_path, 'A2C') as writer:
            # Each update consumes nbatch environment steps, so the total number of
            # updates is total_timesteps // nbatch (consistent with the fps and save logic below)
            for update in range(1, total_timesteps // nbatch + 1):
                # if update % learn_frequency != 0:
                #     runner.run()
                #     continue

                # Get mini batch of experiences
                obs, rewards, masks, actions, values, epinfos = runner.run()
                policy_loss, value_loss, policy_entropy, grads = self.train(
                    obs, rewards, masks, actions, values, update, writer)
                epinfobuf.extend(epinfos)
                nseconds = time.time() - tstart
                # Calculate the fps (frame per second)
                fps = int((update * nbatch) / nseconds)

                if writer is not None:
                    total_episode_reward_logger(
                        self.episode_reward,
                        rewards.reshape((self.nenvs, self.nsteps)),
                        masks.reshape((self.nenvs, self.nsteps)), writer,
                        update)

                if update % log_interval == 0 or update == 1:
                    # Check whether the value function is a good predictor of the returns (ev close to 1)
                    # or worse than predicting nothing (ev <= 0)
                    ev = explained_variance(values, rewards)
                    logger.record_tabular("nupdates", update)
                    logger.record_tabular("total_timesteps", update * nbatch)
                    logger.record_tabular("fps", fps)
                    logger.record_tabular("policy_entropy",
                                          float(policy_entropy))
                    logger.record_tabular("value_loss", float(value_loss))
                    logger.record_tabular("policy_loss", float(policy_loss))
                    logger.record_tabular("explained_variance", float(ev))
                    logger.record_tabular(
                        "eprewmean",
                        safe_mean([epinfo['r'] for epinfo in epinfobuf]))
                    logger.record_tabular(
                        "eplenmean",
                        safe_mean([epinfo['l'] for epinfo in epinfobuf]))
                    logger.dump_tabular()

                if update % 200 == 0 or update == total_timesteps // nbatch:
                    if self.model_save_path is None:
                        file_name = time.strftime('Y%YM%mD%d_h%Hm%Ms%S',
                                                  time.localtime(time.time()))
                        model_save_path = self.def_path_pre + file_name
                        self.save(model_save_path)
                    else:
                        self.save(self.model_save_path)

                    # print("grads = ", grads[-10:])

        return self
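
The loop's bookkeeping is simple arithmetic: each update gathers nbatch = nenvs * nsteps transitions from the Runner, so a budget of total_timesteps yields total_timesteps // nbatch updates, and fps is cumulative timesteps divided by elapsed time. A self-contained sketch of that bookkeeping follows; the numbers are chosen for illustration and are not taken from the example above.

    # Illustrative values, not values from Example #3
    nenvs, nsteps = 8, 5
    nbatch = nenvs * nsteps                      # 40 transitions collected per update
    total_timesteps = int(1e6)
    n_updates = total_timesteps // nbatch        # 25000 updates
    # After `update` updates completed in `nseconds` seconds:
    #     fps = int(update * nbatch / nseconds)
    print(nbatch, n_updates)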
Example #4
    def learn(self,
              total_timesteps=int(1e6),
              log_interval=4,
              reset_num_timesteps=True):
        pretrain_load_path = None
        # pretrain_load_path="/home/huangjp/CoppeliaSim_Edu_V4_0_0_Ubuntu18_04/programming/RL_Snake_Robot/algorithm/RL_algorithm/sac/tmp/Y2020M07D16_h11m15s56"
        # pretrain_load_path="/home/huangjp/CoppeliaSim_Edu_V4_0_0_Ubuntu18_04/programming/RL_Snake_Robot/algorithm/RL_algorithm/sac/tmp/Y2020M07D16_h18m36s16"
        if pretrain_load_path is not None:
            variables = self.params + self.target_params
            load_variables(load_path=pretrain_load_path,
                           variables=variables,
                           sess=self.sess)

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        with TensorboardWriter(self.graph,
                               self.tensorboard_log_path,
                               tb_log_name="SAC",
                               new_tb_log=new_tb_log) as writer:

            # Transform to callable if needed
            # self.learning_rate_scheduler = Scheduler(initial_value=self.learning_rate, n_values=total_timesteps,
            #                                          schedule=self.learning_rate_scheduler)
            start_time = time.time()
            episode_rewards = [0.0]
            episode_successes = []
            if self.action_noise is not None:
                self.action_noise.reset()
            # Retrieve unnormalized observation for saving into the buffer
            obs = self.env.reset()

            n_updates = 0
            infos_values = []

            for update in range(total_timesteps):
                # Before learning_start_threshold timesteps are collected, randomly sample
                # actions for better exploration; afterwards, use the learned policy.
                if self.num_timesteps < self.learning_start_threshold:
                    # Actions drawn from the action space lie in the environment-specific range,
                    # whereas the algorithm operates on tanh-squashed actions, so the commented-out
                    # alternative below would need scaling. The active line instead draws uniform
                    # samples in [0, 1) with the action-space shape for each environment.

                    # action = np.array([self.env.action_space.sample() for _ in range(self.n_envs)])
                    action = np.array([
                        np.random.random(self.env.action_space.shape)
                        for _ in range(self.n_envs)
                    ])
                    # action = scale_action(self.env.action_space, unscaled_action)
                else:
                    action, _, _, _ = self.policy.step(obs,
                                                       deterministic=False)
                    mu, std = self.policy.proba_step(obs)
                    if update % 500 == 0:
                        print("mu = ", mu, " , std = ", std)
                    # Add noise to the action (improve exploration,
                    # not needed in general)
                    if self.action_noise is not None:
                        action = action + self.action_noise()
                    # inferred actions need to be transformed to environment action_space before stepping
                    # unscaled_action = unscale_action(self.env.action_space, action)
                # print("action = ", action)
                assert action[0].shape == self.env.action_space.shape
                new_obs, reward, done, infos = self.env.step(action)
                self.num_timesteps += 1

                # Store only the unnormalized version
                obs_, new_obs_, reward_ = obs, new_obs, reward

                # Store transition in the replay buffer.
                self.replay_buffer.extend(obs_, action, reward_, new_obs_,
                                          done)
                obs = new_obs

                # Retrieve reward and episode length if using Monitor wrapper
                for info in infos:
                    maybe_ep_info = info.get('episode')
                    if maybe_ep_info:
                        self.ep_info_buf.append(maybe_ep_info)

                if writer is not None:
                    # Write reward per episode to tensorboard
                    ep_reward = np.array([reward_]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    tb_utils.total_episode_reward_logger(
                        self.episode_reward, ep_reward, ep_done, writer,
                        self.num_timesteps)

                if self.num_timesteps % self.train_freq == 0:

                    mb_infos_vals = []
                    # Update policy, critics and target networks
                    for grad_step in range(self.gradient_steps):
                        # Break if the warmup phase is not over
                        # or if there are not enough samples in the replay buffer
                        if not self.replay_buffer.can_sample(self.batch_size) \
                                or self.num_timesteps < self.learning_start_threshold:
                            break
                        n_updates += 1
                        # Compute current learning_rate
                        # current_lr = self.learning_rate_scheduler.value()
                        # Update policy and critics (q functions)
                        mb_infos_vals.append(self._train_step(update, writer))
                        # Update target network
                        if (update +
                                grad_step) % self.target_update_interval == 0:
                            # Update target network
                            self.sess.run(self.target_update_op)
                    # Log losses and entropy, useful for monitoring training
                    if len(mb_infos_vals) > 0:
                        infos_values = np.mean(mb_infos_vals, axis=0)

                # Only record the episode return of the first environment
                episode_rewards[-1] += reward_[0]

                if done[0]:
                    #     if self.action_noise is not None:
                    #         self.action_noise.reset()
                    #         obs = self.env.reset()
                    episode_rewards.append(0.0)
                    print("The first env's reward: ", episode_rewards[-2])

                if len(episode_rewards[-101:-1]) == 0:
                    mean_reward = -np.inf
                else:
                    mean_reward = round(
                        float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                # Display training infos
                if done[0] and log_interval is not None and len(
                        episode_rewards) % log_interval == 0:
                    fps = int(update / (time.time() - start_time))
                    logger.record_tabular("episodes", num_episodes)
                    logger.record_tabular("mean 100 episode reward",
                                          mean_reward)
                    if len(self.ep_info_buf) > 0 and len(
                            self.ep_info_buf[0]) > 0:
                        logger.record_tabular(
                            'ep_rewmean',
                            safe_mean([
                                ep_info['r'] for ep_info in self.ep_info_buf
                            ]))
                        logger.record_tabular(
                            'ep_lenmean',
                            safe_mean([
                                ep_info['l'] for ep_info in self.ep_info_buf
                            ]))
                    logger.record_tabular("n_updates", n_updates)
                    logger.record_tabular("fps", fps)
                    logger.record_tabular('time_elapsed',
                                          int(time.time() - start_time))
                    # if len(episode_successes) > 0:
                    #     logger.record_tabular("success rate", episode_successes / num_episodes)
                    if len(infos_values) > 0:
                        for (name, val) in zip(self.infos_names, infos_values):
                            logger.record_tabular(name, val)
                    logger.record_tabular("total timesteps",
                                          self.num_timesteps)
                    logger.dump_tabular()
                    # Reset infos:
                    infos_values = []

                if update % 1000 == 0 or update == total_timesteps - 1:
                    if self.model_save_path is None:
                        file_name = time.strftime('Y%YM%mD%d_h%Hm%Ms%S',
                                                  time.localtime(time.time()))
                        model_save_path = self.def_path_pre + file_name
                        self.save(model_save_path)
                    else:
                        self.save(self.model_save_path)

            return self
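
The training cadence of this loop is: step every environment once per iteration, store the transition in the replay buffer, and run gradient_steps optimizer steps every train_freq timesteps, but only once learning_start_threshold timesteps have been collected and the buffer can supply a batch. A minimal sketch of that schedule follows; the constants and the helper name are placeholders for illustration, not values or APIs from the example above.

    # Illustrative cadence check; constants are placeholders, not values from Example #4
    learning_start_threshold, train_freq, batch_size = 1000, 1, 256

    def should_train(num_timesteps, buffer_size):
        """Mirror the conditions guarding _train_step in the loop above."""
        warmed_up = num_timesteps >= learning_start_threshold
        enough_data = buffer_size >= batch_size
        return warmed_up and enough_data and num_timesteps % train_freq == 0

    print(should_train(500, 400))    # False: still in the warm-up phase
    print(should_train(2000, 1500))  # True: warm-up done and the buffer can supply a batch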