Example #1
    def Imitation_Learning(self, step_time, data=None, policy=None, verbose=2):
        '''
        :param step_time: number of imitation-learning steps to run
        :param data: a list of samples; each element is a dict with the 5 keys s, a, r, s_, tr,
                     e.g. sample = {"s": s, "a": a, "s_": s_, "r": r, "tr": done}
        :param policy: an expert policy that maps a state to an action
        :param verbose: if verbose == 1, log every step
        :return: None
        '''
        if data is not None and policy is not None:
            raise Exception(
                "Imitation learning needs exactly one guidance source: "
                "pass either data or policy, not both."
            )

        if data is not None:
            for time in range(step_time):
                self.step += 1
                loss = self.backward(data[time])
                if verbose == 1:
                    logger.record_tabular("steps", self.step)
                    logger.record_tabular("loss", loss)
                    logger.dumpkvs()

        if policy is not None:
            s = self.env.reset()
            for time in step_time:
                self.step += 1
                a = policy(s)
                s_, r, done, info = self.env.step(a)
                sample = {"s": s, "a": a, "s_": s_, "r": r, "tr": done}
                loss = self.backward(sample)
                s = s_
                if verbose == 1:
                    logger.record_tabular("steps", self.step)
                    logger.record_tabular("loss", loss)
                    logger.dumpkvs()
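
A minimal sketch of how the data argument above could be assembled, assuming the classic gym API where env.step returns four values; the environment choice, the random stand-in for an expert action, and the agent object are illustrative assumptions, not part of the example.

import gym

env = gym.make("CartPole-v1")
data = []
s = env.reset()
for _ in range(500):
    a = env.action_space.sample()  # stand-in for an expert action
    s_, r, done, info = env.step(a)
    data.append({"s": s, "a": a, "s_": s_, "r": r, "tr": done})
    s = env.reset() if done else s_

# agent.Imitation_Learning(step_time=len(data), data=data, verbose=1)  # hypothetical agent instance
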
Example #2
    def training_with_policy(self,
                             expert_policy,
                             max_imitation_learning_step=1e5,
                             max_ep_cycle=2000,
                             buffer_size=32):
        self.step = 0
        s = self.env.reset()
        loss_BC = 0
        ep_step, ep_reward, ep_loss = 0, 0, 0
        expert_action_set, policy_action_set = [], []

        for _ in range(int(max_imitation_learning_step)):
            self.step += 1
            ep_step += 1
            a_expert = expert_policy(s)
            a_policy = self.policy_network.forward(s)

            expert_action_set.append(torch.tensor(a_expert))
            policy_action_set.append(a_policy)
            s_, r, done, info = self.env.step(a_policy)
            ep_reward += r
            sample = {
                "s": s,
                "a": a_policy,
                "a_expert": a_expert,
                "s_": s_,
                "r": r,
                "tr": done
            }
            s = s_[:]

            if len(policy_action_set) > buffer_size:
                loss = self.loss_cal(expert_action_set, policy_action_set)
                ep_loss += loss.cpu().detach().numpy()
                self.policy_model_optim.zero_grad()
                loss.backward()
                self.policy_model_optim.step()
                # clear the window so the next update does not backprop
                # through graphs that have already been freed
                expert_action_set, policy_action_set = [], []

            if done or ep_step > max_ep_cycle:
                logger.record_tabular("steps", self.step)
                logger.record_tabular("loss", ep_loss)
                logger.record_tabular("reward", ep_reward)
                logger.dump_tabular()
                s = self.env.reset()
                ep_step, ep_reward, ep_loss = 0, 0, 0
Example #3
    def interact(self, max_step=50000, max_ep_cycle=2000, train_rollout=10, learning_start=1000,
                 render=False, verbose=1, record_ep_inter=None):
        '''
        :param max_step: total number of environment steps to run
        :param max_ep_cycle: max steps per episode
        :param train_rollout: number of update passes per collected rollout
        :param learning_start: number of steps to collect before training starts
        .........................show parameters..................................
        :param verbose:
            if verbose == 1, log every training update
            if verbose == 2, log a summary per rollout
        :param record_ep_inter: record interaction data every record_ep_inter episodes
        :return: None
        '''
        # if IL_time is not None:
        self.render = render

        # ..................... initial record ...........................#
        rollout = 0
        now_best_reward = -np.inf

        self.dist = make_pdtype(self.env.action_space, self.policy)
        sample_generate = self.runner(self.sample_rollout, self.sample_ep, max_ep_cycle, record_ep_inter, lstm_enable=self.lstm_enable)
        while self.step < max_step:
            sample = next(sample_generate)
            logger.record_tabular("01.step", self.step)
            logger.record_tabular("02.episode",self.episode)
            logger.record_tabular("03.rollout", rollout)
            logger.record_tabular("04.rollout/ep", sample["ep_used"])
            logger.record_tabular("05.rollout/step", sum(sample["ep_step_used"]))
            logger.record_tabular("06.mean_episode_reward", np.mean(sample["ep_reward"]))
            logger.record_tabular("07.mean_step_reward", np.mean(sample["buffer"]["r"]))
            logger.record_tabular("08.mean_ep_step_used", np.mean(sample["ep_step_used"]))
            logger.dump_tabular()
            csv_record(sample["ep_reward"], self.path)
            record_sample = sample["buffer"]

            rollout += 1

            if self.step > learning_start and self.learning:
                ep_show = {}
                if self.backward_ep_show_list:
                    for key in self.backward_ep_show_list:
                        ep_show[key] = 0
                rollout_loss = 0
                for time in range(train_rollout):
                    loss, other_infor = self.update(record_sample)
                    if verbose == 1:
                        logger.record_tabular("06.train_rollout", time)
                        logger.record_tabular("07.loss", loss)
                        flag = 10
                        if self.backward_step_show_list:
                            for key in self.backward_step_show_list:
                                logger.record_tabular(str(flag) +"."+ key, other_infor[key])
                                flag += 1
                        logger.dump_tabular()
                    rollout_loss += loss
                    if self.backward_ep_show_list:
                        for key in self.backward_ep_show_list:
                            ep_show[key] += other_infor[key]
                if verbose == 2:
                    logger.record_tabular("06.rollouts/loss", rollout_loss)
                    logger.record_tabular("07.rollouts/episode_Q_value", torch.mean(
                        torch.tensor(sample["ep_Q_value"])).cpu().detach().numpy())
                    # logger.record_tabular("05.episode_loss_per_step", rollout_loss / samole["step_used"])
                    # logger.record_tabular("06.episode_Q_value", sample["ep_Q_value"])
                    # logger.record_tabular("07.episode_Q_value_per_ep", np.mean(sample["ep_Q_value"]))

                    flag = 10
                    if self.backward_ep_show_list:
                        for key in self.backward_ep_show_list:
                            logger.record_tabular(str(flag) + "." + key, ep_show[key])
                            flag += 1
                    logger.dump_tabular()
            if np.mean(sample["ep_reward"])>now_best_reward:
                self.save_weights(self.path)
                print("the best mean ep reward is ", np.mean(sample["ep_reward"]), "the weight is saved")
                now_best_reward = np.mean(sample["ep_reward"])
Example #4
    def Imitation_Learning(self, step_time, data=None, policy=None, learning_start=1000,
                           buffer_size=5000, value_training_round=10, value_training_fre=2500,
                           verbose=2, render=False):
        '''
        :param step_time: number of imitation-learning steps to run
        :param data: a list of samples; each element is a dict with the 5 keys s, a, r, s_, tr,
                     e.g. sample = {"s": s, "a": a, "s_": s_, "r": r, "tr": done}
        :param policy: an expert policy used to collect guidance transitions
        :return: None
        '''
        if data is not None and policy is not None:
            raise Exception("Imitation learning needs exactly one guidance source: "
                            "pass either data or policy, not both.")

        if data is not None:
            for time in range(step_time):
                self.step += 1
                loss = self.backward(data[time])
                if verbose == 1:
                    logger.record_tabular("steps", self.step)
                    logger.record_tabular("loss", loss)
                    logger.dumpkvs()

        if policy is not None:
            buffer = ReplayMemory(buffer_size)
            s = self.env.reset()
            loss_BC = 0
            ep_step, ep_reward = 0, 0
            for _ in range(step_time):
                self.step += 1
                ep_step += 1
                a = policy(self.env)
                s_, r, done, info = self.env.step(a)
                #print(r,info)
                ep_reward += r
                if render:
                    self.env.render()
                sample = {"s": s, "a": a, "s_": s_, "r": r, "tr": done}
                buffer.push(sample)
                s = s_[:]
                if self.step > learning_start:
                    sample_ = buffer.sample(self.batch_size)
                    loss = self.policy_behavior_clone(sample_)
                    if self.step % value_training_fre == 0:
                        record_sample = {}
                        for key in buffer.memory.keys():
                            record_sample[key] = np.array(buffer.memory[key]).astype(np.float32)[-value_training_fre:]
                        record_sample["value"] = self.value.forward(torch.from_numpy(record_sample["s"]))
                        returns, advants = get_gae(record_sample["r"], record_sample["tr"], record_sample["value"],
                                                   self.gamma, self.lam)
                        record_sample["advs"] = advants
                        record_sample["return"] = returns
                        for round_ in range(value_training_round):
                            loss_value = self.value_pretrain(record_sample, value_training_fre)
                            print(round_, loss_value)

                    if verbose == 1:
                        logger.record_tabular("learning_steps", self.step)
                        logger.record_tabular("loss", loss)
                        logger.record_tabular("rewrad",r)
                        logger.dumpkvs()
                    loss_BC += loss
                if done:
                    if verbose == 2:
                        logger.record_tabular("learning_steps", self.step)
                        logger.record_tabular("step_used", ep_step)
                        logger.record_tabular("loss", loss_BC/ep_step)
                        logger.record_tabular("ep_reward",ep_reward )
                        logger.dumpkvs()

                    s = self.env.reset()
                    loss_BC = 0
                    ep_step, ep_reward = 0, 0
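
Example #4 relies on get_gae(...) to turn rewards, terminal flags and value estimates into returns and advantages. Below is a minimal sketch of a standard generalised advantage estimation helper under that assumption; the real get_gae may differ, for instance in how it bootstraps the value of the final state.

import numpy as np

def get_gae(rewards, dones, values, gamma, lam):
    # values may arrive as a torch tensor; coerce to a flat float array
    values = np.asarray([float(v) for v in values])
    T = len(rewards)
    advants = np.zeros(T, dtype=np.float32)
    gae = 0.0
    for t in reversed(range(T)):
        next_value = values[t + 1] if t + 1 < T else 0.0  # no bootstrap after the last step
        nonterminal = 1.0 - float(dones[t])
        delta = rewards[t] + gamma * next_value * nonterminal - values[t]
        gae = delta + gamma * lam * nonterminal * gae
        advants[t] = gae
    returns = advants + values
    return returns, advants
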
Example #5
    def interact(self,
                 max_step=50000,
                 max_ep_cycle=2000,
                 render=False,
                 verbose=1,
                 record_ep_inter=None):
        '''
        :param max_step: total number of environment steps to run
        :param max_ep_cycle: max steps per episode
        .........................show parameters..................................
        :param verbose:
            if verbose == 1, log every step
            if verbose == 2, log every episode
        :param record_ep_inter: record interaction data every record_ep_inter episodes
        :return: None
        '''
        # if IL_time is not None:

        # ..................... initial record ...........................#
        ep_reward = []
        ep_Q_value = []
        ep_loss = []
        now_best_reward = -np.inf
        while self.step < max_step:
            s = self.env.reset()
            # reset the episode record
            ep_r, ep_q, ep_l = 0, 0, 0
            # reset the RL flags
            ep_cycle, done = 0, 0
            ep_show = {}
            if self.backward_ep_show_list:
                for key in self.backward_ep_show_list:
                    ep_show[key] = 0
            self.episode += 1
            while done == 0 and ep_cycle < max_ep_cycle:
                self.step += 1
                ep_cycle += 1
                # the interaction part
                a, Q, info_forward = self.forward(s)
                # print(a)
                s_, r, done, info = self.env.step(a)
                sample = {"s": s, "a": a, "s_": s_, "r": r, "tr": done}
                s = deepcopy(s_)
                loss, info_backward = self.backward(sample)
                if render:
                    self.env.render()
                # the record part

                if verbose == 1 and self.step > self.learning_starts:
                    logger.record_tabular("steps", self.step)
                    logger.record_tabular("episodes", self.episode)
                    logger.record_tabular("loss", loss)
                    logger.record_tabular("reward", r)
                    logger.record_tabular("Q", Q)
                    if self.forward_step_show_list:
                        for key in self.forward_step_show_list:
                            logger.record_tabular(key, info_forward[key])
                    if self.backward_step_show_list:
                        for key in self.backward_step_show_list:
                            logger.record_tabular(key, info_backward[key])
                    logger.dump_tabular()
                if record_ep_inter is not None:
                    if self.episode % record_ep_inter == 0:
                        kvs = {
                            "s": s,
                            "a": a,
                            "s_": s_,
                            "r": r,
                            "tr": done,
                            "ep": self.episode,
                            "step": self.step,
                            "ep_step": ep_cycle
                        }
                        self.csvwritter.writekvs(kvs)
                ep_r += r
                ep_q += Q
                ep_l += loss
                if self.backward_ep_show_list:
                    for key in self.backward_ep_show_list:
                        ep_show[key] += info_backward[key]
                if done:
                    ep_reward.append(ep_r)
                    ep_Q_value.append(ep_q)
                    ep_loss.append(ep_l)
                    mean_100ep_reward = round(np.mean(ep_reward[-101:-1]), 1)
                    if verbose == 2 and self.step > self.learning_starts:
                        logger.record_tabular("01.steps", self.step)
                        logger.record_tabular("02.episodes", self.episode)
                        logger.record_tabular("03.episode_reward",
                                              ep_reward[-1])
                        # logger.record_tabular("04.episode_reward_per_step", ep_reward[-1] / ep_cycle)
                        logger.record_tabular("05.episode_loss", ep_l)
                        # logger.record_tabular("06.episode_loss_per_step", ep_l / ep_cycle)
                        # logger.record_tabular("07.episode_Q_value", ep_q)
                        logger.record_tabular("08.episode_Q_value_per_step",
                                              ep_q / ep_cycle)
                        # logger.record_tabular("09.mean 100 episode reward", mean_100ep_reward)
                        # logger.record_tabular("10.step_used", ep_cycle)
                        flag = 11
                        if self.forward_ep_show_list:
                            for key in self.forward_ep_show_list:
                                logger.record_tabular(
                                    str(flag) + "." + key, info_forward[key])
                                flag += 1
                        if self.backward_ep_show_list:
                            for key in self.backward_ep_show_list:
                                logger.record_tabular(
                                    str(flag) + "." + key, ep_show[key])
                                flag += 1
                        logger.dump_tabular()
            if np.mean(ep_r) > now_best_reward:
                self.save_weights(self.path)
                print("new best mean episode reward:", np.mean(ep_r),
                      "- weights saved")
                now_best_reward = np.mean(ep_r)
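
interact() in Example #5 only assumes that the surrounding agent exposes forward(s) returning (action, Q, info) and backward(sample) returning (loss, info), plus a few bookkeeping attributes. The stub below is a hypothetical minimal agent illustrating that contract; it takes random actions and performs no learning.

class RandomAgentStub:
    """Hypothetical minimal agent matching the interface interact() relies on."""

    def __init__(self, env):
        self.env = env
        self.step, self.episode = 0, 0
        self.learning_starts = 0
        self.forward_step_show_list, self.forward_ep_show_list = [], []
        self.backward_step_show_list, self.backward_ep_show_list = [], []

    def forward(self, s):
        # return (action, Q estimate, extra info); here a random action and a dummy Q
        return self.env.action_space.sample(), 0.0, {}

    def backward(self, sample):
        # return (loss, extra info); a real agent would update its networks here
        return 0.0, {}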