def Imitation_Learning(self, step_time, data=None, policy=None, verbose=2):
    '''
    :param step_time: number of imitation-learning steps to run
    :param data: the data is a list, and each element is a dict with 5 keys s,a,r,s_,tr
                 sample = {"s": s, "a": a, "s_": s_, "r": r, "tr": done}
    :param policy: a callable mapping a state to the expert action
    :return: None
    '''
    if data is not None and policy is not None:
        raise Exception("The IL only needs one source of guidance; please pass either data or policy, not both.")

    # learn directly from a pre-collected dataset of transitions
    if data is not None:
        for time in range(step_time):
            self.step += 1
            loss = self.backward(data[time])
            if verbose == 1:
                logger.record_tabular("steps", self.step)
                logger.record_tabular("loss", loss)
                logger.dumpkvs()

    # learn online by following the guiding policy in the environment
    if policy is not None:
        s = self.env.reset()
        for time in range(step_time):
            self.step += 1
            a = policy(s)
            s_, r, done, info = self.env.step(a)
            sample = {"s": s, "a": a, "s_": s_, "r": r, "tr": done}
            loss = self.backward(sample)
            s = s_
            if verbose == 1:
                logger.record_tabular("steps", self.step)
                logger.record_tabular("loss", loss)
                logger.dumpkvs()
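# A minimal usage sketch (not from the source): assuming `agent` is an instance of the
# class that owns this method, with `env`, `backward`, and `step` already set up, and
# `expert_policy` is a hypothetical callable mapping a state to an action, online
# imitation learning could be driven like this:
#
#   agent.Imitation_Learning(step_time=10000, policy=expert_policy, verbose=1)
#
# or, with a pre-collected list of transition dicts:
#
#   agent.Imitation_Learning(step_time=len(dataset), data=dataset, verbose=1)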
def training_with_policy(self, expert_policy, max_imitation_learning_step=1e5,
                         max_ep_cycle=2000, buffer_size=32):
    self.step = 0
    s = self.env.reset()
    ep_step, ep_reward, ep_loss = 0, 0, 0
    expert_action_set, policy_action_set = [], []
    for _ in range(int(max_imitation_learning_step)):
        self.step += 1
        ep_step += 1
        a_expert = expert_policy(s)
        a_policy = self.policy_network.forward(s)
        expert_action_set.append(torch.tensor(a_expert))
        policy_action_set.append(a_policy)
        s_, r, done, info = self.env.step(a_policy)
        ep_reward += r
        sample = {
            "s": s,
            "a": a_policy,
            "a_expert": a_expert,
            "s_": s_,
            "r": r,
            "tr": done
        }
        s = s_[:]
        # once enough action pairs are collected, fit the policy to the expert actions
        if len(policy_action_set) > buffer_size:
            loss = self.loss_cal(expert_action_set, policy_action_set)
            ep_loss += loss.cpu().detach().numpy()
            self.policy_model_optim.zero_grad()
            loss.backward()
            self.policy_model_optim.step()
            # clear the buffers so the next loss is built on a fresh computation graph
            expert_action_set, policy_action_set = [], []
        if done or ep_step > max_ep_cycle:
            logger.record_tabular("steps", self.step)
            logger.record_tabular("loss", ep_loss)
            logger.record_tabular("reward", ep_reward)
            logger.dump_tabular()
            # start a new episode and reset the per-episode statistics
            s = self.env.reset()
            ep_step, ep_reward, ep_loss = 0, 0, 0
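# A hedged usage sketch (assumptions: `agent` owns this method and `expert_policy` is a
# hypothetical pretrained state -> action controller):
#
#   agent.training_with_policy(expert_policy,
#                              max_imitation_learning_step=1e5,
#                              max_ep_cycle=2000,
#                              buffer_size=32)
#
# The behavior-cloning loss is computed over the most recent `buffer_size` expert/policy
# action pairs, so a small buffer gives frequent but noisier updates.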
def interact(self, max_step=50000, max_ep_cycle=2000, train_rollout=10,
             learning_start=1000, render=False, verbose=1, record_ep_inter=None):
    '''
    :param max_step: total number of environment steps to interact for
    :param max_ep_cycle: max steps per episode
    .........................show parameter..................................
    :param verbose: if verbose == 1 log every training update inside a rollout,
                    if verbose == 2 log a summary once per rollout
    :param record_ep_inter: record the interaction data every record_ep_inter episodes
    :return: None
    '''
    self.render = render
    # ..................... initial record ...........................#
    rollout = 0
    now_best_reward = -np.inf
    self.dist = make_pdtype(self.env.action_space, self.policy)
    sample_generate = self.runner(self.sample_rollout, self.sample_ep, max_ep_cycle,
                                  record_ep_inter, lstm_enable=self.lstm_enable)
    while self.step < max_step:
        # collect one rollout of experience and log its statistics
        sample = next(sample_generate)
        logger.record_tabular("01.step", self.step)
        logger.record_tabular("02.episode", self.episode)
        logger.record_tabular("03.rollout", rollout)
        logger.record_tabular("04.rollout/ep", sample["ep_used"])
        logger.record_tabular("05.rollout/step", sum(sample["ep_step_used"]))
        logger.record_tabular("06.mean_episode_reward", np.mean(sample["ep_reward"]))
        logger.record_tabular("07.mean_step_reward", np.mean(sample["buffer"]["r"]))
        logger.record_tabular("08.mean_ep_step_used", np.mean(sample["ep_step_used"]))
        logger.dump_tabular()
        csv_record(sample["ep_reward"], self.path)
        record_sample = sample["buffer"]
        rollout += 1
        if self.step > learning_start and self.learning:
            ep_show = {}
            if self.backward_ep_show_list:
                for key in self.backward_ep_show_list:
                    ep_show[key] = 0
            rollout_loss = 0
            # run several training passes over the collected rollout
            for time in range(train_rollout):
                loss, other_infor = self.update(record_sample)
                if verbose == 1:
                    logger.record_tabular("06.train_rollout", time)
                    logger.record_tabular("07.loss", loss)
                    flag = 10
                    if self.backward_step_show_list:
                        for key in self.backward_step_show_list:
                            logger.record_tabular(str(flag) + "." + key, other_infor[key])
                            flag += 1
                    logger.dump_tabular()
                rollout_loss += loss
                if self.backward_ep_show_list:
                    for key in self.backward_ep_show_list:
                        ep_show[key] += other_infor[key]
            if verbose == 2:
                logger.record_tabular("06.rollouts/loss", rollout_loss)
                logger.record_tabular("07.rollouts/episode_Q_value",
                                      torch.mean(torch.tensor(sample["ep_Q_value"])).cpu().detach().numpy())
                # logger.record_tabular("05.episode_loss_per_step", rollout_loss / sample["step_used"])
                # logger.record_tabular("06.episode_Q_value", sample["ep_Q_value"])
                # logger.record_tabular("07.episode_Q_value_per_ep", np.mean(sample["ep_Q_value"]))
                flag = 10
                if self.backward_ep_show_list:
                    for key in self.backward_ep_show_list:
                        logger.record_tabular(str(flag) + "." + key, ep_show[key])
                        flag += 1
                logger.dump_tabular()
        # save the weights whenever this rollout's mean episode reward sets a new best
        if np.mean(sample["ep_reward"]) > now_best_reward:
            self.save_weights(self.path)
            print("the best mean ep reward is ", np.mean(sample["ep_reward"]), "the weight is saved")
            now_best_reward = np.mean(sample["ep_reward"])
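# A minimal driver sketch (assuming this `interact` belongs to an on-policy, rollout-based
# learner in this repo with `runner`, `update`, `policy`, and `save_weights` implemented):
#
#   agent.interact(max_step=50000, max_ep_cycle=2000, train_rollout=10,
#                  learning_start=1000, verbose=2)
#
# With verbose=2 only a per-rollout training summary is dumped; verbose=1 additionally
# logs every individual update inside each rollout.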
def Imitation_Learning(self, step_time, data=None, policy=None, learning_start=1000,
                       buffer_size=5000, value_training_round=10, value_training_fre=2500,
                       verbose=2, render=False):
    '''
    :param step_time: number of imitation-learning steps to run
    :param data: the data is a list, and each element is a dict with 5 keys s,a,r,s_,tr
                 sample = {"s": s, "a": a, "s_": s_, "r": r, "tr": done}
    :param policy: a callable that takes the env and returns the expert action
    :return: None
    '''
    if data is not None and policy is not None:
        raise Exception("The IL only needs one source of guidance; please pass either data or policy, not both.")

    # learn directly from a pre-collected dataset of transitions
    if data is not None:
        for time in range(step_time):
            self.step += 1
            loss = self.backward(data[time])
            if verbose == 1:
                logger.record_tabular("steps", self.step)
                logger.record_tabular("loss", loss)
                logger.dumpkvs()

    # behavior cloning from a guiding policy, with periodic value-function pretraining
    if policy is not None:
        buffer = ReplayMemory(buffer_size)
        s = self.env.reset()
        loss_BC = 0
        ep_step, ep_reward = 0, 0
        for _ in range(step_time):
            self.step += 1
            ep_step += 1
            a = policy(self.env)
            s_, r, done, info = self.env.step(a)
            ep_reward += r
            if render:
                self.env.render()
            sample = {"s": s, "a": a, "s_": s_, "r": r, "tr": done}
            buffer.push(sample)
            s = s_[:]
            if self.step > learning_start:
                # behavior-cloning update on a minibatch from the replay buffer
                sample_ = buffer.sample(self.batch_size)
                loss = self.policy_behavior_clone(sample_)
                if self.step % value_training_fre == 0:
                    # pretrain the value function on the most recent transitions using GAE
                    record_sample = {}
                    for key in buffer.memory.keys():
                        record_sample[key] = np.array(buffer.memory[key]).astype(np.float32)[-value_training_fre:]
                    record_sample["value"] = self.value.forward(torch.from_numpy(record_sample["s"]))
                    returns, advants = get_gae(record_sample["r"], record_sample["tr"],
                                               record_sample["value"], self.gamma, self.lam)
                    record_sample["advs"] = advants
                    record_sample["return"] = returns
                    for round_ in range(value_training_round):
                        loss_value = self.value_pretrain(record_sample, value_training_fre)
                        print(round_, loss_value)
                if verbose == 1:
                    logger.record_tabular("learning_steps", self.step)
                    logger.record_tabular("loss", loss)
                    logger.record_tabular("reward", r)
                    logger.dumpkvs()
                loss_BC += loss
            if done:
                if verbose == 2:
                    logger.record_tabular("learning_steps", self.step)
                    logger.record_tabular("step_used", ep_step)
                    logger.record_tabular("loss", loss_BC / ep_step)
                    logger.record_tabular("ep_reward", ep_reward)
                    logger.dumpkvs()
                s = self.env.reset()
                loss_BC = 0
                ep_step, ep_reward = 0, 0
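# A hedged usage sketch for behavior cloning with periodic value pretraining (assumption:
# `scripted_controller` is a hypothetical callable that inspects the env and returns the
# expert action, matching the `a = policy(self.env)` call above):
#
#   agent.Imitation_Learning(step_time=50000, policy=scripted_controller,
#                            learning_start=1000, buffer_size=5000,
#                            value_training_round=10, value_training_fre=2500,
#                            verbose=2, render=False)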
def interact(self, max_step=50000, max_ep_cycle=2000, render=False, verbose=1, record_ep_inter=None):
    '''
    :param max_step: total number of environment steps to interact for
    :param max_ep_cycle: max steps per episode
    .........................show parameter..................................
    :param verbose: if verbose == 1 log every step,
                    if verbose == 2 log a summary every episode
    :param record_ep_inter: record the interaction data every record_ep_inter episodes
    :return: None
    '''
    # ..................... initial record ...........................#
    ep_reward = []
    ep_Q_value = []
    ep_loss = []
    now_best_reward = -np.inf
    while self.step < max_step:
        s = self.env.reset()
        # reset the episode record
        ep_r, ep_q, ep_l = 0, 0, 0
        # reset the RL flags
        ep_cycle, done = 0, 0
        ep_show = {}
        if self.backward_ep_show_list:
            for key in self.backward_ep_show_list:
                ep_show[key] = 0
        self.episode += 1
        while done == 0 and ep_cycle < max_ep_cycle:
            self.step += 1
            ep_cycle += 1
            # the interaction part
            a, Q, info_forward = self.forward(s)
            s_, r, done, info = self.env.step(a)
            sample = {"s": s, "a": a, "s_": s_, "r": r, "tr": done}
            s = deepcopy(s_)
            loss, info_backward = self.backward(sample)
            if render:
                self.env.render()
            # the record part
            if verbose == 1 and self.step > self.learning_starts:
                logger.record_tabular("steps", self.step)
                logger.record_tabular("episodes", self.episode)
                logger.record_tabular("loss", loss)
                logger.record_tabular("reward", r)
                logger.record_tabular("Q", Q)
                if self.forward_step_show_list:
                    for key in self.forward_step_show_list:
                        logger.record_tabular(key, info_forward[key])
                if self.backward_step_show_list:
                    for key in self.backward_step_show_list:
                        logger.record_tabular(key, info_backward[key])
                logger.dump_tabular()
            if record_ep_inter is not None:
                if self.episode % record_ep_inter == 0:
                    kvs = {"s": s, "a": a, "s_": s_, "r": r, "tr": done,
                           "ep": self.episode, "step": self.step, "ep_step": ep_cycle}
                    self.csvwritter.writekvs(kvs)
            ep_r += r
            ep_q += Q
            ep_l += loss
            if self.backward_ep_show_list:
                for key in self.backward_ep_show_list:
                    ep_show[key] += info_backward[key]
            if done:
                ep_reward.append(ep_r)
                ep_Q_value.append(ep_q)
                ep_loss.append(ep_l)
                mean_100ep_reward = round(np.mean(ep_reward[-101:-1]), 1)
                if verbose == 2 and self.step > self.learning_starts:
                    logger.record_tabular("01.steps", self.step)
                    logger.record_tabular("02.episodes", self.episode)
                    logger.record_tabular("03.episode_reward", ep_reward[-1])
                    # logger.record_tabular("04.episode_reward_per_step", ep_reward[-1] / ep_cycle)
                    logger.record_tabular("05.episode_loss", ep_l)
                    # logger.record_tabular("06.episode_loss_per_step", ep_l / ep_cycle)
                    # logger.record_tabular("07.episode_Q_value", ep_q)
                    logger.record_tabular("08.episode_Q_value_per_step", ep_q / ep_cycle)
                    # logger.record_tabular("09.mean 100 episode reward", mean_100ep_reward)
                    # logger.record_tabular("10.step_used", ep_cycle)
                    flag = 11
                    if self.forward_ep_show_list:
                        for key in self.forward_ep_show_list:
                            logger.record_tabular(str(flag) + "." + key, info_forward[key])
                            flag += 1
                    if self.backward_ep_show_list:
                        for key in self.backward_ep_show_list:
                            logger.record_tabular(str(flag) + "." + key, ep_show[key])
                            flag += 1
                    logger.dump_tabular()
                # save the weights whenever this episode's reward sets a new best
                if ep_r > now_best_reward:
                    self.save_weights(self.path)
                    print("the best episode reward is ", ep_r, "the weight is saved")
                    now_best_reward = ep_r
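# A minimal driver sketch (assuming this `interact` belongs to a step-wise, off-policy
# learner in this repo with `forward`, `backward`, `learning_starts`, and `save_weights`
# implemented):
#
#   agent.interact(max_step=50000, max_ep_cycle=2000, verbose=2)
#
# verbose=1 logs every environment step once learning has started; verbose=2 logs one
# summary per finished episode instead.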