import logging
from copy import deepcopy

import numpy as np
import torch

# Project-level names assumed available in this module:
# Env, get_final_reward, and cmd_args (the parsed command-line arguments).


def episode(self, data_point, graph, eps, attr_encoder):
    """Policy-gradient variant: roll the policy until the environment
    finishes or the step budget runs out, accumulating reward and
    probability histories on the policy itself."""
    self.policy.reset(eps)
    retrain_list = []  # always empty here; kept for interface parity with the sub-loss variant below
    env = Env(data_point, graph, self.config, attr_encoder)
    iter_count = 0
    while not env.is_finished():
        # Could not derive the clauses within the step limit.
        if iter_count > cmd_args.episode_length:
            if cmd_args.reward_type == "only_success":
                final_reward = get_final_reward(env)
                if final_reward == -1:
                    self.policy.reward_history = [0.0] * len(self.policy.reward_history)
                    self.policy.reward_history.append(-1.0)
                else:
                    self.policy.reward_history.append(1.0)
                self.policy.reward_history = torch.tensor(self.policy.reward_history)
            else:
                self.policy.reward_history.append(get_final_reward(env))
            # Pad the probability history with a neutral 1.0 for the timeout step.
            self.policy.prob_history = torch.cat(
                [self.policy.prob_history, torch.tensor([1.0])])
            break
        iter_count += 1
        env = self.policy(env)
    logging.info(self.policy.reward_history)
    logging.info(self.policy.prob_history)
    logging.info(env.clauses)
    return env, retrain_list
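
# --- Hedged sketch: one way the reward_history / prob_history accumulated
# above could drive a REINFORCE-style update. Assumptions (not confirmed by
# the source): prob_history holds the probabilities of the chosen actions,
# the two histories are step-aligned, cmd_args.gamma is the discount factor,
# and `finish_episode` is a hypothetical helper name. ---
def finish_episode(policy, optimizer):
    returns, R = [], 0.0
    # Discounted returns, accumulated backwards over the episode.
    for r in reversed(list(policy.reward_history)):
        R = float(r) + cmd_args.gamma * R
        returns.insert(0, R)
    returns = torch.tensor(returns)
    # REINFORCE loss: -sum_t log pi(a_t | s_t) * G_t. The 1.0 padding
    # appended on timeout contributes log(1.0) = 0, so it is harmless.
    loss = -(torch.log(policy.prob_history) * returns).sum()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
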
def episode(policy, data_point, graph, eps, attr_encoder, config, phase="train"):
    """Standalone variant: same rollout as above, but additionally samples
    intermediate (data, clauses) snapshots for sub-loss retraining."""
    policy.reset(eps)
    retrain_list = []
    env = Env(data_point, graph, config, attr_encoder)
    iter_count = 0
    while not env.is_finished():
        # Could not derive the clauses within the step limit.
        if iter_count > cmd_args.episode_length:
            if cmd_args.reward_type == "only_success":
                final_reward = get_final_reward(env)
                if final_reward == -1:
                    policy.reward_history = [0.0] * len(policy.reward_history)
                    policy.reward_history.append(-1.0)
                else:
                    policy.reward_history.append(1.0)
                policy.reward_history = torch.tensor(policy.reward_history)
            else:
                policy.reward_history.append(get_final_reward(env))
            policy.prob_history = torch.cat(
                [policy.prob_history, torch.tensor([1.0])])
            break
        iter_count += 1
        env = policy(env)
        if cmd_args.sub_loss and not env.is_finished():
            # Snapshot this state with probability equal to the fraction of
            # remaining candidate objects the latest clause pruned.
            retrain_prob = (env.obj_poss_left[-2] - env.obj_poss_left[-1]) / env.obj_poss_left[0]
            retrain_prob = max(0.0, retrain_prob)
            decision = np.random.choice([0, 1], 1, p=[1 - retrain_prob, retrain_prob])
            if decision[0] == 1:
                retrain_list.append((deepcopy(env.data), deepcopy(env.clauses)))
    logging.info(policy.reward_history)
    logging.info(policy.prob_history)
    logging.info(env.clauses)
    return env, retrain_list
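
# --- Hedged refactor note: the sub-loss sampling in the loop above is a
# Bernoulli draw whose success probability is the fraction of remaining
# candidates the latest clause pruned. An equivalent helper is sketched
# below (the name `maybe_snapshot` is hypothetical, not from the project).
def maybe_snapshot(env, retrain_list):
    pruned = env.obj_poss_left[-2] - env.obj_poss_left[-1]
    retrain_prob = max(0.0, pruned / env.obj_poss_left[0])
    # np.random.random() < p is an equivalent, cheaper Bernoulli(p) draw
    # than np.random.choice([0, 1], 1, p=[1 - p, p]).
    if np.random.random() < retrain_prob:
        retrain_list.append((deepcopy(env.data), deepcopy(env.clauses)))
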
def episode(policy, target, data_point, graph, config, attr_encoder, memory, total_count, optimizer):
    """DQN variant: step the environment, push transitions into replay
    memory, and run an optimization step after every action."""
    env = Env(data_point, graph, config, attr_encoder)
    for iter_count in range(cmd_args.episode_length):
        state = env.get_state()
        action = select_action(policy, state)
        logging.info(f"selected clause: {env.actions[action]}")
        next_state, reward, done, _ = env.step(action)
        if done:
            next_state = None
        # Could not find the result within the step limit.
        if iter_count == cmd_args.episode_length - 1:
            reward = get_final_reward(env)
        logging.info(f"reward: {reward}")
        memory.push(state, action, next_state, reward)
        optimize_model_DQ(memory, policy, target, optimizer)
        if done:
            break
    # Target network currently syncs every episode; the periodic-update
    # guard below is disabled.
    # if total_count % cmd_args.target_update == 0:
    target.load_state_dict(policy.state_dict())
    return bool(env.success)
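
# --- Hedged sketch of the two helpers used above. `select_action` and
# `optimize_model_DQ` exist in the project but are not shown here; this is
# a plausible standard-DQN reading, with cmd_args.eps_greedy, cmd_args.gamma,
# cmd_args.batch_size, policy.num_actions, memory.sample, and len(memory)
# all assumed for illustration. ---
import random

import torch.nn.functional as F


def select_action(policy, state):
    # Epsilon-greedy over the policy network's Q-values (assumption).
    if random.random() < cmd_args.eps_greedy:
        return random.randrange(policy.num_actions)
    with torch.no_grad():
        return policy(state).argmax().item()


def optimize_model_DQ(memory, policy, target, optimizer):
    # One transition at a time, since graph-structured states may not batch.
    if len(memory) < cmd_args.batch_size:
        return
    for state, action, next_state, reward in memory.sample(cmd_args.batch_size):
        q = policy(state)[action]
        with torch.no_grad():
            # Terminal transitions (next_state is None) bootstrap to zero.
            next_q = 0.0 if next_state is None else float(target(next_state).max())
        expected = torch.tensor(float(reward) + cmd_args.gamma * next_q)
        loss = F.smooth_l1_loss(q, expected)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
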