Example #1
    def episode(self, data_point, graph, eps, attr_encoder):
        self.policy.reset(eps)
        retrain_list = []

        env = Env(data_point, graph, self.config, attr_encoder)
        iter_count = 0

        while not env.is_finished():
            # could not derive the clauses within the step limit
            if iter_count > cmd_args.episode_length:
                if cmd_args.reward_type == "only_success":
                    # sparse reward: on failure, zero out the intermediate
                    # rewards and append -1; on success, just append +1
                    final_reward = get_final_reward(env)
                    if final_reward == -1:
                        self.policy.reward_history = [0.0] * len(
                            self.policy.reward_history)
                        self.policy.reward_history.append(-1.0)
                    else:
                        self.policy.reward_history.append(1.0)
                    self.policy.reward_history = torch.tensor(
                        self.policy.reward_history)
                else:
                    # dense reward: append the final reward, and pad
                    # prob_history with a placeholder probability of 1.0
                    self.policy.reward_history.append(get_final_reward(env))
                    self.policy.prob_history = torch.cat(
                        [self.policy.prob_history,
                         torch.tensor([1.0])])
                break

            iter_count += 1
            env = self.policy(env)

        logging.info(self.policy.reward_history)
        logging.info(self.policy.prob_history)
        logging.info(env.clauses)
        return env, retrain_list
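The example above only accumulates policy.reward_history and policy.prob_history; the gradient update that consumes them is not shown. A minimal REINFORCE-style sketch of how such histories are commonly turned into a loss follows, assuming prob_history holds the probabilities of the chosen actions and both histories are 1-D float tensors of equal length; the function name reinforce_loss, the discount factor gamma, and the return normalization are illustrative assumptions, not taken from the source.

import torch


def reinforce_loss(reward_history, prob_history, gamma=0.99):
    # Hypothetical sketch (not from the source): compute a REINFORCE loss
    # from per-step rewards and the probabilities of the chosen actions.
    returns = torch.zeros_like(reward_history)
    running = 0.0
    # discounted return at each step, accumulated backwards
    for t in range(len(reward_history) - 1, -1, -1):
        running = reward_history[t] + gamma * running
        returns[t] = running
    # normalize returns for stability (a common heuristic, assumed here)
    if len(returns) > 1:
        returns = (returns - returns.mean()) / (returns.std() + 1e-8)
    # a padded probability of 1.0 has log-prob 0 and does not affect the loss
    return -(torch.log(prob_history) * returns).sum()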
Example #2
def episode(policy,
            data_point,
            graph,
            eps,
            attr_encoder,
            config,
            phase="train"):
    policy.reset(eps)
    retrain_list = []

    env = Env(data_point, graph, config, attr_encoder)
    iter_count = 0

    while not env.is_finished():
        # could not derive the clauses within the step limit
        if iter_count > cmd_args.episode_length:
            if cmd_args.reward_type == "only_success":
                final_reward = get_final_reward(env)
                if final_reward == -1:
                    policy.reward_history = [0.0] * len(policy.reward_history)
                    policy.reward_history.append(-1.0)
                else:
                    policy.reward_history.append(1.0)
                policy.reward_history = torch.tensor(policy.reward_history)
            else:
                policy.reward_history.append(get_final_reward(env))
                policy.prob_history = torch.cat(
                    [policy.prob_history,
                     torch.tensor([1.0])])
            break

        iter_count += 1
        env = policy(env)

        if cmd_args.sub_loss:
            if not env.is_finished():
                # keep this intermediate state for retraining with probability
                # proportional to the fraction of the initial candidate
                # objects that the latest clause eliminated
                retrain_prob = (env.obj_poss_left[-2] -
                                env.obj_poss_left[-1]) / env.obj_poss_left[0]
                retrain_prob = max(0.0, retrain_prob)
                decision = np.random.choice([0, 1],
                                            1,
                                            p=[1 - retrain_prob, retrain_prob])
                if decision[0] == 1:
                    retrain_list.append(
                        (deepcopy(env.data), deepcopy(env.clauses)))

    logging.info(policy.reward_history)
    logging.info(policy.prob_history)
    logging.info(env.clauses)
    return env, retrain_list
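The sub_loss branch above keeps an intermediate (data, clauses) snapshot with a probability proportional to how many of the initial candidate objects the latest clause eliminated. A standalone sketch of that sampling rule follows; should_retrain and the example obj_poss_left values are hypothetical, standing in for env.obj_poss_left.

import numpy as np


def should_retrain(obj_poss_left, rng=np.random):
    # obj_poss_left[i] = number of candidate objects left after step i
    retrain_prob = (obj_poss_left[-2] - obj_poss_left[-1]) / obj_poss_left[0]
    retrain_prob = max(0.0, retrain_prob)
    return rng.choice([0, 1], p=[1 - retrain_prob, retrain_prob]) == 1


print(should_retrain([10, 6, 3]))  # latest clause removed 3 of 10, so p = 0.3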
Example #3
def episode(policy, target, data_point, graph, config, attr_encoder, memory,
            total_count, optimizer):
    env = Env(data_point, graph, config, attr_encoder)
    for iter_count in range(cmd_args.episode_length):

        state = env.get_state()
        action = select_action(policy, state)

        logging.info(f"selected clause: {env.actions[action]}")
        next_state, reward, done, _ = env.step(action)

        if done:
            next_state = None

        # could not reach a result within the step limit; fall back to the
        # environment's final reward
        if iter_count == cmd_args.episode_length - 1:
            reward = get_final_reward(env)

        logging.info(f"reward: {reward}")
        memory.push(state, action, next_state, reward)
        optimize_model_DQ(memory, policy, target, optimizer)

        if done:
            break

        # periodically sync the target network with the policy network
        if total_count % cmd_args.target_update == 0:
            target.load_state_dict(policy.state_dict())

    return env.success
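Example #3 relies on helpers that are not shown here, in particular the replay buffer behind memory.push and the select_action routine. The sketch below shows what such pieces commonly look like in a DQN setup; the ReplayMemory class, the Transition fields, the policy.num_actions attribute, and the fixed epsilon are assumptions, not taken from the source.

import random
from collections import deque, namedtuple

import torch

Transition = namedtuple("Transition", ("state", "action", "next_state", "reward"))


class ReplayMemory:
    # hypothetical replay buffer matching the memory.push(...) call above
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, *args):
        self.buffer.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)


def select_action(policy, state, eps=0.1):
    # epsilon-greedy over the Q-values produced by the policy network
    if random.random() < eps:
        return random.randrange(policy.num_actions)
    with torch.no_grad():
        return int(policy(state).argmax().item())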