def online_td_lambda(env, lamda, alpha, gamma, n_episodes):
    # True online TD(lambda) with Dutch eligibility traces.
    # Initialize the linear value function.
    v = LinearValueFunction(env.n_states)

    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        obs_vec = encode_state(obs, env.n_states)
        z = np.zeros(env.n_states)
        V_old = 0

        while not done:
            obs_prime, reward, done = env.step()
            obs_prime_vec = encode_state(obs_prime, env.n_states)
            V = v.evaluate(obs_vec)
            V_prime = v.evaluate(obs_prime_vec)
            delta = reward + gamma * V_prime - V
            # Update the Dutch eligibility trace.
            z = gamma * lamda * z + (
                1 - alpha * gamma * lamda * np.dot(z, obs_vec)) * obs_vec
            # Update weights.
            v.weights += (alpha * (delta + V - V_old) * z
                          - alpha * (V - V_old) * obs_vec)
            V_old = V_prime
            obs_vec = obs_prime_vec
    return v
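
The TD(lambda) and actor-critic snippets in this listing pass a discrete observation and a feature size to encode_state and update a LinearValueFunction in place; neither helper is shown. A minimal sketch of the behaviour the code appears to assume, with one-hot state features and a zero-initialised linear approximator, is given here; the actual implementations in the source project may differ.

import numpy as np


def encode_state(state, n_states):
    # Assumed behaviour: one-hot feature vector for a discrete state index.
    x = np.zeros(n_states)
    x[state] = 1.0
    return x


class LinearValueFunction:
    # Assumed behaviour: v(s) = w . x(s), with weights exposed for in-place updates.
    def __init__(self, n_features):
        self.weights = np.zeros(n_features)

    def evaluate(self, features):
        return np.dot(self.weights, features)
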
Example #2
def one_step_actor_critic(env, alpha_th, alpha_w, gamma, n_episodes):
    policy = ExponentialSoftmax(env.observation_space_size *
                                env.action_space_size)
    v = LinearValueFunction(env.observation_space_size)

    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        obs_vec = encode_state(obs, env.observation_space_size)
        I = 1

        while not done:
            sa_pairs = [encode_sa_pair(obs, a, env.observation_space_size, \
                       env.action_space_size) for a in range(env.action_space_size)]
            a = policy.sample_action(sa_pairs)
            obs_prime, reward, done = env.step(a)
            obs_prime_vec = encode_state(obs_prime, env.observation_space_size)
            delta = reward + gamma * v.evaluate(obs_prime_vec) - v.evaluate(obs_vec)
            v.weights += alpha_w * I * delta * obs_vec
            policy.weights += alpha_th * I * delta * policy.eligibility_vector(
                a, sa_pairs)
            # Discount the update weighting for later time steps.
            I *= gamma
            obs_vec = obs_prime_vec
            obs = obs_prime

        print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return policy
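
The actor-critic and REINFORCE examples additionally assume an ExponentialSoftmax policy offering sample_action and eligibility_vector over a list of state-action feature vectors. A rough sketch of that assumed interface, using the standard softmax-over-linear-preferences parameterisation whose score function is x(s, a) - sum_b pi(b|s) x(s, b), might look as follows; the original implementation may differ in detail.

import numpy as np


class ExponentialSoftmax:
    # Assumed behaviour: action preferences h(s, a) = theta . x(s, a) and a
    # softmax distribution over the available actions.
    def __init__(self, n_features):
        self.weights = np.zeros(n_features)

    def action_probabilities(self, sa_features):
        prefs = np.array([np.dot(self.weights, x) for x in sa_features])
        prefs -= prefs.max()  # shift for numerical stability
        exp_prefs = np.exp(prefs)
        return exp_prefs / exp_prefs.sum()

    def sample_action(self, sa_features):
        return np.random.choice(len(sa_features),
                                p=self.action_probabilities(sa_features))

    def eligibility_vector(self, action, sa_features):
        # grad log pi(a|s) = x(s, a) - sum_b pi(b|s) x(s, b)
        probs = self.action_probabilities(sa_features)
        expected = sum(p * x for p, x in zip(probs, sa_features))
        return sa_features[action] - expected
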
Example #3
def actor_critic_eligibility_traces(env, eta, alpha_th, alpha_w, lambda_th,
                                    lambda_w, gamma, n_episodes):
    # Actor-critic with eligibility traces in the average-reward (continuing)
    # formulation: R_bar tracks the average reward and no discounting is applied.
    policy = ExponentialSoftmax(env.observation_space_size *
                                env.action_space_size)
    v = LinearValueFunction(env.observation_space_size)
    z_th = np.zeros(env.observation_space_size * env.action_space_size)
    z_w = np.zeros(env.observation_space_size)
    R_bar = 0

    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        obs_vec = encode_state(obs, env.observation_space_size)

        while not done:
            sa_pairs = [encode_sa_pair(obs, a, env.observation_space_size, \
                        env.action_space_size) for a in range(env.action_space_size)]
            a = policy.sample_action(sa_pairs)
            obs_prime, reward, done = env.step(a)
            obs_prime_vec = encode_state(obs_prime, env.observation_space_size)
            delta = reward - R_bar + v.evaluate(obs_prime_vec) - v.evaluate(
                obs_vec)
            R_bar += eta * delta
            z_w = lambda_w * z_w + obs_vec
            z_th = lambda_th * z_th + policy.eligibility_vector(a, sa_pairs)
            v.weights += alpha_w * delta * z_w
            policy.weights += alpha_th * delta * z_th
            obs_vec = obs_prime_vec
            obs = obs_prime

        print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return policy
Example #4
    def compute_expected_future_return(self, state, action, lookahead):
        # Estimate the expected future return of (state, action) from empirical
        # transition counts, recursing via get_max_q_value with reduced lookahead.
        state_hash = encode_state(state)
        state_action_occurrence = self.state_action_transition_count[
            state_hash][action]

        next_state_occurrence_dict = self.transitions[state_hash][action]
        state_probabilities = defaultdict(float)
        for next_state_hash in next_state_occurrence_dict:
            if state_action_occurrence < self.known_threshold:
                state_probabilities[next_state_hash] = 1
            else:
                count = next_state_occurrence_dict[next_state_hash]
                state_probabilities[next_state_hash] = (
                    count / state_action_occurrence)

        weighted_future_returns = list()
        for next_state_hash in state_probabilities:
            prev_action_weight = self.weights[state_hash][action]
            next_state = my_apply_action_to_state(state, action,
                                                  self.services.parser)
            weighted_future_returns.append(
                self.get_max_q_value(next_state, lookahead - 1,
                                     prev_action_weight) *
                state_probabilities[next_state_hash])

        return sum(weighted_future_returns)
Example #5
    def make_plan(self, state):
        # Plan for the active goal and seed state-action weights along the
        # plan, giving later plan steps higher weight.
        curr_state = copy.deepcopy(state)
        if self.active_goal is None:
            self.active_goal = self.uncompleted_goals[0]

        problem = self.services.problem_generator.generate_problem(
            self.active_goal, curr_state)
        self.plan = self.services.planner(self.services.pddl.domain_path,
                                          problem)

        for i in range(len(self.plan)):
            action = self.plan[i]
            curr_state_hash = encode_state(curr_state)
            weight = float(i + 1) / len(self.plan)
            if self.weights[curr_state_hash][action.lower()] < weight:
                self.weights[curr_state_hash][action.lower()] = weight
            curr_state = my_apply_action_to_state(curr_state, action,
                                                  self.services.parser)

        local_weights = list()
        for state_hash in self.weights:
            vals = list(self.weights[state_hash].values())
            local_weights.extend(vals)
        self.state_recurrence_punish = median(local_weights)
        self.lookahead = min([4, int(len(self.plan) / 2)])
Example #6
    def __init__(self, cargos, trucks, warehouses, initial: FluentState,
                 goal: list):
        self.state_map = initial.pos + initial.neg
        self.initial_state_TF = encode_state(initial, self.state_map)

        Problem.__init__(self, self.initial_state_TF, goal=goal)
        self.cargos = cargos
        self.trucks = trucks
        self.actions_list = self.get_actions()
Example #7
    def store_paths(self, paths):
        # Slide a pair of iterators over each path to extract consecutive
        # (state, action, next_state) transitions and write them into the
        # replay buffer's ring-buffer arrays.
        i1, i2 = itertools.tee(itertools.chain.from_iterable(paths))
        next(i2)
        for (__, s), (a, ns) in zip(i1, i2):
            if a is None:
                continue
            encode_state(self.states[self.index, :, :, :], s)
            self.actions[self.index, :, :] = 0.0
            self.actions[self.index, a.x, a.y] = 1.0
            self.rewards[self.index] = ns.reward
            encode_state(self.nstates[self.index, :, :, :], ns)
            if ns.status != helicopter3x3.Status.flying:
                self.done[self.index] = 1.0
            else:
                self.done[self.index] = 0.0

            self.index += 1
            if self.index >= self.size:
                self.index = 0
            if self.index > self.maxSize:
                self.maxSize = self.index
Example #8
    def get_reward(self, state, action):
        # Return the empirical mean reward once (state, action) has been seen
        # often enough; otherwise fall back to the plan-derived weight.
        state_hash = encode_state(state)
        if self.state_action_rewards_count[state_hash][
                action] >= self.known_threshold:
            state_action_rewards = self.rewards[state_hash][action]
            reward = float(
                sum(state_action_rewards)) / len(state_action_rewards)
        else:
            reward = self.weights[state_hash][action]
        return reward
Example #9
def semi_gradient_td_lambda(env, lamda, alpha, gamma, n_episodes):
    # Semi-gradient TD(lambda) with accumulating eligibility traces.
    # Initialize the linear value function.
    v = LinearValueFunction(env.n_states)

    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        obs_vec = encode_state(obs, env.n_states)
        z = np.zeros(env.n_states)

        while not done:
            obs_prime, reward, done = env.step()
            obs_prime_vec = encode_state(obs_prime, env.n_states)
            # Update the accumulating eligibility trace.
            z = gamma * lamda * z + obs_vec
            delta = reward + gamma * v.evaluate(obs_prime_vec) - v.evaluate(obs_vec)
            # Update weights.
            v.weights += alpha * delta * z
            obs_vec = obs_prime_vec
    return v
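
The only difference between online_td_lambda above and semi_gradient_td_lambda here is the eligibility-trace update. Pulled out side by side (the helper names are illustrative only, not from the source project):

import numpy as np


def accumulating_trace(z, x, gamma, lamda):
    # Trace update used by semi_gradient_td_lambda.
    return gamma * lamda * z + x


def dutch_trace(z, x, gamma, lamda, alpha):
    # Trace update used by online_td_lambda (true online TD(lambda)).
    return gamma * lamda * z + (1 - alpha * gamma * lamda * np.dot(z, x)) * x
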
Example #10
    def get_moves_with_rewards(self):
        # Pair every legal move with the model's predicted value for the
        # current board encoding.
        valid_moves = self.get_all_moves()
        valid_moves_idx = map(get_move_idx, valid_moves)

        inp = encode_state(self.board.values).float()
        pred = self.model(inp)

        valid_moves_prob = [[valid_moves[i], pred[idx].item()]
                            for i, idx in enumerate(valid_moves_idx)]

        return valid_moves_prob
Example #11
    def result(self, state: str, action: Action):
        # Apply the action's add/remove effects to the decoded fluent state
        # and return the re-encoded successor state.
        new_state = FluentState([], [])
        old_state = decode_state(state, self.state_map)
        for fluent in old_state.pos:
            if fluent not in action.effect_rem:
                new_state.pos.append(fluent)
        for fluent in action.effect_add:
            if fluent not in new_state.pos:
                new_state.pos.append(fluent)
        for fluent in old_state.neg:
            if fluent not in action.effect_add:
                new_state.neg.append(fluent)
        for fluent in action.effect_rem:
            if fluent not in new_state.neg:
                new_state.neg.append(fluent)
        return encode_state(new_state, self.state_map)
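
The planning examples (the air-cargo style __init__ above and this result method) use a different encode_state than the RL snippets: it maps a FluentState onto a truth-value encoding aligned with state_map. A plausible sketch of that pair of helpers, reusing the FluentState container from the snippets and assuming the pos/neg fluent lists and fixed fluent ordering visible in the code, is shown below; treat it as illustrative rather than the project's actual definition.

def encode_state(fs, fluent_map):
    # Assumed behaviour: one 'T'/'F' character per fluent, ordered by fluent_map.
    return "".join('T' if fluent in fs.pos else 'F' for fluent in fluent_map)


def decode_state(state, fluent_map):
    # Inverse of encode_state: rebuild the pos/neg fluent lists from the string.
    fs = FluentState([], [])
    for char, fluent in zip(state, fluent_map):
        if char == 'T':
            fs.pos.append(fluent)
        else:
            fs.neg.append(fluent)
    return fs
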
Example #12
    def prepare_data(self):
        # Walk the episode backwards to pair each (state, move) with its
        # discounted return-to-go, then restore chronological order.
        self.data = []
        total = 0
        for exp in self.experience[-1::-1]:
            state, move, reward = exp

            vector = encode_state(state).flatten().tolist()

            move_idx = get_move_idx(move)
            vector.append(move_idx)

            total = reward + self.discount * total
            vector.append(total)

            self.data.append(vector)

        self.data.reverse()
Example #13
def REINFORCE_baseline(env, alpha_th, alpha_w, gamma, n_episodes):
    # REINFORCE (Monte Carlo policy gradient) with a learned state-value baseline.
    policy = ExponentialSoftmax(env.observation_space_size *
                                env.action_space_size)
    v = LinearValueFunction(env.observation_space_size)

    returns = []
    for episode in range(n_episodes):
        done = False
        obs = env.reset()
        all_sa_pairs = [encode_sa_pair(obs, a, env.observation_space_size,
                                       env.action_space_size)
                        for a in range(env.action_space_size)]
        a = policy.sample_action(all_sa_pairs)
        states = [obs]
        actions = [a]
        rewards = [None]

        while not done:
            obs, reward, done = env.step(a)
            all_sa_pairs = [encode_sa_pair(obs, a, env.observation_space_size,
                                           env.action_space_size)
                            for a in range(env.action_space_size)]
            a = policy.sample_action(all_sa_pairs)
            states.append(obs)
            actions.append(a)
            rewards.append(reward)

        for t in range(len(states)):
            # Discounted return from time t (rewards[0] is a placeholder).
            G_t = sum(gamma**(k - t - 1) * rewards[k]
                      for k in range(t + 1, len(rewards)))
            x_t = encode_state(states[t], env.observation_space_size)
            delta = G_t - v.evaluate(x_t)
            v.weights += alpha_w * (gamma**t) * delta * x_t
            all_sa_pairs = [encode_sa_pair(states[t], a, env.observation_space_size,
                                           env.action_space_size)
                            for a in range(env.action_space_size)]
            # Policy-gradient step scaled by the baseline-adjusted return delta.
            policy.weights += (alpha_th * (gamma**t) * delta *
                               policy.eligibility_vector(actions[t], all_sa_pairs))

        returns.append(sum(rewards[1:]))
        print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return (policy, np.array(returns))
Example #14
    def compute_max_qval_action_pair(self, state, lookahead,
                                     prev_action_weight):
        # Refresh off-plan edge weights for the valid actions, then return the
        # maximum Q-value and a randomly chosen maximizing action.
        state_hash = encode_state(state)
        predicted_returns = defaultdict(float)
        actions = self.valid_actions_getter.get(state)
        for action in actions:
            # expansion...
            edge_weight = prev_action_weight * self.off_plan_punish_factor
            if self.weights[state_hash][action] < edge_weight:
                self.weights[state_hash][action] = edge_weight

        for action in actions:
            q_s_a = self.get_q_value(state, action, lookahead)
            predicted_returns[action] = q_s_a

        max_q_val = max(predicted_returns.values())
        best_actions = list()
        for action_name in predicted_returns:
            if predicted_returns[action_name] == max_q_val:
                best_actions.append(action_name)

        best_action = random.choice(best_actions)

        return max_q_val, best_action
Example #15
    for region in policy.keys():
        policy[region] = [a / sum(policy[region]) for a in policy[region]]

    return policy


# Main function runs the test using a stored NN and outputs a CSV file
if __name__ == "__main__":

    r = sys.argv[1]
    n = int(sys.argv[2])

    states = generate_all()
    inputs = np.zeros((512 * 3 * 3, 3, 3, 2))
    for s, state in enumerate(states):
        encode_state(inputs[s], state)

    for i in range(n):

        nn = tf.keras.models.load_model("nets/NN_{0}_{1}.h5".format(r, i),
                                        custom_objects={'tf': tf})
        policy = policy_test(nn, states, inputs)
        print("Policy {} {} evaluated!".format(r, i))

        with open("policy_dists/policy_{0}_{1}.csv".format(r, i), "w") as f:

            for region in policy.keys():
                f.write("".join([str(x) for x in region]) + ", " +
                        ", ".join([str(y) for y in policy[region]]) + "\n")

    print("Done!")
Example #16
    def next_action(self):
        # perception
        state = self.services.perception.get_state()
        state_hash = encode_state(state)

        # remember
        self.update(self.prev_state_hash, self.prev_action, state_hash)

        # check if done
        self.check_goals(state)
        if len(self.uncompleted_goals) == 0:
            save_obj(self.transitions, self.env_name + "_transitions")
            save_obj(self.state_action_transition_count,
                     self.env_name + "_state_action_transition_count")
            return None

        # choose
        if self.plan is not None:
            if (self.prev_action.upper() not in self.plan and
                    self.weights[self.prev_state_hash][self.prev_action] <=
                    self.last_in_plan_transition_weight *
                    self.off_plan_punish_factor ** self.lookahead):
                self.plan = None

        if self.plan is not None:
            action = self.choose(state)

            self.prev_action = action
            self.prev_state_hash = state_hash
            return self.prev_action

        applicable_actions = self.valid_actions_getter.get(state)
        possible_next_states = defaultdict(None)
        for applicable_action in applicable_actions:
            next_state = my_apply_action_to_state(state, applicable_action,
                                                  self.services.parser)
            possible_next_states[applicable_action] = encode_state(next_state)

        actions_leading_to_not_seen_states = [
            action_key for action_key in possible_next_states
            if possible_next_states[action_key] not in self.visited_states_hash
        ]

        if len(actions_leading_to_not_seen_states) == 0:
            self.prev_state_hash = None
            self.prev_action = None
            self.visited_states_hash = set()
            self.plan = None
            return self.next_action()

        if len(actions_leading_to_not_seen_states) == 1:
            self.prev_state_hash = state_hash
            self.prev_action = actions_leading_to_not_seen_states.pop(0)
            return self.prev_action

        if self.plan is None:
            self.make_plan(state)

            action = self.choose(state)

            self.prev_state_hash = state_hash
            self.prev_action = action
            return self.prev_action

        return None