def get_episode(mdp, start, episode_length, policy):
    trace = []
    rewards = []
    actions = []
    current_state = start
    for j in range(episode_length):
        trace.append(current_state)
        next_action = policy(current_state)
        actions.append(next_action)
        current_reward = mdp.next_reward(current_state, next_action)
        rewards.append(current_reward)
        # reuse the sampled action; re-querying a stochastic policy could differ
        current_state = mdp.next_state(current_state, next_action)
    return trace, rewards, actions
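A minimal usage sketch for `get_episode`; the `ToyMDP` class and random policy below are hypothetical stand-ins for whatever MDP and policy objects a caller would supply:

import random

class ToyMDP:
    """Hypothetical two-state MDP used only to illustrate the call."""

    def next_reward(self, state, action):
        return 1.0 if state == "A" and action == "stay" else 0.0

    def next_state(self, state, action):
        return state if action == "stay" else ("B" if state == "A" else "A")

random_policy = lambda state: random.choice(["stay", "move"])
trace, rewards, actions = get_episode(ToyMDP(), "A", episode_length=5,
                                      policy=random_policy)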
Example #2
    def sample_episode(
        self,
        policy: Policy,
        start_state: Optional[_State] = None,
        max_len: Optional[int] = None
    ) -> List[Tuple[_State, _Action, float, float]]:
        """

        :param policy:
        :param start_state:
        :param max_len:
        :return: trajectory wherein each member is (state, action, reward, action-probability)
        """
        self.reset(start_state)
        idx = 1
        trajectory = []
        while (max_len is None) or (idx <= max_len):
            idx += 1
            state = self.get_state()
            weighted_action = policy(
                self.censor_state(state)).sample_weighted()
            action = weighted_action.value
            reward, new_state, terminal = self.sample(action)
            trajectory.append((self.censor_state(state), action, reward,
                               weighted_action.weight))
            if terminal:
                break
        return trajectory
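The (state, action, reward, action-probability) tuples returned above can be folded into a discounted return; a small sketch, with the discount factor `gamma` as an assumed caller-supplied parameter:

def discounted_return(trajectory, gamma=1.0):
    # trajectory: list of (state, action, reward, action_probability) tuples
    # as produced by sample_episode; returns sum_t gamma**t * reward_t.
    g = 0.0
    for _state, _action, reward, _p_action in reversed(trajectory):
        g = reward + gamma * g
    return g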
Example #3
def episode(env, policy):
    state = env.reset()
    state, reward, done = env.check_after_init()
    while (not done):
        action = policy(state)
        state, reward, done = env.step(action)

    return reward  # only the final reward is returned; intermediate rewards are discarded
Example #4
 def apply_policy(self, policy: Policy) -> float:
     if not self.system.is_terminal(self.state):
         action = policy(self.state).sample()
         reward, new_state = self.sample_result(self.state, action)
         self.state = new_state
         return reward
     else:
         return 0.
Example #5
    def optimize_policy(
            self,
            initial: Policy,
            eval_threshold: float = 0.01,
            use_value_update: bool = False,
            greedy_prob: float = 0.5,
            action_stability_margin: float = 0.0001,
            round_v: Optional[int] = None
    ) -> Tuple[Policy, Dict[_State, float]]:
        if use_value_update:
            V, policy = self.evaluate_policy(
                initial.clone(),
                threshold=eval_threshold,
                greedy_prob=greedy_prob,
                action_stability_margin=action_stability_margin,
                round_v=round_v)
        else:
            stable = False
            policy = initial.clone()
            iteration = 1
            while not stable:
                print("Commencing policy optimization iteration: {}".format(
                    iteration))
                changed_actions = 0
                V, _ = self.evaluate_policy(policy,
                                            threshold=eval_threshold,
                                            round_v=round_v)
                stable = True
                for state in self.system.states:
                    # sampling from a deterministic policy is deterministic
                    old_action = policy(state).sample()
                    best_val = float('-inf')
                    best_action = None
                    for action in self.system.actions:
                        action_reward = float('-inf')
                        outcomes = self.system.dynamics.get((state, action))
                        if outcomes is not None:
                            action_reward = expectation(
                                outcomes,
                                lambda outcome: outcome[0] + self.discount * V[
                                    outcome[1]])
                        if action_reward > best_val + action_stability_margin:
                            best_val = action_reward
                            best_action = action
                    if best_action is not None and best_action != old_action:
                        stable = False
                        policy.update(state, best_action)
                        changed_actions += 1
                print("{} actions changed after iteration {}".format(
                    changed_actions, iteration))
                iteration += 1

        return policy, V
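`optimize_policy` (and `evaluate_policy` shown later) relies on an `expectation` helper that is not part of this snippet; a minimal sketch, assuming `system.dynamics[(state, action)]` maps each `(reward, next_state)` outcome to its probability:

def expectation(outcome_dist, fn):
    # Expected value of fn(outcome) under a distribution given as
    # {outcome: probability}; an empty or missing distribution yields 0.
    if not outcome_dist:
        return 0.0
    return sum(prob * fn(outcome) for outcome, prob in outcome_dist.items())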
Example #6
    def td_update_tabular(self,
                          v_table,
                          num_e=100,
                          discount_factor=0.9,
                          alpha=0.05,
                          mode="training",
                          state_trans=False):
        env = self.env
        policy = self.policy
        #v_table = np.zeros(self.state_n)
        td_errors = []
        for i_episode in range(num_e):
            state = env.reset()
            not_done = True

            while not_done:

                action = policy(self.action_space, state, v_table)

                # Take one step based on the action choosing:
                next_state, reward, done = env.step(action)

                if not state_trans:
                    x, y = next_state
                    next_state2 = env.state_transform(x, y)
                    x, y = state
                    state2 = env.state_transform(x, y)
                else:
                    # states are assumed to already be table indices
                    next_state2, state2 = next_state, state

                # TD update
                target = reward + discount_factor * v_table[next_state2]

                td_error = target - v_table[state2]

                if mode == "training":
                    v_table[state2] += alpha * td_error
                    #print("state v", v_table[state2])
                elif mode == "testing":
                    td_errors.append(td_error)
                state = next_state

                if done:
                    break
        result = 0
        if mode == "training":
            result = v_table
        elif mode == "testing":
            result = td_errors

        return result
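The body of the inner loop above is the tabular TD(0) rule; the same update written as a standalone sketch (the names here are illustrative and not part of the original class):

def td0_update(v_table, state, reward, next_state, alpha=0.05, gamma=0.9):
    # V(s) <- V(s) + alpha * [r + gamma * V(s') - V(s)]
    td_error = reward + gamma * v_table[next_state] - v_table[state]
    v_table[state] += alpha * td_error
    return td_error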
Example #7
 def simulate(self, policy, id=None):
     states, rewards, actions, next_states, terminates = [], [], [], [], []
     s = self.env.reset(id)
     terminate = False
     while not terminate:
         a = policy(s.unsqueeze(0)).view(-1)
         next_state, r, terminate = self.env.step(a)
         states.append(s)
         rewards.append(r)
         actions.append(onehots(a, self.env.w))
         next_states.append(next_state)
         terminates.append(terminate)
         s = next_state
     return states, rewards, actions, next_states, terminates
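`onehots` is not defined in this snippet; a minimal PyTorch sketch compatible with the call above, assuming `a` holds integer action indices and `self.env.w` is the number of discrete actions:

import torch

def onehots(a, width):
    # One-hot encode integer action(s) `a` into float vectors of length `width`.
    return torch.nn.functional.one_hot(a.long().view(-1), num_classes=width).float()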
Example #8
def monte_carlo(env, policy, first_visit, num_episodes):
    # size of v = (rawsum, number of distinct trumps, dealer's hand)
    v = np.zeros((61, 4, 10), dtype=float)
    num_updates = np.zeros((61, 4, 10), dtype=float)

    for _ in tqdm(range(num_episodes)):
        # print("====================== NEW EPISODE ======================")
        states = []
        state = env.reset()
        state, reward, done = env.check_after_init()
        if done:
            # no actionable state encountered in this episode so no update
            continue
        states.append(copy(state))
        while (not done):
            action = policy(state)
            state, reward, done = env.step(action)
            states.append(copy(state))

        if states[-1] is not None:
            raise Exception("last state in episode is actionable, CHECK")
        states = states[:-1]

        for s in states:
            if s.category == "BUST" or s.category == "SUM31":
                raise Exception("states within an episode are not actionable")

        # updating value function
        if first_visit:
            states = list(set(states))
        for state in states:
            transformed_state = state_transformation(state)
            v[transformed_state] += reward
            num_updates[transformed_state] += 1

    # the small constant avoids division by zero; states that were never
    # visited keep a value of (approximately) zero
    v = v / (num_updates + 1e-5)

    return v
Example #9
    def evaluate_policy(self,
                        policy: Policy,
                        threshold: float = 0.01,
                        greedy_prob: float = 0.,
                        action_stability_margin: float = 0.0001,
                        round_v: Optional[int] = None) -> Tuple[Dict[_State, float], Policy]:
        if self.V is None:
            self.V = {state: 0. for state in self.system.states}
        max_error = threshold * 2
        iteration = 1
        policy_stable = (greedy_prob == 0.)
        while (max_error > threshold) or not policy_stable:
            stabilize_policy = (max_error <= threshold)
            if stabilize_policy:
                print("Stabilizing policy")
            policy_changed = False
            max_error = 0.
            for state in self.system.states:
                if not self.system.is_terminal(state):
                    action_dist = policy(state)
                    greedy = stabilize_policy or (random.random() < greedy_prob)
                    if greedy:
                        best_action = None
                        best_reward = float('-inf')
                        for action in self.system.actions:
                            action_reward = expectation(
                                self.system.dynamics.get((state, action), []),
                                lambda outcome: outcome[
                                    0] + self.discount * self.V[outcome[1]])
                            if action_reward > best_reward + action_stability_margin:
                                best_reward = action_reward
                                best_action = action
                        v = best_reward
                        first = next(iter(action_dist))
                        if (first.weight < 1.) or (first.value != best_action):
                            policy.update(state, best_action)
                            policy_changed = True
                    else:
                        v = 0.
                        for action in action_dist:
                            action_reward = expectation(
                                self.system.dynamics.get((state, action.value),
                                                         []),
                                lambda outcome: outcome[
                                    0] + self.discount * self.V[outcome[1]])
                            pi_action = action.weight
                            v += pi_action * action_reward
                else:
                    v = 0.
                error = abs(v - self.V[state])
                if error > max_error:
                    max_error = error
                self.V[state] = v if round_v is None else (round(v, round_v))
            print("After {} iterations max_error is {}".format(
                iteration, max_error))
            iteration += 1

            if stabilize_policy and not policy_changed:
                policy_stable = True
                print("Policy is stable")
        return self.V, policy
Example #10
from render import *

NUM_AGENTS = 4
ENV = 'env_0'
loc_dict = {0: loc(1, 1), 1: loc(5, 5), 2: loc(5, 1), 3: loc(1, 5)}
DIM = (7, 7)
SCALE = 25

# setup env
e = Env(NUM_AGENTS, ENV, DIM, _loc_dict=loc_dict, _obs_type='one_hot')
# initialize pygame
window = window(SCALE, (e.height, e.width))

policies = {}
for i in range(0, e.num_agents):
    policies[i] = policy(_e0=1)

for n in range(0, 10000):
    for event in pygame.event.get():
        if (event.type == pygame.QUIT):
            quit()
    window.render(e, 0.5)
    # rgb = pygame.surfarray.array3d(screen)
    action_list = []
    for i in range(0, e.num_agents):
        action_space = e.action_space(i)
        opt_action = policies[i].action(action_space)
        action_list.append(opt_action)
        print(action_dict[opt_action])
    observation, reward, done = e.step(action_list)
    for i in range(0, e.num_agents):
Example #11
    def run_one_tabular(self, discount_factor=0.9):
        # regular q_learning update
        env = self.env
        policy = self.policy
        D = []
        Return = []
        Trajectory = []

        q_table = self.policy_table
        policy = policy(q_table)

        for i_episode in range(self.num_e):
            returns = 0.000
            state = env.reset()
            trajectory = []
            for turn in itertools.count():

                # Get the action properties for each one

                x, y = state
                state2 = env.state_transform(x, y)

                action_p = policy(state2)
                # Choose the action
                action_index = 1
                try:
                    action_index = np.random.choice(range(self.action_n), p=action_p)

                except:
                    print("error")

                action = self.action_space[action_index]
                trajectory.append([state2, action_index])

                # Take one step based on the action
                next_state, reward, done = env.step(action)

                returns = returns + discount_factor ** turn * reward
                x, y = next_state
                next_state2 = env.state_transform(x, y)

                if done:
                    Return.append(returns)
                    Trajectory.append(trajectory)
                    #result.append([trajectory, returns])
                    break

                state = next_state

        #split result into training and testing
        n_D = len(Trajectory)

        testing_index = int(n_D/self.RATIO)
        training_r = Return[:testing_index]
        testing_r = Return[testing_index:]
        training = Trajectory[:testing_index]
        testing = Trajectory[testing_index:]


        D_training = [training,training_r]
        D_testing = [testing,testing_r]


        best_safety_policy = self.quasi_seldonian(0.5, D_training, D_testing, len(D_testing[0]), self.delta)
        return best_safety_policy, np.mean(Return)
Example #12
    def run_Tabular(self,
                    alpha=0.005,
                    beta=0.0001,
                    discount_factor=0.9,
                    decay=True):
        # regular q_learning update
        env = self.env
        policy = self.policy
        result = []

        v_table = np.zeros((self.state_n, 1))  # critic
        p_table = np.zeros((self.state_n, self.action_n))  # actor
        policy = policy(p_table, 1, self.action_n, ifcontinue=False)

        for i_episode in range(self.num_e):
            returns = 0.000
            state = env.reset()
            not_done = True

            if decay and i_episode >= 80:
                epsilon = 1.0 / (i_episode + 1)**2
                epsilon = 0
                policy = self.policy(p_table,
                                     0,
                                     self.action_n,
                                     ifcontinue=False)
                decay = False

            episodes = []
            for turn in itertools.count():

                # Get the action properties for each one

                x, y = state
                state2 = env.state_transform(x, y)

                action_p = policy(state2)
                # Choose the action
                action_index = np.random.choice(range(self.action_n),
                                                p=action_p)
                action = self.action_space[action_index]

                # Take one step based on the action
                next_state, reward, done = env.step(action)

                returns = returns + discount_factor**turn * reward
                x, y = next_state
                next_state2 = env.state_transform(x, y)

                episodes.append((state2, action_index, reward))

                if done:
                    #print("one episode", returns)
                    result.append(returns)
                    break

                state = next_state

            last_reward = 0
            update = 0

            for i in range(len(episodes)):

                # Get the action properties for each one
                state2, action_index, reward = episodes[i]

                if i + 1 < len(episodes):
                    next_state2, next_action_index, _ = episodes[i + 1]
                    v_next_state = v_table[next_state2]

                else:
                    v_next_state = 0
                # Update the q_function
                #target = reward + discount_factor * v_next_state
                #td_error = target - v_table[state2]
                if i == 0:
                    update = returns
                else:
                    update = (update -
                              last_reward) / discount_factor - v_table[state2]
                    #update = (update - discount_factor**(turn-1)*last_reward) - v_table[state2]
                # Update
                p_table[state2][action_index] += alpha * update
                #v_table[state2] += beta * td_error
                last_reward = reward

        return result
Example #13
    def run(self,
            alpha=0.005,
            discount_factor=1,
            lambda2=0,
            epsilon=0.05,
            decay=False):

        results = []
        normalized = self.normalized
        featurized = self.featurized

        #todo: didn't write "decay" case
        env = self.env
        policy = self.policy

        critic = Util.Linear_Approximator(normalized,
                                          featurized,
                                          lambda2=lambda2,
                                          n_feature=self.n_featurized,
                                          alpha=alpha,
                                          action_n=self.action_n)
        critic.initial()

        policy = policy(critic.perdict,
                        epsilon,
                        self.action_n,
                        ifcontinue=True)
        for i_e in range(self.num_e):
            state = env.reset()
            state2 = normalized(state)
            phi_state = featurized(state2)
            returns = 0

            action_p = policy(phi_state)
            action_index = np.random.choice(range(self.action_n), p=action_p)
            action = self.action_space[action_index]

            for turn in itertools.count():
                next_state, reward, done = env.step(action)
                next_state2 = normalized(next_state)
                phi_next_state = featurized(next_state2)

                action_p = policy(phi_next_state)
                next_action_index = np.random.choice(range(self.action_n),
                                                     p=action_p)
                next_action = self.action_space[next_action_index]

                # Calculate the discounted return
                returns = returns + discount_factor**turn * reward

                if done:
                    target = reward + discount_factor * 0
                    critic.update(phi_state, target, action_index)
                    results.append(returns)
                    break
                if turn == 1000:
                    results.append(returns)
                    break

                # Function approximation TD update
                q_next_state = critic.perdict(phi_next_state, a=-1)
                target = reward + discount_factor * q_next_state[
                    next_action_index]
                q_phi_sate = critic.perdict(phi_state, action_index)
                td_error = target - q_phi_sate
                critic.update(phi_state, target, action_index)

                state = next_state
                action = next_action
                action_index = next_action_index
                phi_state = phi_next_state

        #print("q")
        #print(np.mean(results), "max", np.max(results))
        return results
Example #14
    def td_update_continue(self,
                           weight,
                           num_e=100,
                           discount_factor=1,
                           alpha=0.05,
                           mode="training",
                           degree=3):
        env = self.env
        policy = self.policy
        a_n = self.action_n

        #n_out_features = Util.get_n_features(degree)
        # weight = np.zeros(n_out_features)

        v_w = lambda x: weight.dot(x)
        dv_w = lambda x: x
        #featurized = lambda x: Util.Fourier_Kernel(x, degree)
        featurized = self.featurized

        td_errors = []

        for i_episode in range(num_e):
            state = env.reset()
            state2 = env.normalize(state)
            phi_state = featurized([state2])
            phi_state = phi_state[0]
            not_done = True

            while not_done:
                action = policy(self.action_space, phi_state, v_w)
                next_state, reward, done = env.step(action)

                next_state2 = env.normalize(next_state)
                #phi_next_state = featurized([next_state2])
                phi_next_state = featurized(next_state2)
                phi_next_state = phi_next_state[0]

                # Function approximation TD update
                target = reward + discount_factor * v_w(phi_next_state)
                td_error = target - v_w(phi_state)

                if mode == "training":
                    weight += alpha * td_error * dv_w(phi_state)
                    try:
                        a = weight.dot(phi_state)
                    except:
                        print("error")

                    v_w = lambda x: weight.dot(x)
                    dv_w = lambda x: x

                elif mode == "testing":
                    td_errors.append(td_error)

                state = next_state
                phi_state = phi_next_state
                if done:
                    break

        result = 0

        if mode == "training":
            result = weight
        elif mode == "testing":
            result = td_errors
        return result
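The featurizer comes from `self.featurized` (the commented-out `Util.Fourier_Kernel` suggests a Fourier basis); a minimal sketch of such a featurizer for states normalized to [0, 1]^d, given purely to illustrate the assumed interface:

import itertools

import numpy as np

def fourier_features(states, degree=3):
    # states: array-like of shape (n, d) with entries normalized to [0, 1].
    # Returns an (n, (degree + 1) ** d) array of cos(pi * c . x) features,
    # one column per integer coefficient vector c in {0, ..., degree}^d.
    states = np.atleast_2d(np.asarray(states, dtype=float))
    d = states.shape[1]
    coeffs = np.array(list(itertools.product(range(degree + 1), repeat=d)))
    return np.cos(np.pi * states.dot(coeffs.T))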
Example #15
    def rollout(self,
                policy: policy.BasePolicy,
                max_step: int = 100,
                frame_skip: int = 0,
                gamma: float = 1.0):

        self.race.restart()
        self.race.step(pystk.Action())
        self.track.update()

        result = list()

        state = pystk.WorldState()
        state.update()

        # r_total?
        r_total = 0

        # distance
        d = state.karts[0].distance_down_track

        # s
        s = np.array(self.race.render_data[0].image)

        off_track = deque(maxlen=20)
        traveled = deque(maxlen=50)

        for it in range(max_step):
            # Early termination.
            if it > 20 and (np.median(traveled) < 0.05 or all(off_track)):
                break

            velocity = np.linalg.norm(state.karts[0].velocity)
            action, action_index, p_action = policy(s, velocity)

            if isinstance(action, pystk.Action):
                action_raw = [action.steer, action.acceleration, action.drift]
            else:
                action_raw = action

                action = pystk.Action()
                action.steer = action_raw[0]
                action.acceleration = np.clip(action_raw[1] - velocity, 0,
                                              np.inf)
                action.drift = action_raw[2] > 0.5

            for _ in range(1 + frame_skip):
                self.race.step(action)
                self.track.update()

                state = pystk.WorldState()
                state.update()

            s_p = np.array(self.race.render_data[0].image)

            d_new = min(state.karts[0].distance_down_track, d + 5.0)
            node_idx = np.searchsorted(self.track.path_distance[:, 1], d_new %
                                       self.track.path_distance[-1, 1]) % len(
                                           self.track.path_nodes)
            a_b = self.track.path_nodes[node_idx]

            distance = point_from_line(state.karts[0].location, a_b[0], a_b[1])
            distance_traveled = get_distance(d_new, d,
                                             self.track.path_distance[-1, 1])
            gain = distance_traveled if distance_traveled > 0 else 0
            mult = int(distance < 6.0)

            traveled.append(gain)
            off_track.append(distance > 6.0)

            r_total = max(r_total, d_new * mult)
            r = np.clip(0.5 * max(mult * gain, 0) + 0.5 * mult, -1.0, 1.0)

            result.append(
                Data(s.copy(), np.float32(action_raw),
                     np.uint8([action_index]), np.float32([p_action]),
                     np.float32([r]), s_p.copy(), np.float32([np.nan]),
                     np.float32([0])))

            d = d_new
            s = s_p

        G = 0

        # Ugly.
        for i, data in enumerate(reversed(result)):
            G = data.r + gamma * G
            result[-(i + 1)] = Data(data.s, data.a, data.a_i, data.p_a, data.r,
                                    data.sp, np.float32([G]),
                                    np.float32([i == 0]))

        # HACK PLEASE REMEMBER THIS
        return result[4:], r_total / self.track.path_distance[-1, 1]
Example #16
    def sarsa_continue(self,
                       weight,
                       num_e=100,
                       discount_factor=1,
                       alpha=0.05,
                       lambda2=0,
                       epsilon=0.05,
                       mode="training",
                       step_limit=1008,
                       decay=False,
                       more=""):
        env = self.env
        policy = self.policy
        a_n = self.action_n
        td_errors = []
        results = []
        q_w = lambda x: weight.dot(x)
        dq_w = lambda x: x
        featurized = self.featurized

        if lambda2 <= 0:
            policy = policy(q_w, epsilon, self.action_n, ifcontinue=True)
            for i_episode in range(num_e):
                state = env.reset()
                state2 = env.normalize(state)
                phi_state = featurized(state2)

                not_done = True
                turn = 0
                returns = 0

                #choose the first action
                action_p = policy(phi_state)
                # Choose the action
                action_index = np.random.choice(range(self.action_n),
                                                p=action_p)
                action = self.action_space[action_index]
                if decay and i_episode == 80:
                    # stop exploration
                    policy = self.policy
                    epsilon = 0
                    policy = policy(q_w,
                                    epsilon,
                                    self.action_n,
                                    ifcontinue=True)
                    decay = False

                while not_done:

                    next_state, reward, done = env.step(action)
                    returns = returns + discount_factor**turn * reward

                    next_state2 = env.normalize(next_state)
                    phi_next_state = featurized(next_state2)

                    # choose the next action
                    action_p = policy(phi_next_state)

                    next_action_index = np.random.choice(range(self.action_n),
                                                         p=action_p)
                    next_action = self.action_space[next_action_index]

                    if done:
                        q_phi_sate = q_w(phi_state)[action_index]
                        q_phi_next_state = q_w(phi_next_state)
                        if turn > step_limit:
                            target = reward + discount_factor * q_phi_next_state[
                                next_action_index]
                        else:
                            target = reward + discount_factor * 0
                        td_error = target - q_phi_sate
                        temp = alpha * td_error * dq_w(phi_state)
                        weight[action_index] += temp
                        q_w = lambda x: weight.dot(x)
                        dq_w = lambda x: x

                        results.append(returns)
                        break

                    # Function approximation TD_update
                    q_phi_next_state = q_w(phi_next_state)
                    target = reward + discount_factor * q_phi_next_state[
                        next_action_index]
                    q_phi_sate = q_w(phi_state)[action_index]
                    td_error = target - q_phi_sate

                    if mode == "training":
                        temp = alpha * td_error * dq_w(phi_state)

                        a = weight.dot(phi_state)
                        weight[action_index] += temp

                        q_w = lambda x: weight.dot(x)
                        b = q_w(phi_state)
                        dq_w = lambda x: x

                    elif mode == "testing":
                        td_errors.append(td_error)

                    state = next_state
                    phi_state = phi_next_state
                    action = next_action
                    action_index = next_action_index
                    turn += 1

        print("sarsa")
        print(np.mean(results), "max", np.max(results))

        return results
Example #17
    def run_Tabular(self,
                    alpha=0.005,
                    beta=0.0001,
                    discount_factor=0.9,
                    decay=True,
                    lambda2=0.8):
        # regular q_learning update
        env = self.env
        policy = self.policy
        result = []

        v_table = np.zeros((self.state_n, 1))  # critic
        p_table = np.zeros((self.state_n, self.action_n))  # actor
        policy = policy(p_table, 1, self.action_n, ifcontinue=False)

        e_table_v = np.zeros((self.state_n, 1))
        e_table_p = np.zeros((self.state_n, self.action_n))

        for i_episode in range(self.num_e):
            returns = 0.000
            turn = 0
            state = env.reset()
            not_done = True

            if decay and i_episode >= 80:
                epsilon = 1.0 / (i_episode + 1)**2
                epsilon = 0
                policy = self.policy(p_table,
                                     0,
                                     self.action_n,
                                     ifcontinue=False)
                decay = False
            for turn in itertools.count():

                # Get the action properties for each one

                x, y = state
                state2 = env.state_transform(x, y)

                action_p = policy(state2)
                # Choose the action
                action_index = np.random.choice(range(self.action_n),
                                                p=action_p)
                action = self.action_space[action_index]

                # Take one step based on the action
                next_state, reward, done = env.step(action)

                returns = returns + discount_factor**turn * reward
                x, y = next_state
                next_state2 = env.state_transform(x, y)
                # Update the q_function
                target = reward + discount_factor * v_table[next_state2]
                td_error = target - v_table[state2]

                # Update
                # critic
                #p_table[state2][action_index] += alpha * td_error
                # actor
                #v_table[state2] += beta * td_error

                e_table_v[state2][0] = e_table_v[state2][0] + 1
                v_table += alpha * e_table_v * td_error
                e_table_v *= discount_factor * lambda2

                e_table_p[state2][
                    action_index] = e_table_p[state2][action_index] + 1
                p_table += alpha * e_table_p * td_error
                e_table_p *= discount_factor * lambda2

                if done:
                    result.append(returns)
                    break

                state = next_state
        return result
Example #18
    def q_learning_continue(self,
                            weight,
                            num_e=100,
                            discount_factor=1,
                            alpha=0.05,
                            lambda2=0,
                            epsilon=0.05,
                            mode="training",
                            step_limit=1008,
                            decay=False,
                            more=""):
        env = self.env
        policy = self.policy
        a_n = self.action_n
        td_errors = []
        results = []
        sample_states = []
        q_w = lambda x: weight.dot(x)
        dq_w = lambda x: x
        featurized = self.featurized

        if lambda2 <= 0:
            policy = policy(q_w, epsilon, self.action_n, ifcontinue=True)
            for i_episode in range(num_e):
                state = env.reset()
                state2 = env.normalize(state)
                phi_state = featurized(state2)
                not_done = True

                turn = 0
                returns = 0

                if decay and i_episode == 80:
                    # stop exploration

                    policy = self.policy
                    epsilon = 0
                    policy = policy(q_w,
                                    epsilon,
                                    self.action_n,
                                    ifcontinue=True)
                    decay = False

                while not_done:
                    action_p = policy(phi_state)
                    # Choose the action
                    action_index = np.random.choice(range(self.action_n),
                                                    p=action_p)
                    action = self.action_space[action_index]

                    next_state, reward, done = env.step(action)
                    sample_states.append(np.array(next_state))

                    returns = returns + discount_factor**turn * reward
                    next_state2 = env.normalize(next_state)
                    phi_next_state = featurized(next_state2)

                    if decay and turn > step_limit:
                        #stop exploration

                        policy = self.policy
                        epsilon = 0
                        policy = policy(q_w,
                                        epsilon,
                                        self.action_n,
                                        ifcontinue=True)
                        decay = False

                    if done:
                        if turn > step_limit:
                            target = reward + discount_factor * max(
                                q_w(phi_next_state))
                        else:
                            target = reward + discount_factor * 0
                        q_phi_sate = q_w(phi_state)[action_index]
                        td_error = target - q_phi_sate
                        temp = alpha * td_error * dq_w(phi_state)
                        weight[action_index] += temp
                        q_w = lambda x: weight.dot(x)
                        dq_w = lambda x: x
                        results.append(returns)
                        break
                    # Function approximation TD_update
                    target = 0
                    try:
                        q_phi_next_state = q_w(phi_next_state)
                        target = reward + discount_factor * max(
                            q_phi_next_state)
                    except:
                        print("error")
                    q_phi_sate = q_w(phi_state)[action_index]
                    td_error = target - q_phi_sate

                    if mode == "training":
                        temp = alpha * td_error * dq_w(phi_state)
                        b = weight.dot(phi_state)
                        weight[action_index] += temp
                        try:
                            a = weight.dot(phi_state)
                        except:
                            print("error")

                        q_w = lambda x: weight.dot(x)
                        dq_w = lambda x: x

                    elif mode == "testing":
                        td_errors.append(td_error)

                    state = next_state

                    phi_state = phi_next_state
                    turn += 1

        sample_states = np.array(sample_states)
        print("q")
        print("min: ", np.min(sample_states, axis=0))
        print("max: ", np.max(sample_states, axis=0))
        print("mean: ", np.mean(sample_states, axis=0))
        print(np.mean(results), "max", np.max(results))
        return results
Example #19
    def sarsa_tabular(self,
                      num_e=100,
                      discount_factor=0.9,
                      alpha=0.05,
                      lambda2=0,
                      epsilon=0.05,
                      state_trans=False,
                      decay=True):
        # regular q_learning update
        env = self.env
        policy = self.policy
        result = []

        if lambda2 <= 0:
            q_table = np.zeros((self.state_n, self.action_n))
            policy = policy(q_table, epsilon, self.action_n, ifcontinue=False)

            for i_episode in range(num_e):
                returns = 0.000
                turn = 0
                state = env.reset()
                not_done = True

                if decay and i_episode >= 80:
                    epsilon = 0
                    policy = self.policy(q_table,
                                         epsilon,
                                         self.action_n,
                                         ifcontinue=False)
                    decay = False

                x, y = state
                state2 = env.state_transform(x, y)
                action_p = policy(state2)
                action_index = np.random.choice(range(self.action_n),
                                                p=action_p)
                action = self.action_space[action_index]

                while not_done:
                    next_state, reward, done = env.step(action)
                    returns = returns + discount_factor**turn * reward

                    if not state_trans:
                        x, y = next_state
                        next_state2 = env.state_transform(x, y)
                        x, y = state
                        state2 = env.state_transform(x, y)
                    else:
                        next_state2, state2 = next_state, state

                    # choose the next action from the next state (SARSA)
                    action_p = policy(next_state2)
                    next_action_index = np.random.choice(range(self.action_n),
                                                         p=action_p)
                    next_action = self.action_space[next_action_index]

                    if done:
                        target = reward + discount_factor * 0
                        td_error = target - q_table[state2][action_index]
                        q_table[state2][action_index] += alpha * td_error
                        result.append(returns)
                        break

                    # Update the q_function
                    target = reward + discount_factor * q_table[next_state2][
                        next_action_index]
                    td_error = target - q_table[state2][action_index]
                    q_table[state2][action_index] += alpha * td_error

                    state = next_state
                    action_index = next_action_index
                    action = next_action
                    turn += 1
        return result
Example #20
    def q_learning_tabular(self,
                           num_e=100,
                           discount_factor=0.9,
                           alpha=0.05,
                           lambda2=0,
                           epsilon=0.05,
                           state_trans=False,
                           degree=3,
                           decay=True):
        # regular q_learning update
        env = self.env
        policy = self.policy
        result = []

        if lambda2 <= 0:
            q_table = np.zeros((self.state_n, self.action_n))
            policy = policy(q_table, epsilon, self.action_n, ifcontinue=False)

            for i_episode in range(num_e):
                returns = 0.000
                turn = 0
                state = env.reset()
                not_done = True

                if decay and i_episode >= 80:
                    epsilon = 1.0 / (i_episode + 1)**2
                    epsilon = 0
                    policy = self.policy(q_table,
                                         epsilon,
                                         self.action_n,
                                         ifcontinue=False)
                    decay = False
                while not_done:

                    # Get the action properties for each one

                    x, y = state
                    state2 = env.state_transform(x, y)

                    action_p = policy(state2)
                    # Choose the action
                    action_index = np.random.choice(range(self.action_n),
                                                    p=action_p)
                    action = self.action_space[action_index]

                    #action = policy(self.action_space, state, q_table)

                    # Take one step based on the action
                    next_state, reward, done = env.step(action)

                    returns = returns + discount_factor**turn * reward

                    if not state_trans:
                        x, y = next_state
                        next_state2 = env.state_transform(x, y)
                    else:
                        next_state2 = next_state

                    # Update the q_function
                    target = reward + discount_factor * np.max(
                        q_table[next_state2])
                    td_error = target - q_table[state2][action_index]
                    q_table[state2][action_index] += alpha * td_error

                    if done:
                        result.append(returns)
                        break

                    state = next_state
                    turn += 1

        elif lambda2 > 0:
            q_table = np.zeros((self.state_n, self.action_space.n))
            e_table = np.zeros((self.state_n, self.action_space.n))
            policy = policy(q_table, epsilon, self.action_space.n)
            # returns = np.zeros(num_e)

            for i_episode in range(num_e):
                returns = 0.000
                turn = 0
                state = env.reset()
                not_done = True
                while not_done:

                    # Get the action properties for each one
                    action_p = policy(state)
                    # Choose the action
                    action_index = np.random.choice(range(self.action_n),
                                                    p=action_p)
                    action = self.action_space[action_index]

                    # Take one step based on the action
                    next_state, reward, done = env.step(action)
                    returns = returns + discount_factor**turn * reward

                    if not state_trans:
                        x, y = next_state
                        next_state2 = env.state_transform(x, y)
                        x, y = state
                        state2 = env.state_transform(x, y)
                    else:
                        next_state2, state2 = next_state, state

                    # Update the q_function
                    target = reward + discount_factor * np.max(
                        q_table[next_state2])
                    td_error = target - q_table[state2][action_index]
                    q_table[state2][action_index] += alpha * td_error

                    # update e:
                    e_table[state2][
                        action_index] = e_table[state2][action_index] + 1
                    q_table += alpha * e_table * td_error
                    e_table *= discount_factor * lambda2

                    if done:
                        result.append(returns)
                        break

                    state = next_state
                    turn += 1
        return result
Example #21
def k_step_TD(env, policy, k, alpha, num_episodes):
    # size of v = (rawsum, number of distinct trumps, dealer's hand)
    v = np.zeros((61, 4, 10), dtype=float)

    for _ in tqdm(range(num_episodes)):
        # print("====================== NEW EPISODE ======================")
        states = []
        state = env.reset()
        state, reward, done = env.check_after_init()
        if done:
            # no actionable state encountered in this episode so no update
            continue
        states.append(copy(state))

        # take k-1 steps
        for _ in range(k - 1):
            action = policy(state)
            state, reward, done = env.step(action)
            if done:
                break
            states.append(copy(state))

        if not done:
            assert len(states) == k, "number of states not correct"

            while True:
                action = policy(state)
                state, reward, done = env.step(action)
                if done:
                    break
                assert (
                    reward == 0), "reward is non-zero for intermediate states"
                # update S_t, remove from states list and add S_t+k to the states list
                initial_state = state_transformation(states[0])
                final_state = state_transformation(state)
                v[initial_state] += alpha * (reward + v[final_state] -
                                             v[initial_state])
                states = states[1:] + [copy(state)]

        assert states[-1] is not None, "states[-1] is None"

        # if states[-1] != None:
        #     raise Exception("last state in episode is actionable, CHECK")
        # states = states[:-1]

        for s in states:
            assert (s.category == "GENERAL"
                    ), "states within an episode are not actionable"
            # if s.category=="BUST" or s.category=="SUM31":
            #     raise Exception("states within an episode are not actionable")
            # else:
            #     s.print()

        # updating value of states after reaching end of episode
        for s in states:
            initial_state = state_transformation(s)
            v[initial_state] += alpha * (
                reward - v[initial_state]
            )  # last state is not actionable so its value is zero

    return v
Example #22
    def run_tabular(self,
                    discount_factor=0.9,
                    alpha=0.05,
                    lambda2=0.5,
                    epsilon=0.05,
                    state_trans=False,
                    decay=True):
        # regular q_learning update
        env = self.env
        policy = self.policy
        result = []

        q_table = np.zeros((self.state_n, self.action_n))
        e_table = np.zeros((self.state_n, self.action_n))
        policy = policy(q_table, epsilon, self.action_n, ifcontinue=False)

        for i_episode in range(self.num_e):
            returns = 0.000
            turn = 0
            state = env.reset()
            not_done = True

            if decay and i_episode >= 80:
                epsilon = 0
                policy = self.policy(q_table,
                                     epsilon,
                                     self.action_n,
                                     ifcontinue=False)
                decay = False

            x, y = state
            state2 = env.state_transform(x, y)
            action_p = policy(state2)
            action_index = np.random.choice(range(self.action_n), p=action_p)
            action = self.action_space[action_index]

            for turn in itertools.count():
                next_state, reward, done = env.step(action)
                returns = returns + discount_factor**turn * reward

                x, y = next_state
                next_state2 = env.state_transform(x, y)
                x, y = state
                state2 = env.state_transform(x, y)

                # choose the next action from the next state (SARSA)
                action_p = policy(next_state2)
                next_action_index = np.random.choice(range(self.action_n),
                                                     p=action_p)
                next_action = self.action_space[next_action_index]

                # Update the q_function
                target = reward + discount_factor * q_table[next_state2][
                    next_action_index]
                td_error = target - q_table[state2][action_index]
                #q_table[state2][action_index] += alpha * td_error

                # update the eligibility trace, then apply the trace-weighted TD update
                e_table[state2][action_index] += 1
                q_table += alpha * e_table * td_error
                e_table *= discount_factor * lambda2

                if done:
                    result.append(returns)
                    break

                state = next_state
                action_index = next_action_index
                action = next_action
        return result
Example #23
"""
Library of interesting policies

policy(state) -> GameAD


Note: make sure at least one legal move is suggested by the policy
"""
from policy import *
from examples.mancala import GameState


def random_policy():
    """random moves"""
    @Policy
    def _random_policy(state: GameState):
        board_size = len(state)
        return [1 for _ in range(board_size)]

    return _random_policy
Example #24
def generate_data(env, phi, policy, episode_limit=None, step_limit=None):
	"""
	Generate data for the environment, given a policy, and a function 
	approximator, up to the number of steps or episodes specified. 

	At least one of `episode_limit` or `step_limit` has to be specified. 

	Parameters
	----------
	env : Environment
		An environment from which to generate the data. 

	phi : function 
		A function which maps observations to feature vectors.

	policy : function
		A policy function which maps observations to actions.

	episode_limit : int (optional)
		An integer which specifies how many episodes of data to generate. 

	step_limit : int (optional)
		An integer which specifies how many steps of data to generate.
	
	Returns
	-------
	obs_lst : list of observations 

	fvec_lst : list of feature vectors

	act_lst : list of actions  

	reward_lst :  list of rewards 
	"""
	if episode_limit is None:
		episode_limit = sys.maxsize
	if step_limit is None:
		step_limit = sys.maxsize 

	# Check that at least one of `episode_limit` or `step_limit` has been given
	assert(episode_limit < sys.maxsize or step_limit < sys.maxsize)
	episode_count 	= 0
	step_count 		= 0	

	# Set up the data containers
	obs_lst 		= []
	fvec_lst 		= []
	act_lst 		= []
	reward_lst 		= []

	while (step_count < step_limit and episode_count < episode_limit):
			# Take a single step according to the policy
			obs 			= env.observe()
			fvec 			= phi(obs)
			act 			= policy(fvec)
			reward, obs_p 	= env.do(act)
			# Record a step of the episode
			obs_lst.append(obs) 
			fvec_lst.append(fvec)
			act_lst.append(act)
			reward_lst.append(reward)
			
			if env.is_terminal():
				step_count += 1
				fvec 	= phi(obs_p)
				act 	= policy(fvec)
				reward  = 0

				# Record terminal state data
				obs_lst.append(obs_p)
				fvec_lst.append(fvec)
				act_lst.append(act)
				reward_lst.append(reward)
				env.reset()
				episode_count += 1

			step_count += 1

	return obs_lst, fvec_lst, act_lst, reward_lst
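A small usage sketch for `generate_data`; the `ChainEnv`, `phi`, and `policy` below are hypothetical stand-ins that only need to satisfy the `observe`/`do`/`is_terminal`/`reset` interface the function relies on (the snippet itself also assumes `import sys` for `sys.maxsize`):

import numpy as np

class ChainEnv:
    """Hypothetical 5-state chain environment matching the assumed interface."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.pos = 0

    def observe(self):
        return self.pos

    def do(self, action):
        self.pos += 1 if action == 1 else 0
        reward = 1.0 if self.pos >= 4 else 0.0
        return reward, self.pos

    def is_terminal(self):
        return self.pos >= 4

phi = lambda obs: np.eye(5)[min(obs, 4)]  # one-hot features over the 5 positions
policy = lambda fvec: 1                   # always step right
obs_lst, fvec_lst, act_lst, reward_lst = generate_data(ChainEnv(), phi, policy,
                                                       episode_limit=3)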