def get_episode(mdp, start, episode_length, policy):
    trace = []
    rewards = []
    actions = []
    current_state = start
    for j in range(episode_length):
        trace.append(current_state)
        next_action = policy(current_state)
        actions.append(next_action)
        current_reward = mdp.next_reward(current_state, next_action)
        rewards.append(current_reward)
        # Reuse the sampled action: calling policy() a second time could pick a
        # different action under a stochastic policy.
        current_state = mdp.next_state(current_state, next_action)
    return trace, rewards, actions
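# A minimal usage sketch (not part of the original code): get_episode only
# assumes an MDP object exposing next_state / next_reward; ToyMDP and the
# constant policy below are hypothetical stand-ins.
class ToyMDP:
    def next_state(self, state, action):
        # Walk around a 5-state ring.
        return (state + action) % 5

    def next_reward(self, state, action):
        # Reward 1 when stepping out of state 4, else 0.
        return 1.0 if state == 4 else 0.0


trace, rewards, actions = get_episode(ToyMDP(), start=0, episode_length=10,
                                      policy=lambda s: 1)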
def sample_episode(
        self, policy: Policy, start_state: Optional[_State] = None,
        max_len: Optional[int] = None
) -> List[Tuple[_State, _Action, float, float]]:
    """
    :param policy: policy used to sample an action in each visited state
    :param start_state: optional state to reset the system to before sampling
    :param max_len: optional cap on the episode length
    :return: trajectory wherein each member is (state, action, reward, action-probability)
    """
    self.reset(start_state)
    idx = 1
    trajectory = []
    while (max_len is None) or (idx <= max_len):
        idx += 1
        state = self.get_state()
        weighted_action = policy(self.censor_state(state)).sample_weighted()
        action = weighted_action.value
        reward, new_state, terminal = self.sample(action)
        trajectory.append((self.censor_state(state), action, reward,
                           weighted_action.weight))
        if terminal:
            break
    return trajectory
def episode(env, policy):
    state = env.reset()
    state, reward, done = env.check_after_init()
    while not done:
        action = policy(state)
        state, reward, done = env.step(action)
    return reward
def apply_policy(self, policy: Policy) -> float:
    if not self.system.is_terminal(self.state):
        action = policy(self.state).sample()
        reward, new_state = self.sample_result(self.state, action)
        self.state = new_state
        return reward
    else:
        return 0.
def optimize_policy(
        self, initial: Policy, eval_threshold: float = 0.01,
        use_value_update: bool = False, greedy_prob: float = 0.5,
        action_stability_margin: float = 0.0001,
        round_v: Optional[int] = None
) -> Tuple[Policy, Dict[_State, float]]:
    if use_value_update:
        V, policy = self.evaluate_policy(
            initial.clone(), threshold=eval_threshold,
            greedy_prob=greedy_prob,
            action_stability_margin=action_stability_margin,
            round_v=round_v)
    else:
        stable = False
        policy = initial.clone()
        iteration = 1
        while not stable:
            print("Commencing policy optimization iteration: {}".format(iteration))
            changed_actions = 0
            V, _ = self.evaluate_policy(policy, threshold=eval_threshold,
                                        round_v=round_v)
            stable = True
            for state in self.system.states:
                # Sampling from a deterministic policy is deterministic.
                old_action = policy(state).sample()
                best_val = float('-inf')
                best_action = None
                for action in self.system.actions:
                    action_reward = float('-inf')
                    outcomes = self.system.dynamics.get((state, action))
                    if outcomes is not None:
                        action_reward = expectation(
                            outcomes,
                            lambda outcome: outcome[0] + self.discount * V[outcome[1]])
                    if action_reward > best_val + action_stability_margin:
                        best_val = action_reward
                        best_action = action
                if best_action != old_action:
                    stable = False
                    policy.update(state, best_action)
                    changed_actions += 1
            print("{} actions changed after iteration {}".format(changed_actions, iteration))
            iteration += 1
    return policy, V
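# A minimal sketch (not part of the original code) of the greedy-improvement
# step that optimize_policy performs for each state. The (probability, reward,
# next_state) outcome format and all names here are hypothetical; the original
# code uses its own `expectation` helper and dynamics layout.
def greedy_action(state, actions, dynamics, V, discount):
    best_action, best_val = None, float('-inf')
    for action in actions:
        outcomes = dynamics.get((state, action), [])
        if not outcomes:
            continue
        # Expected one-step return: sum over outcomes of p * (r + discount * V[s']).
        val = sum(p * (r + discount * V[s_next]) for p, r, s_next in outcomes)
        if val > best_val:
            best_val, best_action = val, action
    return best_action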
def td_update_tabular(self, v_table, num_e=100, discount_factor=0.9,
                      alpha=0.05, mode="training", state_trans=False):
    env = self.env
    policy = self.policy
    td_errors = []
    for i_episode in range(num_e):
        state = env.reset()
        not_done = True
        while not_done:
            action = policy(self.action_space, state, v_table)
            # Take one step based on the chosen action.
            next_state, reward, done = env.step(action)
            if not state_trans:
                x, y = next_state
                next_state2 = env.state_transform(x, y)
                x, y = state
                state2 = env.state_transform(x, y)
            # TD(0) update
            target = reward + discount_factor * v_table[next_state2]
            td_error = target - v_table[state2]
            if mode == "training":
                v_table[state2] += alpha * td_error
            elif mode == "testing":
                td_errors.append(td_error)
            state = next_state
            if done:
                break
    result = 0
    if mode == "training":
        result = v_table
    elif mode == "testing":
        result = td_errors
    return result
def simulate(self, policy, id=None):
    states, rewards, actions, next_states, terminates = [], [], [], [], []
    s = self.env.reset(id)
    terminate = False
    while not terminate:
        a = policy(s.unsqueeze(0)).view(-1)
        next_state, r, terminate = self.env.step(a)
        states.append(s)
        rewards.append(r)
        actions.append(onehots(a, self.env.w))
        next_states.append(next_state)
        terminates.append(terminate)
        s = next_state
    return states, rewards, actions, next_states, terminates
def monte_carlo(env, policy, first_visit, num_episodes):
    # size of v = (rawsum, number of distinct trumps, dealer's hand)
    v = np.zeros((61, 4, 10), dtype=float)
    num_updates = np.zeros((61, 4, 10), dtype=float)
    for _ in tqdm(range(num_episodes)):
        states = []
        state = env.reset()
        state, reward, done = env.check_after_init()
        if done:
            # No actionable state encountered in this episode, so no update.
            continue
        states.append(copy(state))
        while not done:
            action = policy(state)
            state, reward, done = env.step(action)
            states.append(copy(state))
        if states[-1] is not None:
            raise Exception("last state in episode is actionable, CHECK")
        states = states[:-1]
        for s in states:
            if s.category == "BUST" or s.category == "SUM31":
                raise Exception("states within an episode are not actionable")
        # Update the value function with the episode's final reward.
        if first_visit:
            states = list(set(states))
        for state in states:
            transformed_state = state_transformation(state)
            v[transformed_state] += reward
            num_updates[transformed_state] += 1
    # The 1e-5 avoids division by zero; states that were never updated stay
    # near their initial value, so they remain identifiable.
    v = v / (num_updates + 1e-5)
    return v
def evaluate_policy(self, policy: Policy, threshold: float = 0.01,
                    greedy_prob: float = 0.,
                    action_stability_margin: float = 0.0001,
                    round_v: Optional[int] = None
                    ) -> Tuple[Dict[_State, float], Policy]:
    if self.V is None:
        self.V = {state: 0. for state in self.system.states}
    max_error = threshold * 2
    iteration = 1
    policy_stable = (greedy_prob == 0.)
    while (max_error > threshold) or not policy_stable:
        stabilize_policy = (max_error <= threshold)
        if stabilize_policy:
            print("Stabilizing policy")
        policy_changed = False
        max_error = 0.
        for state in self.system.states:
            if not self.system.is_terminal(state):
                action_dist = policy(state)
                greedy = stabilize_policy or (random.random() < greedy_prob)
                if greedy:
                    best_action = None
                    best_reward = float('-inf')
                    for action in self.system.actions:
                        action_reward = expectation(
                            self.system.dynamics.get((state, action), []),
                            lambda outcome: outcome[0] + self.discount * self.V[outcome[1]])
                        if action_reward > best_reward + action_stability_margin:
                            best_reward = action_reward
                            best_action = action
                    v = best_reward
                    first = next(iter(action_dist))
                    if (first.weight < 1.) or (first.value != best_action):
                        policy.update(state, best_action)
                        policy_changed = True
                else:
                    v = 0.
                    for action in action_dist:
                        action_reward = expectation(
                            self.system.dynamics.get((state, action.value), []),
                            lambda outcome: outcome[0] + self.discount * self.V[outcome[1]])
                        pi_action = action.weight
                        v += pi_action * action_reward
            else:
                v = 0.
            error = abs(v - self.V[state])
            if error > max_error:
                max_error = error
            self.V[state] = v if round_v is None else round(v, round_v)
        print("After {} iterations max_error is {}".format(iteration, max_error))
        iteration += 1
        if stabilize_policy and not policy_changed:
            policy_stable = True
            print("Policy is stable")
    return self.V, policy
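# A minimal sketch (not part of the original code) of the Bellman expectation
# backup that evaluate_policy sweeps over all states until the largest change
# drops below `threshold`. The (probability, reward, next_state) outcome format
# and all names are hypothetical; `pi(state)` is assumed to return a dict
# mapping actions to probabilities.
def bellman_backup(state, pi, dynamics, V, discount):
    v = 0.0
    for action, pi_a in pi(state).items():
        outcomes = dynamics.get((state, action), [])
        v += pi_a * sum(p * (r + discount * V[s_next]) for p, r, s_next in outcomes)
    return v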
from render import *

NUM_AGENTS = 4
ENV = 'env_0'
loc_dict = {0: loc(1, 1), 1: loc(5, 5), 2: loc(5, 1), 3: loc(1, 5)}
DIM = (7, 7)
SCALE = 25

# setup env
e = Env(NUM_AGENTS, ENV, DIM, _loc_dict=loc_dict, _obs_type='one_hot')

# initialize pygame
window = window(SCALE, (e.height, e.width))

policies = {}
for i in range(0, e.num_agents):
    policies[i] = policy(_e0=1)

for n in range(0, 10000):
    for event in pygame.event.get():
        if event.type == pygame.QUIT:
            quit()
    window.render(e, 0.5)
    # rgb = pygame.surfarray.array3d(screen)
    action_list = []
    for i in range(0, e.num_agents):
        action_space = e.action_space(i)
        opt_action = policies[i].action(action_space)
        action_list.append(opt_action)
        print(action_dict[opt_action])
    observation, reward, done = e.step(action_list)
    for i in range(0, e.num_agents):
def run_one_tabular(self, discount_factor=0.9):
    env = self.env
    policy = self.policy
    Return = []
    Trajectory = []
    q_table = self.policy_table
    policy = policy(q_table)
    for i_episode in range(self.num_e):
        returns = 0.000
        state = env.reset()
        trajectory = []
        for turn in itertools.count():
            # Get the action probabilities for the current state.
            x, y = state
            state2 = env.state_transform(x, y)
            action_p = policy(state2)
            # Choose the action
            action_index = 1
            try:
                action_index = np.random.choice(range(self.action_n), p=action_p)
            except ValueError:
                print("error")
            action = self.action_space[action_index]
            trajectory.append([state2, action_index])
            # Take one step based on the action
            next_state, reward, done = env.step(action)
            returns = returns + discount_factor ** turn * reward
            x, y = next_state
            next_state2 = env.state_transform(x, y)
            if done:
                Return.append(returns)
                Trajectory.append(trajectory)
                break
            state = next_state
    # Split the collected data into training and testing sets.
    n_D = len(Trajectory)
    testing_index = int(n_D / self.RATIO)
    training_r = Return[:testing_index]
    testing_r = Return[testing_index:]
    training = Trajectory[:testing_index]
    testing = Trajectory[testing_index:]
    D_training = [training, training_r]
    D_testing = [testing, testing_r]
    best_safety_policy = self.quasi_seldonian(0.5, D_training, D_testing,
                                              len(D_testing[0]), self.delta)
    return best_safety_policy, np.mean(Return)
def run_Tabular(self, alpha=0.005, beta=0.0001, discount_factor=0.9, decay=True):
    env = self.env
    policy = self.policy
    result = []
    v_table = np.zeros((self.state_n, 1))  # critic
    p_table = np.zeros((self.state_n, self.action_n))  # actor
    policy = policy(p_table, 1, self.action_n, ifcontinue=False)
    for i_episode in range(self.num_e):
        returns = 0.000
        state = env.reset()
        not_done = True
        if decay and i_episode >= 80:
            # Switch exploration off entirely after 80 episodes.
            epsilon = 0
            policy = self.policy(p_table, 0, self.action_n, ifcontinue=False)
            decay = False
        episodes = []
        for turn in itertools.count():
            # Get the action probabilities for the current state.
            x, y = state
            state2 = env.state_transform(x, y)
            action_p = policy(state2)
            # Choose the action
            action_index = np.random.choice(range(self.action_n), p=action_p)
            action = self.action_space[action_index]
            # Take one step based on the action
            next_state, reward, done = env.step(action)
            returns = returns + discount_factor ** turn * reward
            x, y = next_state
            next_state2 = env.state_transform(x, y)
            episodes.append((state2, action_index, reward))
            if done:
                result.append(returns)
                break
            state = next_state
        # Replay the episode and update the actor.
        last_reward = 0
        update = 0
        for i in range(len(episodes)):
            state2, action_index, reward = episodes[i]
            if i + 1 < len(episodes):
                next_state2, next_action_index, _ = episodes[i + 1]
                v_next_state = v_table[next_state2]
            else:
                v_next_state = 0
            # Critic update (currently disabled):
            # target = reward + discount_factor * v_next_state
            # td_error = target - v_table[state2]
            # v_table[state2] += beta * td_error
            if i == 0:
                update = returns
            else:
                update = (update - last_reward) / discount_factor - v_table[state2]
            p_table[state2][action_index] += alpha * update
            last_reward = reward
    return result
def run(self, alpha=0.005, discount_factor=1, lambda2=0, epsilon=0.05, decay=False):
    results = []
    normalized = self.normalized
    featurized = self.featurized
    # TODO: the "decay" case is not implemented.
    env = self.env
    policy = self.policy
    critic = Util.Linear_Approximator(normalized, featurized, lambda2=lambda2,
                                      n_feature=self.n_featurized, alpha=alpha,
                                      action_n=self.action_n)
    critic.initial()
    policy = policy(critic.perdict, epsilon, self.action_n, ifcontinue=True)
    for i_e in range(self.num_e):
        state = env.reset()
        state2 = normalized(state)
        phi_state = featurized(state2)
        returns = 0
        action_p = policy(phi_state)
        action_index = np.random.choice(range(self.action_n), p=action_p)
        action = self.action_space[action_index]
        for turn in itertools.count():
            next_state, reward, done = env.step(action)
            next_state2 = normalized(next_state)
            phi_next_state = featurized(next_state2)
            action_p = policy(phi_next_state)
            next_action_index = np.random.choice(range(self.action_n), p=action_p)
            next_action = self.action_space[next_action_index]
            # Calculate the discounted return.
            returns = returns + discount_factor ** turn * reward
            if done:
                target = reward + discount_factor * 0
                critic.update(phi_state, target, action_index)
                results.append(returns)
                break
            if turn == 1000:
                results.append(returns)
                break
            # Function-approximation TD update with a SARSA target.
            q_next_state = critic.perdict(phi_next_state, a=-1)
            target = reward + discount_factor * q_next_state[next_action_index]
            q_phi_state = critic.perdict(phi_state, action_index)
            td_error = target - q_phi_state
            critic.update(phi_state, target, action_index)
            state = next_state
            action = next_action
            action_index = next_action_index
            phi_state = phi_next_state
    return results
def td_update_continue(self, weight, num_e=100, discount_factor=1, alpha=0.05,
                       mode="training", degree=3):
    env = self.env
    policy = self.policy
    a_n = self.action_n
    # Linear value function and its gradient with respect to the weights.
    v_w = lambda x: weight.dot(x)
    dv_w = lambda x: x
    featurized = self.featurized
    td_errors = []
    for i_episode in range(num_e):
        state = env.reset()
        state2 = env.normalize(state)
        phi_state = featurized([state2])
        phi_state = phi_state[0]
        not_done = True
        while not_done:
            action = policy(self.action_space, phi_state, v_w)
            next_state, reward, done = env.step(action)
            next_state2 = env.normalize(next_state)
            phi_next_state = featurized(next_state2)
            phi_next_state = phi_next_state[0]
            # Function-approximation TD(0) update.
            target = reward + discount_factor * v_w(phi_next_state)
            td_error = target - v_w(phi_state)
            if mode == "training":
                # weight is updated in place, so v_w and dv_w stay valid.
                weight += alpha * td_error * dv_w(phi_state)
            elif mode == "testing":
                td_errors.append(td_error)
            state = next_state
            phi_state = phi_next_state
            if done:
                break
    result = 0
    if mode == "training":
        result = weight
    elif mode == "testing":
        result = td_errors
    return result
def rollout(self, policy: policy.BasePolicy, max_step: int = 100,
            frame_skip: int = 0, gamma: float = 1.0):
    self.race.restart()
    self.race.step(pystk.Action())
    self.track.update()

    result = list()

    state = pystk.WorldState()
    state.update()

    # Best progress along the track seen so far.
    r_total = 0
    # Distance travelled down the track.
    d = state.karts[0].distance_down_track
    # Current camera image (the policy's observation).
    s = np.array(self.race.render_data[0].image)

    off_track = deque(maxlen=20)
    traveled = deque(maxlen=50)

    for it in range(max_step):
        # Early termination.
        if it > 20 and (np.median(traveled) < 0.05 or all(off_track)):
            break

        velocity = np.linalg.norm(state.karts[0].velocity)
        action, action_index, p_action = policy(s, velocity)

        if isinstance(action, pystk.Action):
            action_raw = [action.steer, action.acceleration, action.drift]
        else:
            action_raw = action
            action = pystk.Action()
            action.steer = action_raw[0]
            action.acceleration = np.clip(action_raw[1] - velocity, 0, np.inf)
            action.drift = action_raw[2] > 0.5

        for _ in range(1 + frame_skip):
            self.race.step(action)
            self.track.update()

        state = pystk.WorldState()
        state.update()

        s_p = np.array(self.race.render_data[0].image)
        d_new = min(state.karts[0].distance_down_track, d + 5.0)

        node_idx = np.searchsorted(
            self.track.path_distance[:, 1],
            d_new % self.track.path_distance[-1, 1]) % len(self.track.path_nodes)
        a_b = self.track.path_nodes[node_idx]
        distance = point_from_line(state.karts[0].location, a_b[0], a_b[1])
        distance_traveled = get_distance(d_new, d, self.track.path_distance[-1, 1])
        gain = distance_traveled if distance_traveled > 0 else 0
        mult = int(distance < 6.0)

        traveled.append(gain)
        off_track.append(distance > 6.0)

        r_total = max(r_total, d_new * mult)
        r = np.clip(0.5 * max(mult * gain, 0) + 0.5 * mult, -1.0, 1.0)

        result.append(
            Data(s.copy(), np.float32(action_raw), np.uint8([action_index]),
                 np.float32([p_action]), np.float32([r]), s_p.copy(),
                 np.float32([np.nan]), np.float32([0])))

        d = d_new
        s = s_p

    # Fill in the discounted returns with a backward pass over the episode.
    G = 0
    for i, data in enumerate(reversed(result)):
        G = data.r + gamma * G
        result[-(i + 1)] = Data(data.s, data.a, data.a_i, data.p_a, data.r,
                                data.sp, np.float32([G]),
                                np.float32([i == 0]))  # HACK PLEASE REMEMBER THIS

    return result[4:], r_total / self.track.path_distance[-1, 1]
def sarsa_continue(self, weight, num_e=100, discount_factor=1, alpha=0.05,
                   lambda2=0, epsilon=0.05, mode="training", step_limit=1008,
                   decay=False, more=""):
    env = self.env
    policy = self.policy
    a_n = self.action_n
    td_errors = []
    results = []
    # Linear action-value function and its gradient with respect to the weights.
    q_w = lambda x: weight.dot(x)
    dq_w = lambda x: x
    featurized = self.featurized
    if lambda2 <= 0:
        policy = policy(q_w, epsilon, self.action_n, ifcontinue=True)
        for i_episode in range(num_e):
            state = env.reset()
            state2 = env.normalize(state)
            phi_state = featurized(state2)
            not_done = True
            turn = 0
            returns = 0
            # Choose the first action.
            action_p = policy(phi_state)
            action_index = np.random.choice(range(self.action_n), p=action_p)
            action = self.action_space[action_index]
            if decay and i_episode == 80:
                # Stop exploration after 80 episodes.
                policy = self.policy
                epsilon = 0
                policy = policy(q_w, epsilon, self.action_n, ifcontinue=True)
                decay = False
            while not_done:
                next_state, reward, done = env.step(action)
                returns = returns + discount_factor ** turn * reward
                next_state2 = env.normalize(next_state)
                phi_next_state = featurized(next_state2)
                # Choose the next action.
                action_p = policy(phi_next_state)
                next_action_index = np.random.choice(range(self.action_n), p=action_p)
                next_action = self.action_space[next_action_index]
                if done:
                    q_phi_state = q_w(phi_state)[action_index]
                    q_phi_next_state = q_w(phi_next_state)
                    if turn > step_limit:
                        target = reward + discount_factor * q_phi_next_state[next_action_index]
                    else:
                        target = reward + discount_factor * 0
                    td_error = target - q_phi_state
                    # weight is updated in place, so q_w and dq_w stay valid.
                    weight[action_index] += alpha * td_error * dq_w(phi_state)
                    results.append(returns)
                    break
                # Function-approximation TD update with a SARSA target.
                q_phi_next_state = q_w(phi_next_state)
                target = reward + discount_factor * q_phi_next_state[next_action_index]
                q_phi_state = q_w(phi_state)[action_index]
                td_error = target - q_phi_state
                if mode == "training":
                    weight[action_index] += alpha * td_error * dq_w(phi_state)
                elif mode == "testing":
                    td_errors.append(td_error)
                state = next_state
                phi_state = phi_next_state
                action = next_action
                action_index = next_action_index
                turn += 1
    print("sarsa")
    print(np.mean(results), "max", np.max(results))
    return results
def run_Tabular(self, alpha=0.005, beta=0.0001, discount_factor=0.9,
                decay=True, lambda2=0.8):
    # actor-critic update with eligibility traces
    env = self.env
    policy = self.policy
    result = []
    v_table = np.zeros((self.state_n, 1))  # critic
    p_table = np.zeros((self.state_n, self.action_n))  # actor
    policy = policy(p_table, 1, self.action_n, ifcontinue=False)
    e_table_v = np.zeros((self.state_n, 1))
    e_table_p = np.zeros((self.state_n, self.action_n))
    for i_episode in range(self.num_e):
        returns = 0.000
        turn = 0
        state = env.reset()
        not_done = True
        if decay and i_episode >= 80:
            # Switch exploration off entirely after 80 episodes.
            epsilon = 0
            policy = self.policy(p_table, 0, self.action_n, ifcontinue=False)
            decay = False
        for turn in itertools.count():
            # Get the action probabilities for the current state.
            x, y = state
            state2 = env.state_transform(x, y)
            action_p = policy(state2)
            # Choose the action
            action_index = np.random.choice(range(self.action_n), p=action_p)
            action = self.action_space[action_index]
            # Take one step based on the action
            next_state, reward, done = env.step(action)
            returns = returns + discount_factor ** turn * reward
            x, y = next_state
            next_state2 = env.state_transform(x, y)
            # TD error for the critic.
            target = reward + discount_factor * v_table[next_state2]
            td_error = target - v_table[state2]
            # Critic update with an accumulating eligibility trace.
            e_table_v[state2][0] = e_table_v[state2][0] + 1
            v_table += alpha * e_table_v * td_error
            e_table_v *= discount_factor * lambda2
            # Actor update with an accumulating eligibility trace.
            e_table_p[state2][action_index] = e_table_p[state2][action_index] + 1
            p_table += alpha * e_table_p * td_error
            e_table_p *= discount_factor * lambda2
            if done:
                result.append(returns)
                break
            state = next_state
    return result
def q_learning_continue(self, weight, num_e=100, discount_factor=1, alpha=0.05,
                        lambda2=0, epsilon=0.05, mode="training",
                        step_limit=1008, decay=False, more=""):
    env = self.env
    policy = self.policy
    a_n = self.action_n
    td_errors = []
    results = []
    sample_states = []
    # Linear action-value function and its gradient with respect to the weights.
    q_w = lambda x: weight.dot(x)
    dq_w = lambda x: x
    featurized = self.featurized
    if lambda2 <= 0:
        policy = policy(q_w, epsilon, self.action_n, ifcontinue=True)
        for i_episode in range(num_e):
            state = env.reset()
            state2 = env.normalize(state)
            phi_state = featurized(state2)
            not_done = True
            turn = 0
            returns = 0
            if decay and i_episode == 80:
                # Stop exploration after 80 episodes.
                policy = self.policy
                epsilon = 0
                policy = policy(q_w, epsilon, self.action_n, ifcontinue=True)
                decay = False
            while not_done:
                action_p = policy(phi_state)
                # Choose the action
                action_index = np.random.choice(range(self.action_n), p=action_p)
                action = self.action_space[action_index]
                next_state, reward, done = env.step(action)
                sample_states.append(np.array(next_state))
                returns = returns + discount_factor ** turn * reward
                next_state2 = env.normalize(next_state)
                phi_next_state = featurized(next_state2)
                if decay and turn > step_limit:
                    # Stop exploration once the step limit is exceeded.
                    policy = self.policy
                    epsilon = 0
                    policy = policy(q_w, epsilon, self.action_n, ifcontinue=True)
                    decay = False
                if done:
                    if turn > step_limit:
                        target = reward + discount_factor * max(q_w(phi_next_state))
                    else:
                        target = reward + discount_factor * 0
                    q_phi_state = q_w(phi_state)[action_index]
                    td_error = target - q_phi_state
                    # weight is updated in place, so q_w and dq_w stay valid.
                    weight[action_index] += alpha * td_error * dq_w(phi_state)
                    results.append(returns)
                    break
                # Function-approximation TD update with a Q-learning target.
                q_phi_next_state = q_w(phi_next_state)
                target = reward + discount_factor * max(q_phi_next_state)
                q_phi_state = q_w(phi_state)[action_index]
                td_error = target - q_phi_state
                if mode == "training":
                    weight[action_index] += alpha * td_error * dq_w(phi_state)
                elif mode == "testing":
                    td_errors.append(td_error)
                state = next_state
                phi_state = phi_next_state
                turn += 1
    sample_states = np.array(sample_states)
    print("q")
    print("min: ", np.min(sample_states, axis=0))
    print("max: ", np.max(sample_states, axis=0))
    print("mean: ", np.mean(sample_states, axis=0))
    print(np.mean(results), "max", np.max(results))
    return results
def sarsa_tabular(self, num_e=100, discount_factor=0.9, alpha=0.05, lambda2=0,
                  epsilon=0.05, state_trans=False, decay=True):
    # tabular SARSA update
    env = self.env
    policy = self.policy
    result = []
    if lambda2 <= 0:
        q_table = np.zeros((self.state_n, self.action_n))
        policy = policy(q_table, epsilon, self.action_n, ifcontinue=False)
        for i_episode in range(num_e):
            returns = 0.000
            turn = 0
            state = env.reset()
            not_done = True
            if decay and i_episode >= 80:
                # Switch exploration off after 80 episodes.
                epsilon = 0
                policy = self.policy(q_table, epsilon, self.action_n, ifcontinue=False)
                decay = False
            x, y = state
            state2 = env.state_transform(x, y)
            action_p = policy(state2)
            action_index = np.random.choice(range(self.action_n), p=action_p)
            action = self.action_space[action_index]
            while not_done:
                next_state, reward, done = env.step(action)
                returns = returns + discount_factor ** turn * reward
                if not state_trans:
                    x, y = next_state
                    next_state2 = env.state_transform(x, y)
                    x, y = state
                    state2 = env.state_transform(x, y)
                # Choose the next action from the *next* state (on-policy SARSA).
                action_p = policy(next_state2)
                next_action_index = np.random.choice(range(self.action_n), p=action_p)
                next_action = self.action_space[next_action_index]
                if done:
                    target = reward + discount_factor * 0
                    td_error = target - q_table[state2][action_index]
                    q_table[state2][action_index] += alpha * td_error
                    result.append(returns)
                    break
                # SARSA update.
                target = reward + discount_factor * q_table[next_state2][next_action_index]
                td_error = target - q_table[state2][action_index]
                q_table[state2][action_index] += alpha * td_error
                state = next_state
                action_index = next_action_index
                action = next_action
                turn += 1
    return result
def q_learning_tabular(self, num_e=100, discount_factor=0.9, alpha=0.05,
                       lambda2=0, epsilon=0.05, state_trans=False, degree=3,
                       decay=True):
    # regular q_learning update
    env = self.env
    policy = self.policy
    result = []
    if lambda2 <= 0:
        q_table = np.zeros((self.state_n, self.action_n))
        policy = policy(q_table, epsilon, self.action_n, ifcontinue=False)
        for i_episode in range(num_e):
            returns = 0.000
            turn = 0
            state = env.reset()
            not_done = True
            if decay and i_episode >= 80:
                # Switch exploration off entirely after 80 episodes.
                epsilon = 0
                policy = self.policy(q_table, epsilon, self.action_n, ifcontinue=False)
                decay = False
            while not_done:
                # Get the action probabilities for the current state.
                x, y = state
                state2 = env.state_transform(x, y)
                action_p = policy(state2)
                # Choose the action
                action_index = np.random.choice(range(self.action_n), p=action_p)
                action = self.action_space[action_index]
                # Take one step based on the action
                next_state, reward, done = env.step(action)
                returns = returns + discount_factor ** turn * reward
                if not state_trans:
                    x, y = next_state
                    next_state2 = env.state_transform(x, y)
                # Q-learning update.
                target = reward + discount_factor * np.max(q_table[next_state2])
                td_error = target - q_table[state2][action_index]
                q_table[state2][action_index] += alpha * td_error
                if done:
                    result.append(returns)
                    break
                state = next_state
                turn += 1
    elif lambda2 > 0:
        # Q(lambda)-style update with an accumulating eligibility trace.
        q_table = np.zeros((self.state_n, self.action_space.n))
        e_table = np.zeros((self.state_n, self.action_space.n))
        policy = policy(q_table, epsilon, self.action_space.n)
        for i_episode in range(num_e):
            returns = 0.0
            turn = 0
            state = env.reset()
            not_done = True
            while not_done:
                # Get the action probabilities and choose an action.
                action_p = policy(state)
                action_index = np.random.choice(range(self.action_space.n), p=action_p)
                action = self.action_space[action_index]
                # Take one step based on the action
                next_state, reward, done = env.step(action)
                returns = returns + discount_factor ** turn * reward
                if not state_trans:
                    x, y = next_state
                    next_state2 = env.state_transform(x, y)
                    x, y = state
                    state2 = env.state_transform(x, y)
                # Q-learning target and eligibility-trace update.
                target = reward + discount_factor * np.max(q_table[next_state2])
                td_error = target - q_table[state2][action_index]
                q_table[state2][action_index] += alpha * td_error
                e_table[state2][action_index] = e_table[state2][action_index] + 1
                q_table += alpha * e_table * td_error
                e_table *= discount_factor * lambda2
                if done:
                    result.append(returns)
                    break
                state = next_state
                turn += 1
    return result
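# For comparison (not part of the original code): the tabular SARSA and
# Q-learning snippets above differ only in the bootstrap term of the TD target.
# SARSA bootstraps from the action actually chosen at the next state
# (on-policy); Q-learning bootstraps from the greedy action (off-policy).
import numpy as np


def sarsa_target(q_table, reward, next_state, next_action, gamma):
    return reward + gamma * q_table[next_state][next_action]


def q_learning_target(q_table, reward, next_state, gamma):
    return reward + gamma * np.max(q_table[next_state])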
def k_step_TD(env, policy, k, alpha, num_episodes):
    # size of v = (rawsum, number of distinct trumps, dealer's hand)
    v = np.zeros((61, 4, 10), dtype=float)
    for _ in tqdm(range(num_episodes)):
        states = []
        state = env.reset()
        state, reward, done = env.check_after_init()
        if done:
            # No actionable state encountered in this episode, so no update.
            continue
        states.append(copy(state))
        # Take k-1 steps to fill the window of states.
        for _ in range(k - 1):
            action = policy(state)
            state, reward, done = env.step(action)
            if done:
                break
            states.append(copy(state))
        if not done:
            assert len(states) == k, "number of states not correct"
            while True:
                action = policy(state)
                state, reward, done = env.step(action)
                if done:
                    break
                assert reward == 0, "reward is non-zero for intermediate states"
                # Update S_t, remove it from the states list and append S_{t+k}.
                initial_state = state_transformation(states[0])
                final_state = state_transformation(state)
                v[initial_state] += alpha * (reward + v[final_state] - v[initial_state])
                states = states[1:] + [copy(state)]
                assert states[-1] is not None, "states[-1] is None"
                for s in states:
                    assert s.category == "GENERAL", \
                        "states within an episode are not actionable"
        # Updating the value of the remaining states after reaching the end of
        # the episode; the last state is not actionable, so its value is zero.
        for s in states:
            initial_state = state_transformation(s)
            v[initial_state] += alpha * (reward - v[initial_state])
    return v
def run_tabular(self, discount_factor=0.9, alpha=0.05, lambda2=0.5,
                epsilon=0.05, state_trans=False, decay=True):
    # SARSA(lambda) update with an accumulating eligibility trace
    env = self.env
    policy = self.policy
    result = []
    q_table = np.zeros((self.state_n, self.action_n))
    e_table = np.zeros((self.state_n, self.action_n))
    policy = policy(q_table, epsilon, self.action_n, ifcontinue=False)
    for i_episode in range(self.num_e):
        returns = 0.000
        turn = 0
        state = env.reset()
        not_done = True
        if decay and i_episode >= 80:
            # Switch exploration off after 80 episodes.
            epsilon = 0
            policy = self.policy(q_table, epsilon, self.action_n, ifcontinue=False)
            decay = False
        x, y = state
        state2 = env.state_transform(x, y)
        action_p = policy(state2)
        action_index = np.random.choice(range(self.action_n), p=action_p)
        action = self.action_space[action_index]
        for turn in itertools.count():
            next_state, reward, done = env.step(action)
            returns = returns + discount_factor ** turn * reward
            x, y = next_state
            next_state2 = env.state_transform(x, y)
            x, y = state
            state2 = env.state_transform(x, y)
            # Choose the next action from the *next* state (on-policy SARSA).
            action_p = policy(next_state2)
            next_action_index = np.random.choice(range(self.action_n), p=action_p)
            next_action = self.action_space[next_action_index]
            # SARSA target.
            target = reward + discount_factor * q_table[next_state2][next_action_index]
            td_error = target - q_table[state2][action_index]
            # Accumulating eligibility-trace update.
            e_table[state2][action_index] = e_table[state2][action_index] + 1
            q_table += alpha * td_error * e_table
            e_table *= discount_factor * lambda2
            if done:
                result.append(returns)
                break
            state = next_state
            action_index = next_action_index
            action = next_action
    return result
""" Library of interesting policies policy(state) -> GameAD Note: make sure at least one legal move is suggested by the policy """ from policy import * from examples.mancala import GameState def random_policy(): """random moves""" @Policy def _random_policy(state: GameState): board_size = len(state) return [1 for _ in range(board_size)] return _random_policy
def generate_data(env, phi, policy, episode_limit=None, step_limit=None):
    """
    Generate data for the environment, given a policy, and a function
    approximator, up to the number of steps or episodes specified.
    At least one of `episode_limit` or `step_limit` has to be specified.

    Parameters
    ----------
    env : Environment
        An environment from which to generate the data.
    phi : function
        A function which maps observations to feature vectors.
    policy : function
        A policy function which maps observations to actions.
    episode_limit : int (optional)
        An integer which specifies how many episodes of data to generate.
    step_limit : int (optional)
        An integer which specifies how many steps of data to generate.

    Returns
    -------
    obs_lst : list of observations
    fvec_lst : list of feature vectors
    act_lst : list of actions
    reward_lst : list of rewards
    """
    if episode_limit is None:
        episode_limit = sys.maxsize
    if step_limit is None:
        step_limit = sys.maxsize
    # Check that at least one of `episode_limit` or `step_limit` has been given
    assert episode_limit < sys.maxsize or step_limit < sys.maxsize

    episode_count = 0
    step_count = 0

    # Set up the data containers
    obs_lst = []
    fvec_lst = []
    act_lst = []
    reward_lst = []

    while step_count < step_limit and episode_count < episode_limit:
        # Take a single step according to the policy
        obs = env.observe()
        fvec = phi(obs)
        act = policy(fvec)
        reward, obs_p = env.do(act)

        # Record a step of the episode
        obs_lst.append(obs)
        fvec_lst.append(fvec)
        act_lst.append(act)
        reward_lst.append(reward)

        if env.is_terminal():
            step_count += 1
            fvec = phi(obs_p)
            act = policy(fvec)
            reward = 0

            # Record terminal state data
            obs_lst.append(obs_p)
            fvec_lst.append(fvec)
            act_lst.append(act)
            reward_lst.append(reward)

            env.reset()
            episode_count += 1
        step_count += 1
    return obs_lst, fvec_lst, act_lst, reward_lst