def train(self, nb_episode, nb_simulation):
    s = environment.State()
    likelihood_list = np.zeros(nb_episode)
    reward_list = np.zeros(nb_episode)
    if self.model is not None:
        self.model.reset_observation()

    for episode in range(nb_episode):
        old_s1, old_s2 = self.env.reset()
        nb_step = 0
        # old_s1, old_s2 = 0, 0
        done = False
        while not done:
            s, reward, done = self._learn(old_s1, old_s2)
            # print(f'{old_s1}, {old_s2}, {s.s1}, {s.s2}')

            # simulation
            if self.model is not None:
                for _ in range(nb_simulation):
                    self.simulate()
                real_likelihood = self.env.get_likelihood(
                    s.old_s1, s.old_s2, s.s1, s.s2, s.a)
                l = self.model.likelihood.get_likelihood(s)
                # likelihood_list[episode] += np.abs(real_likelihood - l.detach().numpy())
                likelihood_list[episode] += l.detach().numpy()

            reward_list[episode] += reward
            old_s1, old_s2 = s.s1, s.s2
            nb_step += 1
        likelihood_list[episode] /= nb_step

    return likelihood_list, reward_list
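# Hedged usage sketch (not from the original source): a minimal driver for the
# train() method above. The `agent` object and the helper name below are
# illustrative assumptions; only train()'s signature and its return values
# (per-episode mean likelihood and cumulative reward) come from the code above.
import matplotlib.pyplot as plt

def run_training_demo(agent, nb_episode=500, nb_simulation=5):
    # train() returns one averaged likelihood value and one episode return per episode
    likelihood_list, reward_list = agent.train(nb_episode, nb_simulation)

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
    ax1.plot(likelihood_list)
    ax1.set_xlabel('episode')
    ax1.set_ylabel('mean model log-likelihood')
    ax2.plot(reward_list)
    ax2.set_xlabel('episode')
    ax2.set_ylabel('episode return')
    plt.tight_layout()
    plt.show()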
def monte_carlo(iterations=1000000, policy=policies.epsilon_greedy, n_zero=100):
    """
    Performs Monte Carlo control in the Easy21 game.

    :param iterations: number of monte carlo iterations
    :param policy: exploration strategy to use
    :param n_zero: epsilon greedy constant (only applicable if epsilon greedy policy is used)
    :return: value function and the plot of the optimal value function
    """
    # (player, dealer, action) key
    value_function = defaultdict(float)
    # (player, dealer) key
    counter_state = defaultdict(int)
    # (player, dealer, action) key
    counter_state_action = defaultdict(int)
    # number of wins
    wins = 0

    print('Iterations completed:')
    for i in range(iterations):
        if (i % 500000) == 0:
            print(i)

        # create a new random starting state
        state = environment.State()

        # play a round
        observed_keys = []
        while not state.terminal:
            player = state.player_sum
            dealer = state.dealer_first_card

            # find an action defined by the policy
            epsilon = n_zero / float(n_zero + counter_state[(player, dealer)])
            action = policy(epsilon, value_function, state)
            observed_keys.append((player, dealer, action))

            # take a step
            [state, reward] = environment.step(state, action)

            # we have reached an end of episode
            if reward is not None:
                # update over all keys
                for key in observed_keys:
                    # update counts
                    counter_state[key[:-1]] += 1
                    counter_state_action[key] += 1

                    # update value function
                    alpha = 1.0 / counter_state_action[key]
                    value_function[key] += alpha * (reward - value_function[key])

                if reward == 1:
                    wins += 1

    print('Wins: %.4f%%' % ((float(wins) / iterations) * 100))

    # plot the optimal value function
    plotting.plot_value_function(value_function)

    return value_function
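# Hedged usage sketch (not part of the original source): the lambda-based methods
# further below load "Data/MC_value_function.pickle" as their ground-truth
# baseline, so one plausible workflow is to run Monte Carlo control once and
# cache the result. The __main__ guard and file handling here are illustrative
# assumptions.
if __name__ == '__main__':
    import os
    import pickle

    mc_value_function = monte_carlo(iterations=1000000)
    os.makedirs('Data', exist_ok=True)
    with open('Data/MC_value_function.pickle', 'wb') as f:
        pickle.dump(mc_value_function, f)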
def train_with_buffer(env, likelihood, nb_episode):
    s = environment.State()
    nb_total_step = nb_episode * env.max_step
    buffer = []
    l_a2b = np.zeros(nb_total_step)
    l_b2a = np.zeros(nb_total_step)

    # fill buffer
    for episode in range(nb_episode):
        done = False
        old_s1, old_s2 = env.reset()
        while not done:
            a = env.sample_action_uniformly()
            (s1, s2), reward, done, _ = env.step(a, old_s1, old_s2)
            s.set_state(old_s1, old_s2, s1, s2, a)
            buffer.append(copy.copy(s))
            old_s1, old_s2 = s1, s2
    shuffle(buffer)

    # train
    for i in range(nb_total_step):
        s = buffer[i]
        l1, l2 = likelihood.update(s)
        real_likelihood = env.get_likelihood(s.old_s1, s.old_s2, s.s1, s.s2, s.a)
        l_a2b[i] = np.abs(real_likelihood - l1.detach().numpy())
        l_b2a[i] = np.abs(real_likelihood - l2.detach().numpy())
        # l_a2b[i] = l1.detach().numpy()
        # l_b2a[i] = l2.detach().numpy()
    return l_a2b, l_b2a
def calc_features_matrix():
    # dealer card - 1, player's sum - 1, features 3x6
    all_features = np.zeros((10, 21, 3, 6))
    for i in range(10):
        for j in range(21):
            state = environment.State(i + 1, j + 1)
            all_features[i, j, :, :] = state.get_features()
    return all_features
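# Hedged reference sketch (an assumption, not taken from the original source):
# the 3x6 feature block per state is consistent with the common Easy21 coarse
# coding, where each binary feature is an overlapping (dealer interval, player
# interval) cuboid. The intervals below are the standard assignment choice and
# may differ from what State.get_features() actually implements.
import numpy as np

DEALER_INTERVALS = [(1, 4), (4, 7), (7, 10)]                                 # 3 overlapping ranges
PLAYER_INTERVALS = [(1, 6), (4, 9), (7, 12), (10, 15), (13, 18), (16, 21)]   # 6 overlapping ranges

def coarse_features(dealer_card, player_sum):
    # returns a 3x6 0/1 matrix: feature (i, j) is active when the dealer card
    # lies in DEALER_INTERVALS[i] and the player sum lies in PLAYER_INTERVALS[j]
    features = np.zeros((3, 6))
    for i, (d_lo, d_hi) in enumerate(DEALER_INTERVALS):
        for j, (p_lo, p_hi) in enumerate(PLAYER_INTERVALS):
            if d_lo <= dealer_card <= d_hi and p_lo <= player_sum <= p_hi:
                features[i, j] = 1.0
    return features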
def _learn(self, old_s1, old_s2):
    s = environment.State()
    a = self.q_learning.sample_action(old_s1, old_s2)
    (s1, s2), reward, done, _ = self.env.step(a, old_s1, old_s2)
    s.set_state(old_s1, old_s2, s1, s2, a)
    self.q_learning.update(s, reward)
    self.model.update(s, reward)
    return s, reward, done
def simulate(self):
    s = environment.State()
    old_s1, old_s2, a = self.model.sample_observation()
    s1, s2, reward, confidence_level = self.model.simulate(
        old_s1, old_s2, a, self.env)  # TODO: remove self.env
    s.set_state(old_s1, old_s2, s1, s2, a)
    if confidence_level > self.confidence_threshold:
        self.q_learning.update(s, reward)
def q_test() -> bool:
    environment_ = environment.Environment(grid_=data.GRID_1, rng=rng)
    q = StateActionFunction(environment_)
    state_ = environment.State(common.XY(x=4, y=2))
    action_ = environment.Action(common.XY(x=1, y=0))
    print(q[state_, action_])
    q[state_, action_] = 2.0
    q[state_, action_] += 0.5
    print(q[state_, action_])
    return True
def plot_q(self):
    (x, y, z) = ([], [], [])
    for d in range(1, 10):
        for p in range(1, 21):
            state = environment.State(
                environment.Card(environment.COLOR_BLACK, d), p)
            x.append(float(d))
            y.append(float(p))
            value = max(
                self.evaluate_model(state, environment.ACTION_HIT),
                self.evaluate_model(state, environment.ACTION_STICK))
            z.append(value)
    fig = plt.figure()
    # fig.gca(projection='3d') is no longer supported by recent matplotlib
    ax = fig.add_subplot(projection='3d')
    ax.scatter(np.array(x), np.array(y), np.array(z), linewidth=1, antialiased=False)
    plt.show()
def environment_test() -> bool:
    environment_ = environment.Environment(grid_=data.GRID_1, rng=rng)
    for state_ in environment_.states():
        print(state_)
    print()
    for action_ in environment_.actions():
        print(action_)
    print()
    state_ = environment.State(common.XY(x=4, y=2))
    action_ = environment.Action(common.XY(x=1, y=0))
    response_ = environment_.from_state_perform_action(state_, action_)
    print(state_, action_)
    print(response_)
    return True
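# Hedged usage note (illustrative assumption): the *_test() helpers above only
# print their results and return True, so a minimal ad-hoc runner could look like
# the sketch below.
if __name__ == '__main__':
    assert environment_test()
    assert q_test()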
def train(env, likelihood, nb_episode):
    s = environment.State()
    l_a2b = np.zeros(nb_episode)
    l_b2a = np.zeros(nb_episode)
    for episode in range(nb_episode):
        done = False
        old_s1, old_s2 = env.reset()
        while not done:
            a = env.sample_action_uniformly()
            (s1, s2), reward, done, _ = env.step(a, old_s1, old_s2)
            s.set_state(old_s1, old_s2, s1, s2, a)
            l1, l2 = likelihood.update(s)
            real_likelihood = env.get_likelihood(old_s1, old_s2, s1, s2, a)
            l_a2b[episode] += np.abs(real_likelihood - l1.detach().numpy())
            l_b2a[episode] += np.abs(real_likelihood - l2.detach().numpy())
            old_s1, old_s2 = s1, s2
        l_a2b[episode] /= 100
        l_b2a[episode] /= 100
    return l_a2b, l_b2a
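# Hedged usage sketch (illustrative, not from the original source): a plausible
# driver that trains the likelihood model online with train() above and plots the
# absolute error of the two candidate directions it returns. `env` and
# `likelihood` are expected to be instances from the accompanying environment and
# likelihood modules; the helper name is an assumption.
import matplotlib.pyplot as plt

def compare_directions(env, likelihood, nb_episode=1000):
    l_a2b, l_b2a = train(env, likelihood, nb_episode)
    plt.plot(l_a2b, label='|true - estimated| (a -> b)')
    plt.plot(l_b2a, label='|true - estimated| (b -> a)')
    plt.xlabel('episode')
    plt.ylabel('mean absolute likelihood error')
    plt.legend()
    plt.show()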
def train_with_buffer(self, nb_episode, nb_simulation, buffer_size=None):
    s = environment.State()
    if self.model is not None:
        self.model.reset_observation()
    if buffer_size is None:
        buffer_size = int(nb_episode / 10)
    print(f'buffer_size: {buffer_size}')
    buffer = self._fill_buffer(buffer_size)
    likelihood_list = np.zeros(nb_episode)
    reward_list = np.zeros(nb_episode)
    episode = 0

    for _ in range(10):
        for i in range(buffer_size):
            s = buffer[i]
            # occasionally restart from a fresh environment state, otherwise
            # continue from the buffered transition's starting state
            if np.random.randint(10) < 1:
                old_s1, old_s2 = self.env.reset()
            else:
                old_s1, old_s2 = s.old_s1, s.old_s2
            s, reward, done = self._learn(old_s1, old_s2)
            buffer[i] = s

            if self.model is not None:
                for _ in range(nb_simulation):
                    self.simulate()
                real_likelihood = self.env.get_likelihood(
                    s.old_s1, s.old_s2, s.s1, s.s2, s.a)
                l = self.model.likelihood.get_likelihood(s)
                likelihood_list[episode] = np.abs(real_likelihood - l.detach().numpy())

            reward_list[episode] = reward
            episode += 1
        shuffle(buffer)

    return likelihood_list, reward_list
def simulate(self, old_s1, old_s2, a, env):
    self.simulation_total += 1
    prob = np.zeros((self.state_dim * self.state_dim))
    state = environment.State()  # TODO: keep values for same step...
    for s1 in range(self.state_dim):
        for s2 in range(self.state_dim):
            state.set_state(old_s1, old_s2, s1, s2, a)
            l = self.likelihood.get_likelihood(state).detach().numpy()
            prob[s1 + s2 * self.state_dim] = np.exp(l)
    if np.sum(prob) != 1:
        prob = prob / np.sum(prob)
    s = np.random.choice(np.arange(prob.shape[0]), p=prob)
    s1 = s % self.state_dim
    s2 = s // self.state_dim
    confidence_level = np.amax(prob)
    if confidence_level > 0.5:
        print(f'Prob max:{np.amax(prob)}')
        print(f'simulation result: ({s1}, {s2})')
        print(env.step(a, old_s1, old_s2))
        # __import__('ipdb').set_trace()
    reward = self.sample_reward(s1, s2, a)
    return s1, s2, reward, confidence_level
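# Side note (an assumption about intent, not part of the original class): the
# loop above exponentiates per-state log-likelihoods and renormalises them, i.e.
# it builds a categorical distribution proportional to exp(log p). A numerically
# safer equivalent uses the usual log-sum-exp shift, sketched below for
# illustration only.
import numpy as np

def log_likelihoods_to_probs(log_liks):
    # subtracting the maximum before exponentiating avoids overflow/underflow
    # and leaves the normalised probabilities unchanged
    shifted = log_liks - np.max(log_liks)
    weights = np.exp(shifted)
    return weights / np.sum(weights)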
def sarsa_episode(lam):
    # reset all eligibility traces
    state_info[:, :, e_hit_index] = 0
    state_info[:, :, e_stick_index] = 0

    # initialize the state S
    dealer_card = random.randint(1, 10)
    player_card = random.randint(1, 10)
    state = environment.State(dealer_card, player_card)
    features = state.get_features()

    # initialize the action A
    action = environment.Action.HIT
    if random.random() < 0.5:
        action = environment.Action.STICK

    # run one episode
    while not state.terminated:
        # take the action A
        state_new = environment.step(state, action)
        reward = state_new.reward
        features_new = state_new.get_features()

        # pick the next action A' by using epsilon greedy
        action_new = None
        if state_new.terminated:
            action_new = environment.Action.NONE
        else:
            if random.random() < epsilon:
                # exploration, pick a random action
                if random.random() < 0.5:
                    action_new = environment.Action.HIT
                else:
                    action_new = environment.Action.STICK
            else:
                # pick the action greedily (largest action value)
                v_hit = np.sum(
                    np.multiply(features_new, state_info[:, :, q_hit_index]))
                v_stick = np.sum(
                    np.multiply(features_new, state_info[:, :, q_stick_index]))
                if v_hit > v_stick:
                    action_new = environment.Action.HIT
                else:
                    action_new = environment.Action.STICK

        # calculate delta
        if action == environment.Action.HIT:
            q_value = np.sum(
                np.multiply(features, state_info[:, :, q_hit_index]))
        else:
            q_value = np.sum(
                np.multiply(features, state_info[:, :, q_stick_index]))

        if state_new.terminated:
            q_value_new = 0
        else:
            if action_new == environment.Action.HIT:
                q_value_new = np.sum(
                    np.multiply(features_new, state_info[:, :, q_hit_index]))
            else:
                q_value_new = np.sum(
                    np.multiply(features_new, state_info[:, :, q_stick_index]))

        delta = reward + q_value_new - q_value

        # increment eligibility trace
        if action == environment.Action.HIT:
            state_info[:, :, e_hit_index] += features
        else:
            state_info[:, :, e_stick_index] += features

        # update all values
        state_info[:, :, q_hit_index] += alpha * delta * state_info[:, :, e_hit_index]
        state_info[:, :, q_stick_index] += alpha * delta * state_info[:, :, e_stick_index]

        # update all eligibility traces
        state_info[:, :, e_hit_index] = lam * state_info[:, :, e_hit_index]
        state_info[:, :, e_stick_index] = lam * state_info[:, :, e_stick_index]

        # end this step
        state = state_new
        action = action_new
        features = features_new
def mc_episode():
    states = []   # holds all states of one episode
    actions = []  # holds all actions of one episode

    # create the initial state
    dealer_card = random.randint(1, 10)
    player_card = random.randint(1, 10)
    state = environment.State(dealer_card, player_card)

    # play one episode
    while not state.terminated:
        states.append(state)

        # define the indices for the state matrix
        dealer_state_index = state.dealer_card - 1
        player_state_index = state.player_sum - 1

        # pick the action
        epsilon = n0 / (
            n0 + state_info[dealer_state_index, player_state_index, ns_index])
        if random.random() < epsilon:
            # exploration, pick a random action
            if random.random() < 0.5:
                action = environment.Action.HIT
            else:
                action = environment.Action.STICK
        else:
            # pick the action greedily (largest action value)
            if state_info[dealer_state_index, player_state_index,
                          q_hit_index] > state_info[dealer_state_index,
                                                    player_state_index,
                                                    q_stick_index]:
                action = environment.Action.HIT
            else:
                action = environment.Action.STICK

        # increment the counts
        state_info[dealer_state_index, player_state_index, ns_index] += 1
        if action == environment.Action.HIT:
            state_info[dealer_state_index, player_state_index, ns_hit_index] += 1
        if action == environment.Action.STICK:
            state_info[dealer_state_index, player_state_index, ns_stick_index] += 1

        # get a new state
        actions.append(action)
        state = environment.step(state, action)

    # update the action values
    for i in range(0, len(states)):
        s = states[i]
        a = actions[i]
        tot_reward = state.reward

        if not s.is_busted:
            dealer_state_index = s.dealer_card - 1
            player_state_index = s.player_sum - 1

            if a == environment.Action.HIT:
                alpha = 1 / state_info[dealer_state_index, player_state_index, ns_hit_index]
                value = state_info[dealer_state_index, player_state_index, q_hit_index]
                state_info[dealer_state_index, player_state_index,
                           q_hit_index] += alpha * (tot_reward - value)
            else:
                alpha = 1 / state_info[dealer_state_index, player_state_index, ns_stick_index]
                value = state_info[dealer_state_index, player_state_index, q_stick_index]
                state_info[dealer_state_index, player_state_index,
                           q_stick_index] += alpha * (tot_reward - value)
def linear_function_approximation(l=0.9, max_episodes=1000, policy=policies.epsilon_greedy_lfa,
                                  n_zero=100, gamma=1, plot_learning_curve=True, multiproc=True):
    """
    Value function approximation using coarse coding

    :param l: lambda parameter
    :param gamma: discounting rate
    :param max_episodes: stop learning after this many episodes
    :param policy: exploration strategy to use
    :param n_zero: epsilon greedy constant (only applicable if epsilon greedy policy is used)
    :param plot_learning_curve: whether to turn on plotting of learning curve for lambda = 0 and 1
    :param multiproc: whether to use multiprocessing when doing plots or not (warning! turn off if
        running multiple algorithms on mac or windows simultaneously)
    :return: value function after max_episodes
    """
    # weights vector for the state_action feature vector
    theta = np.random.random(36) * 0.2
    # random move probability
    epsilon = 0.05
    # step-size parameter
    alpha = 0.01

    # learning curve plotting
    if l in {0, 1} and plot_learning_curve:
        learning_curve = []
        try:
            mc_values = pickle.load(open("Data/MC_value_function.pickle", "rb"))
        except Exception:
            mc_values = monte_carlo(iterations=1000000)

    for episode in range(max_episodes):
        # key is state_action feature vector
        eligibility_trace = np.zeros(36)

        # initial state, action [SA..], and set of features
        state = environment.State()

        # calculate features for the given state
        state_features_current = utilities.get_state_features(state)

        # get action from this state
        q_a_current, action_current = policy(epsilon, theta, state_features_current)

        # calculate final state, action feature vector
        features_current = utilities.get_state_action_features(
            state_features_current, action_current)

        while not state.terminal:
            # update eligibility trace (accumulating)
            eligibility_trace = np.add(eligibility_trace, features_current)

            # take a step, get reward [..R..]
            [state, reward] = environment.step(state, action_current)
            if reward is None:
                reward = 0

            # follow up state, action [..SA]
            state_features_next = utilities.get_state_features(state)
            q_a_next, action_next = policy(epsilon, theta, state_features_next)
            features_next = utilities.get_state_action_features(
                state_features_next, action_next)

            # calculate state value difference
            delta = reward + gamma * q_a_next - q_a_current

            # update weights
            theta = np.add(theta, alpha * delta * eligibility_trace)

            # update trace
            eligibility_trace *= gamma * l

            # carry S, A (and Q(S, A)) over to the next step
            features_current = features_next
            action_current = action_next
            q_a_current = q_a_next

        # calculate value function
        value_function = defaultdict(float)
        for player in range(1, 22):
            for dealer in range(1, 11):
                for action in [0, 1]:
                    s = environment.State(dealer, player)
                    phi = utilities.get_state_action_features(
                        utilities.get_state_features(s), action)
                    value_function[(s.player_sum, s.dealer_first_card, action)] = phi.dot(theta)

        # get the episode MSE for plotting learning curve
        if l in {0, 1} and plot_learning_curve:
            learning_curve.append(
                (episode, utilities.calculate_mse(mc_values, value_function)))

    # plot learning curves
    if l in {0, 1} and plot_learning_curve:
        if multiproc:
            # create a new process so computation can continue after plotting
            p = Process(target=plotting.plot_learning_curve, args=(learning_curve, l,))
            p.start()
        else:
            plotting.plot_learning_curve(learning_curve, l)

    return value_function
def sarsa_lambda(l=0.9, max_episodes=1000, policy=policies.epsilon_greedy, n_zero=100, gamma=1,
                 plot_learning_curve=True, multiproc=True):
    """
    Applies eligibility trace version of Sarsa to the game Easy21

    :param l: lambda parameter
    :param max_episodes: stop learning after this many episodes
    :param policy: exploration strategy to use
    :param n_zero: epsilon greedy constant (only applicable if epsilon greedy policy is used)
    :param gamma: discounting rate
    :param plot_learning_curve: whether to turn on plotting of learning curve for lambda = 0 and 1
    :param multiproc: whether to use multiprocessing when doing plots or not (warning! turn off if
        running multiple algorithms on mac or windows simultaneously)
    :return: value function after max_episodes
    """
    # (player, dealer, action) key
    value_function = defaultdict(float)
    # (player, dealer) key
    counter_state = defaultdict(int)
    # (player, dealer, action) key
    counter_state_action = defaultdict(int)
    # no. of wins to calculate the percentage of wins at the end
    wins = 0

    # learning curve plotting
    if l in {0, 1} and plot_learning_curve:
        learning_curve = []
        try:
            mc_values = pickle.load(open("Data/MC_value_function.pickle", "rb"))
        except Exception:
            mc_values = monte_carlo(iterations=1000000)

    for episode in range(max_episodes):
        # current (player, dealer, action)
        eligibility_trace = defaultdict(float)

        # initial state, action [SA..]
        state = environment.State()
        player_current = state.player_sum
        dealer_current = state.dealer_first_card
        epsilon = n_zero / float(n_zero + counter_state[(player_current, dealer_current)])
        action_current = policy(epsilon, value_function, state)

        while not state.terminal:
            # update counts
            counter_state[(player_current, dealer_current)] += 1
            counter_state_action[(player_current, dealer_current, action_current)] += 1

            # take a step, get reward [..R..]
            [state, reward] = environment.step(state, action_current)
            if reward is None:
                reward = 0

            # follow up state, action [..SA]
            player_next = state.player_sum
            dealer_next = state.dealer_first_card
            epsilon = n_zero / float(n_zero + counter_state[(player_next, dealer_next)])
            action_next = policy(epsilon, value_function, state)

            delta = reward + gamma * value_function[(player_next, dealer_next, action_next)] - \
                value_function[(player_current, dealer_current, action_current)]
            alpha = 1.0 / counter_state_action[(player_current, dealer_current, action_current)]
            eligibility_trace[(player_current, dealer_current, action_current)] += 1

            # update the values
            for key in value_function:
                value_function[key] += alpha * delta * eligibility_trace[key]
                eligibility_trace[key] *= gamma * l

            player_current = player_next
            dealer_current = dealer_next
            action_current = action_next

        # use it later to calculate the percentage of wins
        if reward == 1:
            wins += 1

        # get the episode MSE for plotting learning curve
        if l in {0, 1} and plot_learning_curve:
            learning_curve.append(
                (episode, utilities.calculate_mse(mc_values, value_function)))

    # plot learning curve
    if l in {0, 1} and plot_learning_curve:
        if multiproc:
            # create a new process so computation can continue after plotting
            p = Process(target=plotting.plot_learning_curve, args=(learning_curve, l,))
            p.start()
        else:
            plotting.plot_learning_curve(learning_curve, l)

    # get the percentage of wins
    print(float(wins) / max_episodes)

    return value_function
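# Hedged usage sketch (not from the original source): a typical experiment with
# sarsa_lambda() sweeps lambda from 0 to 1 and compares each resulting value
# function against the cached Monte Carlo baseline. It reuses
# utilities.calculate_mse and the pickle path already referenced above; the
# helper name and sweep granularity are illustrative assumptions.
import numpy as np

def lambda_sweep(max_episodes=1000):
    mc_values = pickle.load(open("Data/MC_value_function.pickle", "rb"))
    mse_per_lambda = []
    for lam in np.arange(0.0, 1.1, 0.1):
        vf = sarsa_lambda(l=lam, max_episodes=max_episodes, plot_learning_curve=False)
        mse_per_lambda.append((lam, utilities.calculate_mse(mc_values, vf)))
    return mse_per_lambda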
def sarsa_episode(lam):
    """
    executes one sarsa episode
    :param lam: the lambda parameter
    :return:
    """
    # reset all eligibility traces
    state_info[:, :, e_hit_index] = 0
    state_info[:, :, e_stick_index] = 0

    # initialize the state S
    dealer_card = random.randint(1, 10)
    player_card = random.randint(1, 10)
    state = environment.State(dealer_card, player_card)

    # initialize the action A
    action = environment.Action.HIT
    if random.random() < 0.5:
        action = environment.Action.STICK

    # run one episode
    while not state.terminated:
        # define the starting state indices for the state matrix
        dealer_state_index = state.dealer_card - 1
        player_state_index = state.player_sum - 1

        # take the action A
        state_new = environment.step(state, action)
        reward = state_new.reward

        # define the indices of the new state
        dealer_state_index_new = state_new.dealer_card - 1
        player_state_index_new = state_new.player_sum - 1

        # pick the next action A' by using epsilon greedy
        if state_new.terminated:
            action_new = environment.Action.NONE
        else:
            epsilon = n0 / (n0 + state_info[dealer_state_index_new,
                                            player_state_index_new, ns_index])
            if random.random() < epsilon:
                # exploration, pick a random action
                if random.random() < 0.5:
                    action_new = environment.Action.HIT
                else:
                    action_new = environment.Action.STICK
            else:
                # pick the action greedily (largest action value)
                if state_info[dealer_state_index_new, player_state_index_new,
                              q_hit_index] > state_info[dealer_state_index_new,
                                                        player_state_index_new,
                                                        q_stick_index]:
                    action_new = environment.Action.HIT
                else:
                    action_new = environment.Action.STICK

        # increment the counts
        state_info[dealer_state_index, player_state_index, ns_index] += 1
        if action == environment.Action.HIT:
            state_info[dealer_state_index, player_state_index, ns_hit_index] += 1
        if action == environment.Action.STICK:
            state_info[dealer_state_index, player_state_index, ns_stick_index] += 1

        # calculate delta
        if action == environment.Action.HIT:
            q_value = state_info[dealer_state_index, player_state_index, q_hit_index]
        else:
            q_value = state_info[dealer_state_index, player_state_index, q_stick_index]

        if state_new.terminated:
            q_value_new = 0
        else:
            if action_new == environment.Action.HIT:
                q_value_new = state_info[dealer_state_index_new,
                                         player_state_index_new, q_hit_index]
            else:
                q_value_new = state_info[dealer_state_index_new,
                                         player_state_index_new, q_stick_index]

        delta = reward + q_value_new - q_value

        # increment eligibility trace
        if action == environment.Action.HIT:
            alpha = 1 / state_info[dealer_state_index, player_state_index, ns_hit_index]
            state_info[dealer_state_index, player_state_index, e_hit_index] += 1
        else:
            alpha = 1 / state_info[dealer_state_index, player_state_index, ns_stick_index]
            state_info[dealer_state_index, player_state_index, e_stick_index] += 1

        # update all values
        state_info[:, :, q_hit_index] += alpha * delta * state_info[:, :, e_hit_index]
        state_info[:, :, q_stick_index] += alpha * delta * state_info[:, :, e_stick_index]

        # update all eligibility traces
        state_info[:, :, e_hit_index] = lam * state_info[:, :, e_hit_index]
        state_info[:, :, e_stick_index] = lam * state_info[:, :, e_stick_index]

        # end this step
        state = state_new
        action = action_new