def main():
    parameter = get_args()
    agent, environment = build_objects()
    # At first, the agent is exploring
    agent.exploring = True
    # Executes the number of training steps specified in the -t parameter
    for step in range(parameter.training_steps):
        # The first step is to define the current state
        state = environment.get_state()
        # The agent selects the action according to the state
        action = agent.select_action(state)
        # The state transition is processed
        statePrime, action, reward = environment.step(action)
        # The agent Q-update is performed
        agent.observe_reward(state, action, statePrime, reward)
        print("***Training step " + str(step + 1) + " Completed")
    # Now that training has finished, the agent can use its policy without updating it
    agent.exploring = False
    # Executes the number of evaluation steps specified in the -e parameter
    for step in range(parameter.evaluation_steps):
        # Mostly the same as training, but without observing the rewards
        # The first step is to define the current state
        state = environment.get_state()
        # The agent selects the action according to the state
        action = agent.select_action(state)
        # The state transition is processed
        environment.step(action)
        print("***Evaluation step " + str(step + 1) + " Completed")

def monte_carlo(iterations=1000000, policy=policies.epsilon_greedy, n_zero=100):
    """
    Performs Monte Carlo control in the Easy21 game.

    :param iterations: number of monte carlo iterations
    :param policy: exploration strategy to use
    :param n_zero: epsilon greedy constant (only applicable if epsilon greedy policy is used)
    :return: value function and the plot of the optimal value function
    """
    # (player, dealer, action) key
    value_function = defaultdict(float)
    # (player, dealer) key
    counter_state = defaultdict(int)
    # (player, dealer, action) key
    counter_state_action = defaultdict(int)
    # number of wins
    wins = 0

    print('Iterations completed:')
    for i in xrange(iterations):
        if (i % 500000) == 0:
            print(i)

        # create a new random starting state
        state = environment.State()
        # play a round
        observed_keys = []
        while not state.terminal:
            player = state.player_sum
            dealer = state.dealer_first_card

            # find an action defined by the policy
            epsilon = n_zero / float(n_zero + counter_state[(player, dealer)])
            action = policy(epsilon, value_function, state)
            observed_keys.append((player, dealer, action))

            # take a step
            [state, reward] = environment.step(state, action)

            # we have reached an end of episode
            if reward is not None:
                # update over all keys
                for key in observed_keys:
                    # update counts
                    counter_state[key[:-1]] += 1
                    counter_state_action[key] += 1
                    # update value function
                    alpha = 1.0 / counter_state_action[key]
                    value_function[key] += alpha * (reward - value_function[key])
                if reward == 1:
                    wins += 1

    print('Wins: %.4f%%' % ((float(wins) / iterations) * 100))
    # plot the optimal value function
    plotting.plot_value_function(value_function)
    return value_function

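The Sarsa(lambda) and function-approximation snippets further down reload a cached MC value function from Data/MC_value_function.pickle and fall back to monte_carlo(iterations=1000000) when it is missing. A minimal driver sketch that produces that cache, assuming the function above lives in a module here called mc_control (the module name is an assumption):

import pickle

# Hypothetical module name; adjust the import to the actual project layout.
from mc_control import monte_carlo

if __name__ == '__main__':
    # Learn Q(player, dealer, action) with MC control and cache it so the
    # TD(lambda) experiments can compute their MSE learning curves against it.
    value_function = monte_carlo(iterations=1000000)
    with open('Data/MC_value_function.pickle', 'wb') as f:
        pickle.dump(value_function, f)
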
def step():
    # what happens in each cycle. Main calls happen here.
    global count_steps
    count_steps += 1
    print "count_steps " + str(count_steps)
    nodes = environment.step()
    wakeup_nodes(nodes)  # calls their behaviour, which calls the policy (what to do), and finally applies actions
    print "killed edges: " + str(stats.collector.num_kills)
    stats.snapshot()
    print "total graph weight:" + str(stats.collector.total_weight)
    stats.new_collector()

def watch_play(self):
    done = False
    board = env.reset()
    while not done:
        # finds the best action
        action = env.process_state(board)
        self.drop_piece(action, board)
        board, done = env.step(board, *action)
        self.board = board.area
        self.update()
        self.root.update()
        time.sleep(self.speed)

def mc_control(num_episodes=10000):
    q_sa = {}
    p = {}
    n_s = {}
    n_sa = {}
    n0 = 100

    for _ in range(num_episodes):
        state = State()
        reward = 0
        episode_s = []
        episode_sa = []

        while not state.terminal:
            s = state.as_tuple()
            if s in p:
                a = sample_action(p[s])
            else:
                a = Action.random()

            episode_s.append(s)
            episode_sa.append(s + (a, ))

            state, reward = step(state, a)

            ns = n_s.get(s, 0)
            n_s[s] = ns + 1
            sa = s + (a, )
            nsa = n_sa.get(sa, 0)
            n_sa[sa] = nsa + 1

        # GLIE MC Control
        for sa in set(episode_sa):
            nsa = n_sa[sa]
            qsa = q_sa.get(sa, 0)
            q_sa[sa] = qsa + ((reward - qsa) / nsa)

        # Improve policy
        for s in set(episode_s):
            a_best = greedy_action(q_sa, s)
            ns = n_s.get(s, 0)
            epsilon = n0 / (n0 + ns)
            selection_probs = []
            for a in list(Action):
                if a is a_best:
                    selection_probs.append(1 - epsilon + epsilon / len(Action))
                else:
                    selection_probs.append(epsilon / len(Action))
            p[s] = selection_probs

    return q_sa

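sample_action and greedy_action are used above but not shown in the excerpt. A plausible sketch of the two helpers, assuming Action is an enum and that p[s] stores probabilities in list(Action) order (both are assumptions; the project's own helpers may differ):

import random

def sample_action(selection_probs):
    # Draw an action according to the per-action probabilities stored by the
    # policy-improvement step (assumed to be ordered like list(Action)).
    return random.choices(list(Action), weights=selection_probs, k=1)[0]

def greedy_action(q_sa, s):
    # Return the action with the highest estimated value in state s,
    # treating unseen state-action pairs as 0.
    return max(list(Action), key=lambda a: q_sa.get(s + (a,), 0))
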
def iteration():
    # what happens in each cycle. Main calls happen here.
    print "iteration!"
    global count_iterations
    print "count_iterations " + str(count_iterations)
    nodes = environment.step()
    wakeup_nodes(nodes)  # calls their behaviour, which calls the policy (what to do), and finally applies actions
    print "killed edges: " + str(stats.collector.num_kills)
    stats.snapshot()
    print "total graph weight:" + str(stats.collector.total_weight)
    print "average path length, see note1:" + str(stats.collector.path_length)
    print "mean edge importance:" + str(stats.collector.mean_edges_importance)
    print "std edge importance:" + str(stats.collector.std_edges_importance)
    count_iterations += 1
    stats.new_collector()

def get_sample(env, q, epsilon):
    s = env["start_state"]
    h_s = [s]
    h_a = []
    h_r = []
    over = False
    step = 0
    max_step = env["n_row"] * env["n_col"] * 3
    while not over and step < max_step:
        step += 1
        a = epsilon_greedy_select(q[s], epsilon)
        s, r, over = environment.step(env, s, a)
        h_a.append(a)
        h_r.append(r)
        h_s.append(s)
    if step == max_step:
        h_r[len(h_r) - 1] = -1000
    return h_s, h_a, h_r

def sarsa_lambda(num_episodes=1000, lamba=0, gamma=1, yield_progress=False):
    q_sa = {}
    n_s = {}
    n_sa = {}

    for n in range(num_episodes):
        e_sa = {}
        state = State()
        s = state.as_tuple()
        a = epsilon_greedy_action(q_sa, s, calculate_epsilon(n_s, s))

        while not state.terminal:
            state, reward = step(state, a)
            n_s[s] = n_s.get(s, 0) + 1

            s_next = state.as_tuple()
            a_next = epsilon_greedy_action(q_sa, s_next, calculate_epsilon(n_s, s_next))

            sa = s + (a, )
            sa_next = s_next + (a_next, )
            qsa = q_sa.get(sa, 0)
            qsa_next = q_sa.get(sa_next, 0)

            nsa = n_sa.get(sa, 0) + 1
            n_sa[sa] = nsa

            delta = reward + gamma * qsa_next - qsa
            e_sa[sa] = e_sa.get(sa, 0) + 1

            for (s, a) in generate_all_state_action_pairs():
                sa = s + (a, )
                q_sa[sa] = q_sa.get(sa, 0) + (delta * e_sa.get(sa, 0)) / nsa
                e_sa[sa] = gamma * lamba * e_sa.get(sa, 0)

            s = s_next
            a = a_next

        if yield_progress:
            yield n + 1, q_sa

    if not yield_progress:
        yield num_episodes, q_sa

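This variant is written as a generator: it yields (episode_number, q_sa) after every episode when yield_progress is set, or once at the end otherwise. A minimal usage sketch, assuming the function and its State/step helpers are importable from a module here called sarsa (an assumed name; note the lambda parameter really is spelled lamba in the source):

# Hypothetical import path; adjust to the actual project layout.
from sarsa import sarsa_lambda

# Keep only the final Q estimate; with yield_progress=True every
# intermediate (episode, q_sa) pair would be yielded instead.
final_q = None
for episode, q_sa in sarsa_lambda(num_episodes=1000, lamba=0.5, gamma=1):
    final_q = q_sa
print('learned %d state-action values' % len(final_q))
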
def get_sample_and_learn_online(env, q, bootstrap, discount, step_size, epsilon):
    s = env["start_state"]
    h_s = [s]
    h_a = []
    h_r = []
    over = False
    step = 0
    error = 0
    while not over:
        step += 1
        a = epsilon_greedy_select(q[s], epsilon)
        s_next, r, over = environment.step(env, s, a)
        if not over:
            if bootstrap == BOOTSTRAP_SARSA:
                a_next = epsilon_greedy_select(q[s_next], epsilon)
                q_next = q[s_next, a_next]
            elif bootstrap == BOOTSTRAP_EXPECTED:
                q_next = expected_q(q[s_next], epsilon)
            elif bootstrap == BOOTSTRAP_Q:
                q_next = max(q[s_next])
        else:
            q_next = 0
        delta = r + discount * q_next - q[s, a]
        q[s, a] = q[s, a] + step_size * delta
        s = s_next
        h_a.append(a)
        h_r.append(r)
        h_s.append(s)
        error += abs(step_size * delta)
    return (h_s, h_a, h_r), error

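The bootstrap-mode constants and the Q container are defined elsewhere in this snippet's project. A hedged sketch of the surrounding setup it appears to expect (the constant values, the grid-world dict keys, and the NumPy Q-table shape are all assumptions):

import numpy as np

# Assumed flags selecting the TD target; the real project defines its own values.
BOOTSTRAP_NONE = 0      # no bootstrapping: q_next = 0
BOOTSTRAP_SARSA = 1     # on-policy target q[s', a']
BOOTSTRAP_EXPECTED = 2  # expected-Sarsa target
BOOTSTRAP_Q = 3         # Q-learning target max_a q[s', a]

# Assumed grid-world description matching the keys used above.
env = {'start_state': 0, 'n_row': 4, 'n_col': 4}
q = np.zeros((env['n_row'] * env['n_col'], 4))  # q[s, a], 4 actions assumed
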
def mc_episode():
    states = []   # holds all states of one episode
    actions = []  # holds all actions of one episode

    # create the initial state
    dealer_card = random.randint(1, 10)
    player_card = random.randint(1, 10)
    state = environment.State(dealer_card, player_card)

    while not state.terminated:
        states.append(state)

        # define the indices for the state matrix
        dealer_state_index = state.dealer_card - 1
        player_state_index = state.player_sum - 1

        # pick the action
        epsilon = n0 / (n0 + state_info[dealer_state_index, player_state_index, ns_index])
        if random.random() < epsilon:
            # exploration, pick a random action
            if random.random() < 0.5:
                action = environment.Action.HIT
            else:
                action = environment.Action.STICK
        else:
            # pick the action greedily (largest action value)
            if state_info[dealer_state_index, player_state_index, q_hit_index] > \
                    state_info[dealer_state_index, player_state_index, q_stick_index]:
                action = environment.Action.HIT
            else:
                action = environment.Action.STICK

        # increment the counts
        state_info[dealer_state_index, player_state_index, ns_index] += 1
        if action == environment.Action.HIT:
            state_info[dealer_state_index, player_state_index, ns_hit_index] += 1
        if action == environment.Action.STICK:
            state_info[dealer_state_index, player_state_index, ns_stick_index] += 1

        # get a new state
        actions.append(action)
        state = environment.step(state, action)

    # update the action values
    for i in range(0, len(states)):
        s = states[i]
        a = actions[i]
        tot_reward = state.reward
        if not s.is_busted:
            dealer_state_index = s.dealer_card - 1
            player_state_index = s.player_sum - 1
            if a == environment.Action.HIT:
                alpha = 1 / state_info[dealer_state_index, player_state_index, ns_hit_index]
                value = state_info[dealer_state_index, player_state_index, q_hit_index]
                state_info[dealer_state_index, player_state_index, q_hit_index] += alpha * (tot_reward - value)
            else:
                alpha = 1 / state_info[dealer_state_index, player_state_index, ns_stick_index]
                value = state_info[dealer_state_index, player_state_index, q_stick_index]
                state_info[dealer_state_index, player_state_index, q_stick_index] += alpha * (tot_reward - value)

# Initialize environment, get initial state and reward
state, reward = env.reset()

# Simulate for training_length steps
for i in range(params.training_length):

    # Run network for 50 ms: Get left and right output spikes, get weights
    n_l, n_r, weights = snn.simulate(state, reward)
    w_l = weights[0]
    w_r = weights[1]

    # Perform a step
    # Get state, distance, pos_data, reward, terminate, steps,
    # travelled_distances, vrep_steps
    (state, distance, pos_data, reward, t, step,
     travelled_distances, vrep_steps) = env.step(n_l, n_r)

    # Save weights every 10 simulation steps
    if i % 10 == 0:
        weights_l.append(w_l)
        weights_r.append(w_r)
        weights_i.append(i)

    # Store distance, position, reward, step
    distances.append(distance)
    positions.append(pos_data)
    rewards.append(reward)
    steps.append(step)

    # Save # steps after the training resets
    if t:

q_table = np.load("./qtables/1/800-qtable.npy", allow_pickle=True).item()

for episode in range(EPISODES):
    state = env.reset()
    if state not in q_table:
        q_table[state] = np.random.uniform(
            low=-2, high=0, size=env.action_space_n)
    episode_reward = 0
    done = False
    while not done:
        valid_actions = env.get_valid_actions(0)
        action = max(valid_actions, key=lambda a: q_table[state][a])
        new_state, reward, done = env.step(action)
        episode_reward += reward
        if new_state not in q_table:
            q_table[new_state] = np.random.uniform(
                low=-2, high=0, size=env.action_space_n)
        print(reward, new_state, action)

        # If simulation did not end yet after last step - update Q table
        if not done:

            # Maximum possible Q value in next step (for new state)
            max_future_q = np.max(q_table[new_state])

            # Current Q value (for current state and performed action)

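The excerpt stops just before the update itself; for reference, the tabular Q-learning update it is building toward typically looks like the sketch below (LEARNING_RATE and DISCOUNT are assumed hyperparameters, not values taken from the source):

LEARNING_RATE = 0.1  # assumed
DISCOUNT = 0.95      # assumed

# Standard Q-learning update using the quantities computed in the excerpt above.
current_q = q_table[state][action]
new_q = (1 - LEARNING_RATE) * current_q + LEARNING_RATE * (reward + DISCOUNT * max_future_q)
q_table[state][action] = new_q
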
def sarsa_episode(lam):
    # reset all eligibility traces
    state_info[:, :, e_hit_index] = 0
    state_info[:, :, e_stick_index] = 0

    # initialize the state S
    dealer_card = random.randint(1, 10)
    player_card = random.randint(1, 10)
    state = environment.State(dealer_card, player_card)
    features = state.get_features()

    # initialize the action A
    action = environment.Action.HIT
    if random.random() < 0.5:
        action = environment.Action.STICK

    # run one episode
    while not state.terminated:
        # take the action A
        state_new = environment.step(state, action)
        reward = state_new.reward
        features_new = state_new.get_features()

        # pick the next action A' by using epsilon greedy
        action_new = None
        if state_new.terminated:
            action_new = environment.Action.NONE
        else:
            if random.random() < epsilon:
                # exploration, pick a random action
                if random.random() < 0.5:
                    action_new = environment.Action.HIT
                else:
                    action_new = environment.Action.STICK
            else:
                # pick the action greedily (largest action value)
                v_hit = np.sum(np.multiply(features_new, state_info[:, :, q_hit_index]))
                v_stick = np.sum(np.multiply(features_new, state_info[:, :, q_stick_index]))
                if v_hit > v_stick:
                    action_new = environment.Action.HIT
                else:
                    action_new = environment.Action.STICK

        # calculate delta
        if action == environment.Action.HIT:
            q_value = np.sum(np.multiply(features, state_info[:, :, q_hit_index]))
        else:
            q_value = np.sum(np.multiply(features, state_info[:, :, q_stick_index]))

        if state_new.terminated:
            q_value_new = 0
        else:
            if action_new == environment.Action.HIT:
                q_value_new = np.sum(np.multiply(features_new, state_info[:, :, q_hit_index]))
            else:
                q_value_new = np.sum(np.multiply(features_new, state_info[:, :, q_stick_index]))

        delta = reward + q_value_new - q_value

        # increment eligibility trace
        if action == environment.Action.HIT:
            state_info[:, :, e_hit_index] += features
        else:
            state_info[:, :, e_stick_index] += features

        # update all values
        state_info[:, :, q_hit_index] += alpha * delta * state_info[:, :, e_hit_index]
        state_info[:, :, q_stick_index] += alpha * delta * state_info[:, :, e_stick_index]

        # update all eligibility traces
        state_info[:, :, e_hit_index] = lam * state_info[:, :, e_hit_index]
        state_info[:, :, e_stick_index] = lam * state_info[:, :, e_stick_index]

        # end this step
        state = state_new
        action = action_new
        features = features_new

cumulated_reward = 0  # Should going forward give more reward than L/R?

observation = environment.reset()

if qlearn.epsilon > 0.05:
    qlearn.epsilon *= epsilon_discount

state = ''.join(map(str, observation))
# print("State = ", state, " observation = ", observation)

for i in range(1500):
    # Pick an action based on the current state
    action = qlearn.chooseAction(state)

    # Execute the action and get feedback
    observation, reward, done, info = environment.step(action)
    cumulated_reward += reward

    if highest_reward < cumulated_reward:
        highest_reward = cumulated_reward

    nextState = ''.join(map(str, observation))

    qlearn.learn(state, action, reward, nextState)

    # environment.monitor.flush(force=True)
    print(i, " S= ", state, " A = ", action, 'observation = ', observation)

    if not (done):
        state = nextState
    else:

def sarsa_lambda(l=0.9, max_episodes=1000, policy=policies.epsilon_greedy, n_zero=100, gamma=1,
                 plot_learning_curve=True, multiproc=True):
    """
    Applies eligibility trace version of Sarsa to the game Easy21

    :param l: lambda parameter
    :param max_episodes: stop learning after this many episodes
    :param policy: exploration strategy to use
    :param n_zero: epsilon greedy constant (only applicable if epsilon greedy policy is used)
    :param gamma: discounting rate
    :param plot_learning_curve: whether to turn on plotting of learning curve for lambda = 0 and 1
    :param multiproc: whether to use multiprocessing when doing plots or not (warning! turn off if
        running multiple algorithms on mac or windows simultaneously)
    :return: value function after max_episodes
    """
    # (player, dealer, action) key
    value_function = defaultdict(float)
    # (player, dealer) key
    counter_state = defaultdict(int)
    # (player, dealer, action) key
    counter_state_action = defaultdict(int)
    # no. of wins to calculate the percentage of wins at the end
    wins = 0

    # learning curve plotting
    if l in {0, 1} and plot_learning_curve:
        learning_curve = []
        try:
            mc_values = pickle.load(open("Data/MC_value_function.pickle", "rb"))
        except:
            mc_values = monte_carlo(iterations=1000000)

    for episode in range(max_episodes):
        # current (player, dealer, action)
        eligibility_trace = defaultdict(float)

        # initial state, action [SA..]
        state = environment.State()
        player_current = state.player_sum
        dealer_current = state.dealer_first_card
        epsilon = n_zero / float(n_zero + counter_state[(player_current, dealer_current)])
        action_current = policy(epsilon, value_function, state)

        while not state.terminal:
            # update counts
            counter_state[(player_current, dealer_current)] += 1
            counter_state_action[(player_current, dealer_current, action_current)] += 1

            # take a step, get reward [..R..]
            [state, reward] = environment.step(state, action_current)
            if reward is None:
                reward = 0

            # follow up state, action [..SA]
            player_next = state.player_sum
            dealer_next = state.dealer_first_card
            epsilon = n_zero / float(n_zero + counter_state[(player_next, dealer_next)])
            action_next = policy(epsilon, value_function, state)

            delta = reward + gamma * value_function[(player_next, dealer_next, action_next)] - \
                value_function[(player_current, dealer_current, action_current)]
            alpha = 1.0 / counter_state_action[(player_current, dealer_current, action_current)]
            eligibility_trace[(player_current, dealer_current, action_current)] += 1

            # update the values
            for key in value_function:
                value_function[key] += alpha * delta * eligibility_trace[key]
                eligibility_trace[key] *= gamma * l

            player_current = player_next
            dealer_current = dealer_next
            action_current = action_next

        # use it later to calculate the percentage of wins
        if reward == 1:
            wins += 1

        # get the episode MSE for plotting learning curve
        if l in {0, 1} and plot_learning_curve:
            learning_curve.append((episode, utilities.calculate_mse(mc_values, value_function)))

    # plot learning curve
    if l in {0, 1} and plot_learning_curve:
        if multiproc:
            # create a new process so computation can continue after plotting
            p = Process(target=plotting.plot_learning_curve, args=(learning_curve, l,))
            p.start()
        else:
            plotting.plot_learning_curve(learning_curve, l)

    # get the percentage of wins
    print float(wins) / max_episodes

    return value_function

step = 0
ep = 0
while step < maxSteps:
    ep += 1
    x = environment.reset()  # initialize the state
    C = 0.
    done = False
    t = 1
    while not done:
        t += 1
        step += 1
        a = agent.action(x)
        u = Actions[a]
        # env.render()  # only for visual effects
        x_next, c, done = environment.step(u, x)
        C += (1. / t) * (c - C)
        agent.update(x, a, c, x_next, done)
        x = x_next
        if done:
            break
        if step >= maxSteps:
            break
    R.append(C)
    print('Episode:', ep, 'Total Steps:', step, ', Ave. Reward/Power :', c,
          ', Episode Length:', t - 1)

def sarsa_episode(lam):
    """
    executes one sarsa episode
    :param lam: the lambda parameter
    :return:
    """
    # reset all eligibility traces
    state_info[:, :, e_hit_index] = 0
    state_info[:, :, e_stick_index] = 0

    # initialize the state S
    dealer_card = random.randint(1, 10)
    player_card = random.randint(1, 10)
    state = environment.State(dealer_card, player_card)

    # initialize the action A
    action = environment.Action.HIT
    if random.random() < 0.5:
        action = environment.Action.STICK

    # run one episode
    while not state.terminated:
        # define the starting state indices for the state matrix
        dealer_state_index = state.dealer_card - 1
        player_state_index = state.player_sum - 1

        # take the action A
        state_new = environment.step(state, action)
        reward = state_new.reward

        # define the indices of the new state
        dealer_state_index_new = state_new.dealer_card - 1
        player_state_index_new = state_new.player_sum - 1

        # pick the next action A' by using epsilon greedy
        if state_new.terminated:
            action_new = environment.Action.NONE
        else:
            epsilon = n0 / (n0 + state_info[dealer_state_index_new, player_state_index_new, ns_index])
            if random.random() < epsilon:
                # exploration, pick a random action
                if random.random() < 0.5:
                    action_new = environment.Action.HIT
                else:
                    action_new = environment.Action.STICK
            else:
                # pick the action greedily (largest action value)
                if state_info[dealer_state_index_new, player_state_index_new, q_hit_index] > \
                        state_info[dealer_state_index_new, player_state_index_new, q_stick_index]:
                    action_new = environment.Action.HIT
                else:
                    action_new = environment.Action.STICK

        # increment the counts
        state_info[dealer_state_index, player_state_index, ns_index] += 1
        if action == environment.Action.HIT:
            state_info[dealer_state_index, player_state_index, ns_hit_index] += 1
        if action == environment.Action.STICK:
            state_info[dealer_state_index, player_state_index, ns_stick_index] += 1

        # calculate delta
        if action == environment.Action.HIT:
            qValue = state_info[dealer_state_index, player_state_index, q_hit_index]
        else:
            qValue = state_info[dealer_state_index, player_state_index, q_stick_index]

        if state_new.terminated:
            q_value_new = 0
        else:
            if action_new == environment.Action.HIT:
                q_value_new = state_info[dealer_state_index_new, player_state_index_new, q_hit_index]
            else:
                q_value_new = state_info[dealer_state_index_new, player_state_index_new, q_stick_index]

        delta = reward + q_value_new - qValue

        # increment eligibility trace
        alpha = None
        if action == environment.Action.HIT:
            alpha = 1 / state_info[dealer_state_index, player_state_index, ns_hit_index]
            state_info[dealer_state_index, player_state_index, e_hit_index] += 1
        else:
            alpha = 1 / state_info[dealer_state_index, player_state_index, ns_stick_index]
            state_info[dealer_state_index, player_state_index, e_stick_index] += 1

        # update all values
        state_info[:, :, q_hit_index] += alpha * delta * state_info[:, :, e_hit_index]
        state_info[:, :, q_stick_index] += alpha * delta * state_info[:, :, e_stick_index]

        # update all eligibility traces
        state_info[:, :, e_hit_index] = lam * state_info[:, :, e_hit_index]
        state_info[:, :, e_stick_index] = lam * state_info[:, :, e_stick_index]

        # end this step
        state = state_new
        action = action_new

x = np.array([])
y = np.array([])
win_array = np.array([])
win_sum = 0
# agent.load('final_agent')

for cnt2 in range(1):
    turny = 0
    win = 0
    print("Trial number " + str(cnt2 + 1))
    for cnt in range(1):
        turn = 0
        while not done:
            action = agent.act_and_train(obs, r)
            obs, r, done, info = env.step(action)
            turn += 1
        if r == 10:
            win += 1
        turny += turn
        agent.stop_episode_and_train(obs, r, done)
        obs = env.reset()
        r = 0
        done = False
    x = np.append(x, cnt2)
    y = np.append(y, turny / 1000)
    win_array = np.append(win_array, win)
    win_sum += win

def linear_function_approximation(l=0.9, max_episodes=1000, policy=policies.epsilon_greedy_lfa,
                                  n_zero=100, gamma=1, plot_learning_curve=True, multiproc=True):
    """
    Value function approximation using coarse coding

    :param l: lambda parameter
    :param gamma: discounting rate
    :param max_episodes: stop learning after this many episodes
    :param policy: exploration strategy to use
    :param n_zero: epsilon greedy constant (only applicable if epsilon greedy policy is used)
    :param multiproc: whether to use multiprocessing when doing plots or not (warning! turn off if
        running multiple algorithms on mac or windows simultaneously)
    :return: value function after max_episodes
    """
    # weights vector for the state_action feature vector
    theta = np.random.random(36) * 0.2
    # random move probability
    epsilon = 0.05
    # step-size parameter
    alpha = 0.01

    # learning curve plotting
    if l in {0, 1} and plot_learning_curve:
        learning_curve = []
        try:
            mc_values = pickle.load(open("Data/MC_value_function.pickle", "rb"))
        except:
            mc_values = monte_carlo(iterations=1000000)

    for episode in range(max_episodes):
        # key is state_action feature vector
        eligibility_trace = np.zeros(36)

        # initial state, action [SA..], and set of features
        state = environment.State()
        # calculate features for the given state
        state_features_current = utilities.get_state_features(state)
        # get action from this state
        q_a_current, action_current = policy(epsilon, theta, state_features_current)
        # calculate final state, action feature vector
        features_current = utilities.get_state_action_features(state_features_current, action_current)

        while not state.terminal:
            # update eligibility trace (accumulating)
            eligibility_trace = np.add(eligibility_trace, features_current)

            # take a step, get reward [..R..]
            [state, reward] = environment.step(state, action_current)
            if reward is None:
                reward = 0

            # follow up state, action [..SA]
            state_features_next = utilities.get_state_features(state)
            q_a_next, action_next = policy(epsilon, theta, state_features_next)
            features_next = utilities.get_state_action_features(state_features_next, action_next)

            # calculate state value difference
            delta = reward + gamma * q_a_next - q_a_current
            # update weights
            theta = np.add(theta, alpha * delta * eligibility_trace)
            # update trace
            eligibility_trace *= gamma * l

            features_current = features_next
            action_current = action_next
            # carry the estimate forward so delta uses Q(s, a) of the current step
            q_a_current = q_a_next

        # calculate value function
        value_function = defaultdict(float)
        for player in xrange(1, 22):
            for dealer in xrange(1, 11):
                for action in [0, 1]:
                    s = environment.State(dealer, player)
                    phi = utilities.get_state_action_features(utilities.get_state_features(s), action)
                    value_function[(s.player_sum, s.dealer_first_card, action)] = phi.dot(theta)

        # get the episode MSE for plotting learning curve
        if l in {0, 1} and plot_learning_curve:
            learning_curve.append((episode, utilities.calculate_mse(mc_values, value_function)))

    # plot learning curves
    if l in {0, 1} and plot_learning_curve:
        if multiproc:
            # create a new process so computation can continue after plotting
            p = Process(target=plotting.plot_learning_curve, args=(learning_curve, l,))
            p.start()
        else:
            plotting.plot_learning_curve(learning_curve, l)

    return value_function

def anime():
    environment.step()
    for i in range(0, number_ants):
        afficher(environment.population[i])
    if active == True:
        fenetre.after(5, anime)

def expansion(self, a):
    '''Expands tree from current leaf node with action a. Returns expanded node'''
    s_prime, r, terminate = step(self.s, a)
    self.children[a] = Node(self.network,
                            s=s_prime.reverse_player_positions(),
                            parent=self,
                            prev_a=a,
                            prev_r=r,
                            terminate=terminate)
    return self.children[a]

def lfa_sarsa_lambda(num_episodes=1000, lamba=0, gamma=1, alpha=0.01, yield_progress=False):
    # Set up the coarse codes, initial weights.
    action_codes = {}
    for action in list(Action):
        action_fns = []
        for dealer_interval in [(1, 4), (4, 7), (7, 10)]:
            for player_interval in [(1, 6), (4, 9), (7, 12), (10, 15), (13, 18), (16, 21)]:
                cuboid_fn = create_cuboid_fn(dealer_interval, player_interval, action)
                action_fns.append(cuboid_fn)
        action_codes[action] = action_fns

    def greedy(s, w):
        p, d = s
        action_values = []
        for a in list(Action):
            value = 0
            for cuboid_fn in action_codes[a]:
                if cuboid_fn(p, d, a):
                    value += w.get(cuboid_fn, 0)
            action_values.append((a, value))
        action_values.sort(key=itemgetter(1), reverse=True)
        return action_values[0][0]

    def e_greedy(s, w, epsilon=0.05):
        a_best = greedy(s, w)
        selection_probs = []
        default_p = epsilon / len(Action)
        for a in list(Action):
            if a is a_best:
                selection_probs.append(1 - epsilon + default_p)
            else:
                selection_probs.append(default_p)
        return sample_action(selection_probs)

    def f_sa(s, a):
        p, d = s
        for cuboid_fn in action_codes[a]:
            if cuboid_fn(p, d, a):
                yield cuboid_fn

    def compile_q_sa(w):
        q_sa = {}
        for (p, d), a in generate_all_state_action_pairs():
            sa = (p, d, a)
            val = 0
            for i in f_sa((p, d), a):
                val += w.get(i, 0)
            q_sa[sa] = val
        return q_sa

    w_f = {}

    for n in range(num_episodes):
        state = State()
        s = state.as_tuple()
        a = e_greedy(s, w_f)
        z_f = {}

        while not state.terminal:
            state, reward = step(state, a)

            delta = reward
            for i in f_sa(s, a):
                delta = delta - w_f.get(i, 0)
                z_f[i] = z_f.get(i, 0) + 1

            if state.terminal:
                for i, zi in z_f.items():
                    w_f[i] = w_f.get(i, 0) + alpha * delta * zi
                break

            s_next = state.as_tuple()
            a_next = e_greedy(s_next, w_f)
            for i in f_sa(s_next, a_next):
                delta = delta + gamma * w_f.get(i, 0)

            for i, zi in z_f.items():
                w_f[i] = w_f.get(i, 0) + alpha * delta * zi
                z_f[i] = gamma * lamba * zi

            s = s_next
            a = a_next

        if yield_progress:
            yield n + 1, compile_q_sa(w_f)

    if not yield_progress:
        yield num_episodes, compile_q_sa(w_f)
