for episode in xrange(iteration_num):
    if episode % 100 == 0:
        print '\repisode:', episode,
    # initialize the state
    state = environment.State()
    while state.terminal is False:
        dealer = state.dealer
        player = state.player
        # update epsilon
        epsilon = float(num_zero) / (num_zero + number_state[(dealer, player)])
        # decide an action
        action = epsilon_greedy(action_value_function, state, epsilon)
        # count state and state-action visits
        number_state[(dealer, player)] += 1
        number_state_action[(dealer, player, action)] += 1
        # take an action
        reward = step(state, action)
        if reward is None:
            # assign 0 if the episode hasn't finished yet
            reward = 0
        # update the action-value function towards the observed return
        action_value_function[(dealer, player, action)] += \
            (1. / number_state_action[(dealer, player, action)]) \
            * (reward - action_value_function[(dealer, player, action)])
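# The epsilon_greedy helper used above is not shown in this file. Below is a
# minimal sketch of what such a helper might look like, given only the call
# signature epsilon_greedy(action_value_function, state, epsilon) and the
# (dealer, player, action) keys used above; the body and the action labels are
# assumptions for illustration, not the original implementation.
import random

def epsilon_greedy_sketch(action_value_function, state, epsilon):
    actions = ('hit', 'stick')  # assumed action labels
    if random.random() < epsilon:
        # explore: pick an action uniformly at random
        return random.choice(actions)
    # exploit: pick the action with the highest current Q estimate
    return max(actions,
               key=lambda a: action_value_function[(state.dealer, state.player, a)])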
def sarsa(lambda_value, iteration_num):
    """
    :rtype : MSE (float)
    """
    print 'lambda:', lambda_value
    # define functions (dictionaries)
    action_value_function = ActionValue(float)
    number_state = defaultdict(int)
    number_state_action = defaultdict(int)
    # define parameters
    batch = 100
    num_zero = 10
    if lambda_value == 0. or lambda_value == 1.:
        learning_curve = []
    # iterate over iteration_num episodes
    for episode in xrange(iteration_num):
        if episode % batch == 0:
            print '\repisode:', episode,
            if lambda_value == 0. or lambda_value == 1.:
                learning_curve.append(calculate_mse(action_value_function))
        # initialize state, action, epsilon, and eligibility trace
        state = environment.State()
        current_dealer = state.dealer
        current_player = state.player
        epsilon = float(num_zero) / (num_zero + number_state[(current_dealer, current_player)])
        current_action = epsilon_greedy(action_value_function, state, epsilon)
        eligibility = defaultdict(int)
        while state.terminal is False:
            # count state and state-action visits
            number_state[(current_dealer, current_player)] += 1
            number_state_action[(current_dealer, current_player, current_action)] += 1
            # take an action
            reward = step(state, current_action)
            if reward is None:
                # assign 0 if the episode hasn't finished yet
                reward = 0
            new_dealer = state.dealer
            new_player = state.player
            # update epsilon
            epsilon = float(num_zero) / (num_zero + number_state[(new_dealer, new_player)])
            # decide the next action
            new_action = epsilon_greedy(action_value_function, state, epsilon)
            # update alpha, delta, and the eligibility trace
            alpha = 1. / number_state_action[(current_dealer, current_player, current_action)]
            delta = reward + action_value_function[(new_dealer, new_player, new_action)] \
                - action_value_function[(current_dealer, current_player, current_action)]
            eligibility[(current_dealer, current_player, current_action)] += 1
            # iterate over every state and action
            # To Be Fixed: this is probably slower than a vectorized implementation
            for key in action_value_function.keys():
                dealer, player, action = key
                # update the action-value function
                action_value_function[(dealer, player, action)] \
                    += alpha * delta * eligibility[(dealer, player, action)]
                # decay the eligibility trace
                eligibility[(dealer, player, action)] *= lambda_value
            # update state and action
            current_dealer = new_dealer
            current_player = new_player
            current_action = new_action
    print '\repisode:', episode
    print 'done!'
    if lambda_value == 0. or lambda_value == 1.:
        learning_curve.append(calculate_mse(action_value_function))
    # plot the learning curve
    if lambda_value == 0. or lambda_value == 1.:
        x = range(0, iteration_num + 1, batch)
        pylab.title('Learning curve of Mean-Squared Error against episode number: lambda = ' + str(lambda_value))
        pylab.xlabel("episode number")
        pylab.xlim([0, iteration_num])
        pylab.xticks(range(0, iteration_num + 1, batch))
        pylab.ylabel("Mean-Squared Error")
        pylab.plot(x, learning_curve)
        pylab.show()
    # calculate MSE
    print 'calculate the Mean-Squared Error...'
    MSE = calculate_mse(action_value_function)
    ## value function
    #value_function = action_value_function.to_value_function()
    ## plot the optimal value function
    #plot_value_function(value_function, "Optimal Value Function (Sarsa)")
    return MSE
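# One way to address the "To Be Fixed" note above without vectorizing: only the
# (dealer, player, action) triples visited in the current episode can have a
# nonzero eligibility trace, so looping over the eligibility dictionary alone
# produces the same updates as looping over every key of the Q table. Below is a
# sketch of that idea as a hypothetical helper (the name apply_trace_update and
# its factoring are assumptions; the update rule itself matches the loop above):
def apply_trace_update(action_value_function, eligibility, alpha, delta, lambda_value):
    for key, trace in eligibility.items():
        # update the action-value function only where the trace is nonzero
        action_value_function[key] += alpha * delta * trace
        # decay the eligibility trace for the next step
        eligibility[key] = trace * lambda_value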
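# A sketch of how sarsa() might be driven to compare lambda values: run it over a
# grid of lambdas and plot the resulting Mean-Squared Error against lambda. The
# episode count and the lambda grid below are illustrative assumptions, not taken
# from the original code (pylab is assumed to be imported as used above).
if __name__ == '__main__':
    lambda_values = [i / 10. for i in xrange(11)]  # 0.0, 0.1, ..., 1.0
    mse_per_lambda = [sarsa(lambda_value, 1000) for lambda_value in lambda_values]
    pylab.title('Mean-Squared Error against lambda')
    pylab.xlabel('lambda')
    pylab.ylabel('Mean-Squared Error')
    pylab.plot(lambda_values, mse_per_lambda)
    pylab.show()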