def calculate_mse(subject_function):
    """Mean-squared error of subject_function against the Monte Carlo control 'ground truth' Q."""
    true_result = unpickle('result/MonteCarloControl.pkl')
    true_action_value_function = true_result['action_value']
    linear_function = LinearFunction()

    # calculate the MSE over all (dealer, player, action) triples
    MSE = 0
    denominator = 0
    for dealer in range(1, 11):
        for player in range(1, 22):
            for action in range(0, 2):
                state = State(dealer=dealer, player=player)
                linear_function.update(state)
                features = linear_function.get_features()
                MSE += (subject_function[(features, action)]
                        - true_action_value_function[(dealer, player, action)]) ** 2
                denominator += 1
    MSE /= denominator
    return MSE
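# The weight and eligibility vectors in sarsa() below have 3 * 6 = 18 entries, which matches
# an Easy21-style coarse coding for LinearFunction.get_features(): one binary feature per
# (dealer interval, player interval) pair. The helper below is only an illustrative sketch of
# that assumed encoding; LinearFunction itself is defined elsewhere in this project, and the
# interval boundaries used here are an assumption, not values taken from this file.
def _coarse_coding_sketch(dealer, player):
    dealer_intervals = [(1, 4), (4, 7), (7, 10)]
    player_intervals = [(1, 6), (4, 9), (7, 12), (10, 15), (13, 18), (16, 21)]
    features = []
    for dealer_low, dealer_high in dealer_intervals:
        for player_low, player_high in player_intervals:
            active = dealer_low <= dealer <= dealer_high and player_low <= player <= player_high
            features.append(1 if active else 0)
    # returned as a tuple so it can serve as a dictionary key, as in calculate_mse above
    return tuple(features)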
def sarsa(lambda_value, iteration_num):
    """Run SARSA(lambda) with linear function approximation for iteration_num episodes.

    :rtype : MSE (float)
    """
    print 'lambda:', lambda_value

    # define functions (dictionaries)
    action_value_function = ActionValueLinearApproximation(float)
    linear_function = LinearFunction()
    parameters_hit = np.array([0 for i in range(3 * 6)])
    parameters_stick = np.array([0 for i in range(3 * 6)])

    # define parameters
    batch = 100
    num_zero = 10
    epsilon = 0.1
    alpha = 0.01
    HIT = 0
    STICK = 1

    if lambda_value == 0. or lambda_value == 1.:
        learning_curve = []

    # iterate over iteration_num episodes
    for episode in xrange(iteration_num):
        if episode % batch == 0:
            print '\repisode:', episode,
            if lambda_value == 0. or lambda_value == 1.:
                learning_curve.append(calculate_mse(action_value_function))

        # initialize state, action, and eligibility-trace
        state = environment.State()
        linear_function.update(state)
        current_features = linear_function.get_features()
        action = epsilon_greedy_linear(action_value_function, current_features, epsilon)
        eligibility_hit = np.array([0 for i in range(3 * 6)])
        eligibility_stick = np.array([0 for i in range(3 * 6)])

        while state.terminal is False:
            # accumulate the eligibility-trace for the action just taken
            if action == HIT:
                eligibility_hit = np.add(eligibility_hit, np.array(current_features))
            else:
                eligibility_stick = np.add(eligibility_stick, np.array(current_features))

            # take an action
            reward = step(state, action)
            if reward is None:
                # assign 0 if the match hasn't finished yet
                reward = 0
            linear_function.update(state)
            new_features = linear_function.get_features()

            # update delta
            delta_hit = reward - np.array(new_features).dot(parameters_hit)
            delta_stick = reward - np.array(new_features).dot(parameters_stick)

            # update Action Value Function
            if action == HIT:
                action_value_function.update_value((new_features, action), parameters_hit)
            else:
                action_value_function.update_value((new_features, action), parameters_stick)

            # update delta, parameters, and eligibility-trace
            if action == HIT:
                delta_hit += action_value_function[(new_features, HIT)]
            else:
                delta_stick += action_value_function[(new_features, STICK)]
            parameters_hit = np.add(parameters_hit, alpha * delta_hit * eligibility_hit)
            parameters_stick = np.add(parameters_stick, alpha * delta_stick * eligibility_stick)
            eligibility_hit = eligibility_hit * lambda_value
            eligibility_stick = eligibility_stick * lambda_value

            # choose the next action (epsilon-greedy)
            action = epsilon_greedy_linear(action_value_function, new_features, epsilon)

            # the new features become the current features for the next step
            current_features = new_features

    print '\repisode:', episode
    print 'done!'
    if lambda_value == 0. or lambda_value == 1.:
        learning_curve.append(calculate_mse(action_value_function))

    # plot learning curve
    if lambda_value == 0. or lambda_value == 1.:
        x = range(0, iteration_num + 1, batch)
        pylab.title('Learning curve of Mean-Squared Error against episode number: '
                    'lambda = ' + str(lambda_value))
        pylab.xlabel("episode number")
        pylab.xlim([0, iteration_num])
        pylab.xticks(range(0, iteration_num + 1, batch))
        pylab.ylabel("Mean-Squared Error")
        pylab.plot(x, learning_curve)
        pylab.show()

    # calculate MSE
    print 'calculate the Mean-Squared Error...'
    MSE = calculate_mse(action_value_function)

    ## value function
    #value_function = action_value_function.to_value_function()
    ## plot the optimal value function
    #plot_linear_value_function(action_value_function, "Optimal Value Function (Linear Approximation)")

    return MSE
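# A minimal driver sketch (not part of the original module): run sarsa() over a grid of
# lambda values and plot MSE against lambda. The episode budget and the lambda grid below
# are illustrative assumptions, not values taken from this file.
if __name__ == '__main__':
    lambda_values = [i / 10.0 for i in range(0, 11)]  # 0.0, 0.1, ..., 1.0
    iteration_num = 1000                              # assumed number of episodes per run
    mse_by_lambda = [sarsa(lambda_value, iteration_num) for lambda_value in lambda_values]

    pylab.title('Mean-Squared Error against lambda (linear approximation)')
    pylab.xlabel('lambda')
    pylab.ylabel('Mean-Squared Error')
    pylab.plot(lambda_values, mse_by_lambda)
    pylab.show()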