Example #1
from typing import Callable, List

from linear_function import LinearFunction


def pivot(constraints: List[LinearFunction], objective_fn: LinearFunction,
          callback: Callable[[int, int], None]) -> None:
    """
    >>> from linear_function import LinearFunction
    >>> constraints = [
    ...     LinearFunction(18, [-2, -1, -1]),
    ...     LinearFunction(30, [-1, -2, -2]),
    ...     LinearFunction(24, [-2, -2, -2]),
    ... ]
    >>> objective_fn = LinearFunction(0, [6, 5, 4])
    >>> pivot(constraints, objective_fn, lambda entering, leaving: None)
    >>> print(objective_fn)
    54 + -3*x₀ + 2*x₁ + 1*x₂
    >>> for i in constraints: print(i)
    9 + -1/2*x₀ + -1/2*x₁ + -1/2*x₂
    21 + 1/2*x₀ + -3/2*x₁ + -3/2*x₂
    6 + 1*x₀ + -1*x₁ + -1*x₂
    """
    # entering variable: first coefficient in the objective that is still positive
    arg_number = first_index(objective_fn.coefs, lambda x: x > 0)
    # leaving constraint: the one that bounds the entering variable most tightly
    index = tightest_constraint(constraints, arg_number)
    assert -constraints[index].free / constraints[index].coefs[arg_number] > 0
    # solve the leaving constraint for the entering variable, then substitute it
    # into every other constraint and into the objective function
    constraints[index].rearrange(arg_number)
    for c in skip_at(constraints, index):
        c.substitute(arg_number, constraints[index])
    objective_fn.substitute(arg_number, constraints[index])
    callback(arg_number, index)
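pivot relies on three helpers that this snippet does not show: first_index, tightest_constraint, and skip_at. The sketch below is one plausible implementation consistent with the doctest above; the names come from the original code, but the bodies (including the minimum-ratio rule in tightest_constraint) are assumptions, not the project's confirmed implementation.

def first_index(items, predicate):
    # Position of the first item satisfying the predicate, or None if none does.
    for i, item in enumerate(items):
        if predicate(item):
            return i
    return None


def tightest_constraint(constraints, arg_number):
    # Minimum-ratio test: among constraints with a negative coefficient on the
    # entering variable, pick the one with the smallest bound -free / coef.
    best_index, best_ratio = -1, float("inf")
    for i, c in enumerate(constraints):
        if c.coefs[arg_number] < 0:
            ratio = -c.free / c.coefs[arg_number]
            if ratio < best_ratio:
                best_index, best_ratio = i, ratio
    return best_index


def skip_at(items, index):
    # Yield every element except the one at the given index.
    for i, item in enumerate(items):
        if i != index:
            yield item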
Example #2
def calculate_mse(action_value_function):
    # Mean squared error between the approximated action-value function and a
    # Monte Carlo estimate precomputed and saved to 'mc_result.dat'.
    mc_action_value_function = load('mc_result.dat')
    linear_function = LinearFunction()
    mse, count = 0, 0

    for dealer in range(1, 11):
        for player in range(1, 22):
            for action in range(0, 2):
                state = State(dealer=dealer, player=player)
                linear_function.update(state)
                features = linear_function.get_features()

                mc_reward = mc_action_value_function[(dealer, player, action)]
                reward = action_value_function[(tuple(features), action)]
                mse += (reward - mc_reward) ** 2
                count += 1

    mse /= count
    return mse
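load and State come from the surrounding project and are not shown in this example. Below is a minimal sketch of load, assuming the Monte Carlo baseline in mc_result.dat was serialized with pickle; the actual serialization format is not confirmed by this snippet.

import pickle


def load(filename):
    # Deserialize a previously saved object, e.g. the Monte Carlo
    # action-value dictionary used as the ground-truth baseline.
    with open(filename, 'rb') as f:
        return pickle.load(f)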
Example #3
def sarsa(lambd):
    # SARSA(lambda) with a linear approximation of the action-value function
    # (18 binary features, separate parameter vectors for HIT and STICK).
    n_episodes = 1000
    epi_batch = 100
    episodes = range(n_episodes)
    action_value_function = defaultdict(float)
    linear_function = LinearFunction()
    params_hit = np.zeros(18)
    params_stick = np.zeros(18)
    n_zero = 10
    epsilon = 0.05
    alpha = 0.01

    if lambd == 0.0 or lambd == 1.0:
        mses = []

    for episode in episodes:
        if episode%epi_batch == 0:
            if lambd == 0.0 or lambd == 1.0:
                mses.append(calculate_mse(action_value_function))

        # initialize state, features, action, and eligibility traces
        state = State()
        linear_function.update(state)
        current_feats = linear_function.get_features()
        action = epsilon_greedy_policy(action_value_function, state, epsilon, current_feats)
        eligibility_hit = np.zeros(18)
        eligibility_stick = np.zeros(18)

        while not state.terminal:
            np_feats = np.array(current_feats)
            if action is HIT:
                eligibility_hit = np.add(eligibility_hit, np_feats)
            else:
                eligibility_stick = np.add(eligibility_stick, np_feats)

            reward = step(state, action)
            linear_function.update(state)
            new_features = linear_function.get_features()

            # TD error: reward minus the linear estimate for the new features
            delta_hit = reward - np.array(new_features).dot(params_hit)
            delta_stick = reward - np.array(new_features).dot(params_stick)

            # update Action Value Function
            if action == HIT:
                update_action_value_function(action_value_function, (new_features, action), params_hit)
            else:
                update_action_value_function(action_value_function, (new_features, action), params_stick)

            # update delta, parameters, and eligibility-trace
            if action == HIT:
                delta_hit += action_value_function[(tuple(new_features), HIT)]
            else:
                delta_stick += action_value_function[(tuple(new_features), STICK)]

            params_hit = np.add(params_hit, alpha * delta_hit * eligibility_hit)
            params_stick = np.add(params_stick, alpha * delta_stick * eligibility_stick)
            eligibility_hit = eligibility_hit * lambd
            eligibility_stick = eligibility_stick * lambd

            # decide an action
            action = epsilon_greedy_policy(action_value_function, state, epsilon, new_features)

            # carry the new features over to the next iteration
            current_feats = new_features


    if lambd == 0.0 or lambd == 1.0:
        mses.append(calculate_mse(action_value_function))

    # plot mses curve
    if lambd == 0.0 or lambd == 1.0:
        print "Plotting learning curve for $\lambda$=",lambd
        x = range(0, n_episodes + 1, epi_batch)
        fig = plt.figure()
        plt.title(r'Learning curve of MSE against Episodes @ $\lambda$ = ' + str(lambd))
        plt.xlabel("episode number")
        plt.xlim([0, n_episodes])
        plt.xticks(range(0, n_episodes + 1, epi_batch))
        plt.ylabel("Mean-Squared Error (MSE)")
        plt.plot(x, mses)
        fname = "lapprox_mse_lambda%f_%s.png" % (lambd, str(datetime.now()))
        plt.savefig(fname)
        # plt.show()

    mse = calculate_mse(action_value_function)

    return mse
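epsilon_greedy_policy and update_action_value_function are also defined elsewhere in the project. The sketches below are one plausible reading of the call sites above; the bodies, and the HIT/STICK encoding, are assumptions rather than the original implementations.

import random

import numpy as np

HIT, STICK = 0, 1  # assumed action encoding


def update_action_value_function(action_value_function, key, params):
    # Cache the linear estimate Q(s, a) = features . params in the dictionary
    # under the key (tuple(features), action), matching the lookups above.
    features, action = key
    action_value_function[(tuple(features), action)] = np.dot(features, params)


def epsilon_greedy_policy(action_value_function, state, epsilon, features):
    # With probability epsilon explore, otherwise act greedily with respect to
    # the cached estimates. `state` is accepted for interface compatibility,
    # but this sketch keys only on the feature vector.
    if random.random() < epsilon:
        return random.choice([HIT, STICK])
    q_hit = action_value_function[(tuple(features), HIT)]
    q_stick = action_value_function[(tuple(features), STICK)]
    return HIT if q_hit >= q_stick else STICK

A typical (hypothetical) driver sweeps lambda from 0.0 to 1.0 and records the final MSE returned by sarsa for each value:

import numpy as np
import matplotlib.pyplot as plt

lambdas = np.arange(0.0, 1.1, 0.1)
final_mses = [sarsa(lambd) for lambd in lambdas]

plt.figure()
plt.title('Final MSE against lambda')
plt.xlabel('lambda')
plt.ylabel('Mean-Squared Error (MSE)')
plt.plot(lambdas, final_mses)
plt.savefig('lapprox_mse_vs_lambda.png')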