Example #1
def rollout_evaluation(start_state, weights):
    """
    For a given weights vector, perform a single rollout
    Return the total reward
    """
    # Initialize the simulator from the discrete start_state
    sim = AirplaneSimulator(dim=1, init_discrete_state=start_state)

    ###########################################################################
    # Perform roll out

    # Initialization
    total_reward = 0.0

    # Get initial action
    features = extract_features(start_state, sim)
    action = generate_policy_action(features, weights, sim)

    # Loop until an end state is reached
    while not sim.is_end_state(sim.state):
        print(sim.discrete_state)
        next_state, reward = sim.controller_motion_y(action)
        total_reward += reward

        # Generate features and action for next state
        features = extract_features(next_state, sim)
        action = generate_policy_action(features, weights, sim)
    #--------------------------------------------------------------------------

    # Return total_reward
    return total_reward
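
If the simulator's transitions are stochastic (Example #2 below averages nIter simulations per action for exactly this reason), a single rollout gives a noisy score for a weights vector. The helper below is an illustrative sketch of averaging several rollouts; the name score_weights and the n_rollouts parameter are assumptions, not part of the original code.

def score_weights(start_state, weights, n_rollouts=10):
    """Illustrative helper: average rollout_evaluation over several rollouts."""
    total = 0.0
    for _ in range(n_rollouts):
        total += rollout_evaluation(start_state, weights)
    return total / n_rollouts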
Example #2
def rollout_evaluation_1step(action_list, nIter, next_state_vopt, t, y, vy,
                             vw):
    """
    Assume that start_state is always among the valid discrete states
    nIter is the number of iterations we run for each action
    Also input next_state_vopt numpy array
    Take the action
    Return estimate of (reward + v_opt)
    Return pi_opt
    """
    # Initialize Q_opt_average as a numpy array
    Q_opt_average = np.zeros(len(action_list), dtype='float') + 1e-6
    start_state = [t, y, vy, vw]
    # Loop over all actions
    for action in action_list:

        # Initialize sum of Qopt
        Q_opt_sum = 0.0

        # Run the simulation nIter number of times
        for iteration in range(nIter):

            # Initialize the simulator from the discrete start_state
            sim = AirplaneSimulator(dim=1, init_discrete_state=start_state)

            # Randomize the state
            sim.randomize_state_motion_y()

            # Take the action and get Qopt(s,a)
            next_state, reward = sim.controller_motion_y(action)

            # From next_state, create the look-up key
            key = (next_state[1], next_state[2], next_state[3])
            Q_opt_sum += reward + next_state_vopt[key]

        # Get average Qopt
        Q_opt_average[action] = Q_opt_sum / nIter

    # Extract v_opt and pi_opt
    pi_opt, v_opt = max(enumerate(Q_opt_average), key=lambda tups: tups[1])

    # Return pi_opt and v_opt
    return (pi_opt, v_opt)
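
A minimal usage sketch for rollout_evaluation_1step: it assumes next_state_vopt is a numpy array indexed by the discretized (y, vy, vw) bins, as the docstring states. The action indices, bin counts, and start-state values below are illustrative placeholders, not values from the original project.

import numpy as np

# Illustrative placeholders (assumed, not from the original project)
action_list = [0, 1, 2]                    # assumed discrete action indices
next_state_vopt = np.zeros((50, 50, 50))   # assumed (y, vy, vw) bin counts

# One-step lookahead from an assumed discrete start state (t, y, vy, vw)
pi_opt, v_opt = rollout_evaluation_1step(action_list, nIter=20,
                                         next_state_vopt=next_state_vopt,
                                         t=10, y=25, vy=25, vw=0)
print("pi_opt =", pi_opt, ", v_opt =", v_opt)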
Example #3
        # Print episode number
        print("Episode number : ", epi)

        # Generate a random initial discrete state with t = max_t
        t = max_t
        # y = random.choice([i for i in range(Const.BINS_Y)])
        # vy = random.choice([i for i in range(Const.BINS_VY)])
        # vw = random.choice([i for i in range(Const.BINS_VW)])
        y = random.choice([i for i in range(12, 38)])
        vy = random.choice([i for i in range(12, 38)])
        vw = 0

        # Initialize the simulator from discrete state
        # Initialize variables t_list and y_list
        sim = AirplaneSimulator(dim=1, init_discrete_state=[t, y, vy, vw])
        t_list = []
        y_list = []

        # Randomize the initial state
        sim.randomize_state_motion_y()
        print "Start state = ", [t, y, vy, vw]

        # Follow the episode
        while not sim.is_end_state(sim.state):

            # Store the trajectory into variables t_list and y_list
            t_list.append(sim.state[0])
            y_list.append(2 * sim.state[1] /
                          (Const.Y_MAX_RUNWAY - Const.Y_MIN_RUNWAY))
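
The excerpt ends inside the episode loop; t_list and y_list accumulate the trajectory of the episode. One plausible use, once the loop finishes, is to plot that trajectory. A minimal sketch, assuming matplotlib is available (the labels are illustrative, not from the original code):

import matplotlib.pyplot as plt

# Plot the normalized vertical position against time for this episode
plt.plot(t_list, y_list)
plt.xlabel("t")
plt.ylabel("y (normalized by runway extent)")
plt.title("Trajectory for episode {}".format(epi))
plt.show()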
Example #4
    weights["vw"] = 1.0
    weights["constant"] = 1.0

    # Return the weights vector
    return weights


def cross_entropy(weights, num_samples, num_elite_samples):
    """
    This is the cross entropy method, performs one iteration
    It takes an initial weights vector and updates the weights vector
    num_samples : rollout is performed these many times
    num_elite_samples : number of highest rating samples, MLE fit is done using these
    A normal distribution is fit over each parameter independently
    """


# Initialize the simulator
t = 10
y = 50
vy = 50
vw = 20
sim = AirplaneSimulator(dim=1, init_discrete_state=[t, y, vy, vw])

start_state = [t, y, vy, vw]
weights = extract_features(start_state, sim)
for key in weights:
    weights[key] = random.uniform(0, 1)

r = rollout_evaluation(start_state, weights)
print "reward = ", r