def rollout_evaluation(start_state, weights):
    """
    For a given weights vector, perform a single rollout.
    Return the total reward.
    """
    # Initialize the simulator from the discrete start_state
    sim = AirplaneSimulator(dim=1, init_discrete_state=start_state)

    ###########################################################################
    # Perform roll out
    # Initialization
    total_reward = 0.0

    # Get initial action
    features = extract_features(start_state, sim)
    action = generate_policy_action(features, weights, sim)

    # Loop until an end state is reached
    while not sim.is_end_state(sim.state):
        print sim.discrete_state
        next_state, reward = sim.controller_motion_y(action)
        total_reward += reward

        # Generate features and action for the next state
        features = extract_features(next_state, sim)
        action = generate_policy_action(features, weights, sim)
    #--------------------------------------------------------------------------
    # Return total_reward
    return total_reward
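
# A minimal usage sketch, not part of the original module: a single rollout is
# noisy, so the score of a weights vector can be estimated by averaging
# rollout_evaluation over several rollouts. The helper name
# average_rollout_reward and the n_rollouts parameter are illustrative
# assumptions only.
def average_rollout_reward(start_state, weights, n_rollouts=5):
    """Average the total reward of rollout_evaluation over n_rollouts runs."""
    total = 0.0
    for _ in range(n_rollouts):
        total += rollout_evaluation(start_state, weights)
    return total / n_rollouts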
def rollout_evaluation_1step(action_list, nIter, next_state_vopt, t, y, vy, vw):
    """
    Assume that start_state is always among the valid discrete states.
    nIter is the number of iterations we run for each action.
    Also takes the next_state_vopt numpy array as input.
    For each action, take the action and estimate (reward + v_opt).
    Return (pi_opt, v_opt).
    """
    # Initialize Q_opt_average as a numpy array
    Q_opt_average = np.zeros(len(action_list), dtype='float') + 1e-6
    start_state = [t, y, vy, vw]

    # Loop over all actions
    for action in action_list:
        # Initialize sum of Q_opt
        Q_opt_sum = 0.0

        # Run the simulation nIter number of times
        for iteration in range(nIter):
            # Initialize the simulator from the discrete start_state
            sim = AirplaneSimulator(dim=1, init_discrete_state=start_state)

            # Randomize the state
            sim.randomize_state_motion_y()

            # Take the action and get Q_opt(s, a)
            next_state, reward = sim.controller_motion_y(action)

            # From next_state create the look-up key
            key = (next_state[1], next_state[2], next_state[3])
            Q_opt_sum += reward + next_state_vopt[key]

        # Get average Q_opt
        Q_opt_average[action] = Q_opt_sum / nIter

    # Extract v_opt and pi_opt
    pi_opt, v_opt = max(enumerate(Q_opt_average), key=lambda tups: tups[1])

    # Return pi_opt and v_opt
    return (pi_opt, v_opt)
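
# A minimal usage sketch, not from the original code: given a value table for
# time step t + 1, rollout_evaluation_1step can back up one time slice of a
# value-iteration table over the discrete (y, vy, vw) grid. The helper name
# backup_time_slice and the bins_y / bins_vy / bins_vw parameters are
# illustrative assumptions only.
def backup_time_slice(action_list, nIter, next_state_vopt, t, bins_y, bins_vy, bins_vw):
    """Return (v_opt, pi_opt) arrays for every discrete (y, vy, vw) at time t."""
    v_opt = np.zeros((bins_y, bins_vy, bins_vw))
    pi_opt = np.zeros((bins_y, bins_vy, bins_vw), dtype='int')
    for y in range(bins_y):
        for vy in range(bins_vy):
            for vw in range(bins_vw):
                best_action, best_value = rollout_evaluation_1step(
                    action_list, nIter, next_state_vopt, t, y, vy, vw)
                pi_opt[y, vy, vw] = best_action
                v_opt[y, vy, vw] = best_value
    return v_opt, pi_opt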
    # Print episode number
    print "Episode number : ", epi

    # Generate a random initial discrete state with t = max_t
    t = max_t
    # y = random.choice([i for i in range(Const.BINS_Y)])
    # vy = random.choice([i for i in range(Const.BINS_VY)])
    # vw = random.choice([i for i in range(Const.BINS_VW)])
    y = random.choice([i for i in range(12, 38)])
    vy = random.choice([i for i in range(12, 38)])
    vw = 0

    # Initialize the simulator from the discrete state
    # Initialize variables t_list and y_list
    sim = AirplaneSimulator(dim=1, init_discrete_state=[t, y, vy, vw])
    t_list = []
    y_list = []

    # Randomize the initial state
    sim.randomize_state_motion_y()
    print "Start state = ", [t, y, vy, vw]

    # Follow the episode
    while not sim.is_end_state(sim.state):
        # Store the trajectory into variables t_list and y_list
        t_list.append(sim.state[0])
        y_list.append(2 * sim.state[1] / (Const.Y_MAX_RUNWAY - Const.Y_MIN_RUNWAY))
    weights["vw"] = 1.0
    weights["constant"] = 1.0

    # Return the weights vector
    return weights

def cross_entropy(weights, num_samples, num_elite_samples):
    """
    This is the cross-entropy method; it performs one iteration.
    It takes an initial weights vector and updates the weights vector.
    num_samples : the rollout is performed this many times
    num_elite_samples : number of highest-scoring samples; the MLE fit uses these
    A normal distribution is fit over each parameter independently.
    """
    # Initialize the simulator
    t = 10
    y = 50
    vy = 50
    vw = 20
    sim = AirplaneSimulator(dim=1, init_discrete_state=[t, y, vy, vw])
    start_state = [t, y, vy, vw]

    # Randomize the initial weights over the extracted feature keys
    weights = extract_features(start_state, sim)
    for key in weights:
        weights[key] = random.uniform(0, 1)

    r = rollout_evaluation(start_state, weights)
    print "reward = ", r
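    # ---- Hedged sketch, not the original implementation ---------------------
    # The sampling-and-refit step described in the docstring could look like the
    # sketch below: draw num_samples perturbed weight vectors, score each with
    # rollout_evaluation, keep the num_elite_samples best, and fit an
    # independent normal distribution to each parameter of the elite set. The
    # names scored_samples, elite, elite_mean, elite_std and the fixed 0.5
    # standard deviation are illustrative assumptions only; elite_mean would
    # serve as the updated weights vector.
    scored_samples = []
    for sample in range(num_samples):
        sampled_weights = dict(weights)
        for key in sampled_weights:
            sampled_weights[key] = random.gauss(weights[key], 0.5)
        score = rollout_evaluation(start_state, sampled_weights)
        scored_samples.append((score, sampled_weights))

    # Keep the elite samples with the highest total reward
    scored_samples.sort(key=lambda pair: pair[0], reverse=True)
    elite = [w for (score, w) in scored_samples[:num_elite_samples]]

    # MLE fit of an independent normal over each parameter of the elite set
    elite_mean = {key: np.mean([w[key] for w in elite]) for key in weights}
    elite_std = {key: np.std([w[key] for w in elite]) for key in weights}
    # --------------------------------------------------------------------------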