Example No. 1
    def setUp(self):
        grid = [['0', '0', '0', '0', '10'], ['0', 'x', '0', '0', '-10'],
                ['0', '0', '0', '0', '0']]

        self.grid = grid
        self.gw_deterministic = gridworld.GridWorld(grid, {(0, 4), (1, 4)}, 1)
        self.gw_non_deterministic = gridworld.GridWorld(
            grid, {(0, 4), (1, 4)}, 0.8)
Example No. 2
def main():
    # Create environment
    env = gridworld.GridWorld(hard_version=False)

    # Initialize simulation
    s = env.reset()

    # Create log to store data from simulation
    log = {
        't': [0],
        's': [s],
        'a': [],
        'r': [],
    }

    # Simulate until episode is done
    done = False
    while not done:
        a = random.randrange(4)
        (s, r, done) = env.step(a)
        log['t'].append(log['t'][-1] + 1)
        log['s'].append(s)
        log['a'].append(a)
        log['r'].append(r)

    # Plot data and save to png file
    plt.plot(log['t'], log['s'])
    plt.plot(log['t'][:-1], log['a'])
    plt.plot(log['t'][:-1], log['r'])
    plt.legend(['s', 'a', 'r'])
    plt.savefig('results_gridworld.png')
Example No. 3
def test_probabilities_gridworld(size=5):
    """
    Check transition-probabilities for GridWorld

    Args:
        size: The size of the world to be used for testing.
    """
    check_zero_probabilities(gridworld.GridWorld(size))
Example No. 4
    def setUp(self):
        grid = [['0', '0', '0', '1'], ['0', 'x', '0', '-1'],
                ['0', '0', '0', '0']]

        self.grid = grid
        self.gw_non_deterministic = gridworld.GridWorld(
            grid, {(0, 3), (1, 3)}, 0.8)

        self.agent = value_iteration.ValueIterationAgent(
            self.gw_non_deterministic, 0.9, 100)
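Example No. 5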
def generate_random_grid(base, num_event_cells, period_range, bound, mode='linear', stack=True, event_region=None):
    min_period, max_period = period_range
    free_spaces = np.argwhere(base == 0) if event_region is None else event_region
    cells = []
    for n in range(num_event_cells):
        obj = gridworld.Object(x=free_spaces[n, 1], y=free_spaces[n, 0], period=random.randint(min_period, max_period),
                               bound=bound)
        cells.append(obj)

    gw = gridworld.GridWorld(base, cells, person=None, viewable_distance=0, mode=mode, stack=stack)
    return gw
Example No. 6
    def __init__(self,
                 env_id=0,
                 is_default=True,
                 grid_size=[10, 10],
                 state_size=[16, 16]):

        # Initialize gridworld environment
        self.gridworld = gw.GridWorld(size=grid_size, default=is_default)

        # Initialize gridworld matrix
        self.gridmatrix = self.gridworld.CreateGridWorld()

        # Initialize initial energy
        self.initial_energy = 20.0

        # Initialize control position
        self.control_position = self.gridworld.GetStartPoint()

        # Initialize agent state
        self.agent_state = gw.AgentState(self.control_position[0],
                                         self.control_position[1])

        # Initialize state size
        self.state_size = state_size

        # Initialize state generator
        self.stategenerator = StateGenerator(state_size=state_size,
                                             grid_size=grid_size)

        # Initialize field state size for get_state
        self.fstate_size = [5, 5]

        # initialize previous parameters
        self.prev_grid_arr = self.gridworld.GetCurrentMatrix()
        self.prev_position = self.agent_state.GetCurrentPosition()

        # Initialize reward function
        self.rewardfunction = RewardFunction(pos_max=15, neg_min=-25)
        self.rewardfunction.set_delta_s(env_delta_s=self.gridworld.GetDeltaS())

        # Get gridworld endpoint
        self.endpoint = self.gridworld.GetEndPoint()

        # Initialize episode step count
        self.step_count = 0

        # Infinite resource environment parameters
        self.inf_resource = False
        self.p_terminate = -10
        self.max_steps = 200

        # Get instance id
        self.env_id = env_id
Example No. 7
def main():
    # Create environment
    env = gridworld.GridWorld(hard_version=False)

    # Initialize simulation
    s = env.reset()

    # Create log to store data from simulation
    log = {
        't': [0],
        's': [s],
        'a': [],
        'r': [],
    }

    pi = np.ones((25, 4)) / 4
    val_iter = ValueIteration(env, pi, 0.001, 0.95)

    # go through value iteration to find optimal policy
    val_iter.iterate()
    pi, v = val_iter.get_policy()

    # Simulate until episode is done
    done = False
    while not done:
        a = np.argmax(pi[s])
        (s, r, done) = env.step(a)

        log['t'].append(log['t'][-1] + 1)
        log['s'].append(s)
        log['a'].append(a)
        log['r'].append(r)

    # plot trajectory
    plt.plot(log['t'], log['s'])
    plt.plot(log['t'][:-1], log['a'])
    plt.plot(log['t'][:-1], log['r'])
    plt.legend(['s', 'a', 'r'])
    plt.title('Value Iteration Trajectory')
    plt.savefig('val_iter_gridworld.png')

    # plot learning curve
    plt.figure()
    plt.plot(np.arange(val_iter.get_steps()), val_iter.get_means())
    plt.title('Value Iteration Learning Curve')
    plt.savefig('val_iter_means.png')

    # visualize policy
    plt.figure()
    plt.pcolor(pi)
    plt.title('Value Iteration Policy')
    plt.savefig('val_iter_policy.png')
Example No. 8
def get_fidelity(height, model):
    input_size = height * 5 * 4
    grid = g.GridWorld(height)
    count = height * 5 - (height - 2)
    fidelity = 0
    for i in range(height):
        for j in range(5):
            # If not wall
            if not grid.state[(i, j)][2]:
                grid.place_player((i, j))
                q_value = model.predict(grid.state.reshape(1, input_size),
                                        batch_size=1)
                action = (np.argmax(q_value))
                fidelity += grid.check_optimal_policy((i, j), action)
    print(fidelity / count)
    return fidelity / count
Example No. 9
def main():

    #Initialize Gridworld Environment
    env = gridworld.GridWorld()

    #Initialize REINFORCE algorithm
    reinforce = REINFORCE(args, env)

    #run reinforce algorithm
    for episode in range(args.episodes):

        print('Episode ' + str(episode+1) + '/' + str(args.episodes))
        reinforce.train()

    #save results once training is finished
    reinforce.save_model()
Example No. 10
def test_training(model, height=3, num_of_steps=10):
    grid = g.GridWorld(height)
    input_size = height * 5 * 4
    total_reward = 0
    print("Initial State:")
    print(grid.display_grid())
    # while game still in progress
    for i in range(num_of_steps):
        q_value = model.predict(grid.state.reshape(1, input_size),
                                batch_size=1)
        # take action with highest Q-value
        action = (np.argmax(q_value))
        print('Move #: %s; Taking action: %s' % (i, action))
        grid.agent_move(action)
        grid.display_grid()
        reward = grid.get_reward()
        total_reward += reward
    print("Max steps reached, total reward: {}".format(total_reward))
Example No. 11
def generate_random_grid(base,
                         num_event_cells,
                         period_range,
                         bound,
                         mode='linear',
                         stack=True,
                         event_region=None,
                         extra_event_region=[]):
    min_period, max_period = period_range
    free_spaces = np.argwhere(
        base == 0) if event_region is None else event_region
    np.random.shuffle(free_spaces)
    cells = []
    for n in range(num_event_cells):
        obj = gridworld.Object(x=free_spaces[n, 1],
                               y=free_spaces[n, 0],
                               period=random.randint(min_period, max_period),
                               bound=bound)
        cells.append(obj)

    pos = (free_spaces[num_event_cells, 1], free_spaces[num_event_cells, 0])
    person = None
    if mode == "person":
        person = gridworld.Person(
            (free_spaces[num_event_cells, 1], free_spaces[num_event_cells, 0]))
        cells = [
            gridworld.Object(x=free_spaces[n, 1],
                             y=free_spaces[n, 0],
                             period=random.randint(min_period, max_period),
                             bound=bound) for n in range(len(free_spaces))
        ]

    gw = gridworld.GridWorld(base,
                             cells,
                             person=person,
                             initialpos=pos,
                             viewable_distance=0,
                             mode=mode,
                             stack=stack,
                             extra_event_region=extra_event_region)
    return gw
Example No. 12
  raise NotImplementedError()

  # You shouldn't need to touch this part.
  c = cvx.matrix(c)
  G = cvx.matrix(G)
  h = cvx.matrix(h)
  sol = cvx.solvers.lp(c, G, h)

  R = np.asarray(sol["x"][:nS]).squeeze()

  return R


if __name__ == "__main__":

  env = gridworld.GridWorld(map_name='8x8')

  # Generate policy from Q3.2.1
  gamma = 0.9
  Vs, n_iter = rl.value_iteration(env, gamma)
  policy = rl.policy_from_value_function(env, Vs, gamma)

  T = env.generateTransitionMatrices()

  # Q3.3.5
  # Set R_max and l1 as you want.
  R = irl_lp(policy, T, gamma, R_max, l1)

  # You can test out your R by re-running VI with your new rewards as follows:
  # env_irl = gridworld.GridWorld(map_name='8x8', R=R)
  # Vs_irl, n_iter_irl = rl.value_iteration(env_irl, gamma)
Example No. 13
def main():

    #Argument to initialize grid world
    if args.gridworld:

        #create environment
        env = gridworld.GridWorld(hard_version=False)

        #initializations
        P = env.p  # state transition probability
        R = env.r  # reward function
        V = np.zeros(env.num_states)  #state value function
        policy = np.zeros(env.num_states)
        Q = np.zeros(
            (env.num_states, env.num_actions))  #state action value function

        #Argument to run Value Iteration
        if args.value_iteration:

            V_optimal, policy_optimal, mean_VF_list, iterations = ValueIteration(
                args, P, R, V, policy, env)

            data = [V_optimal, policy_optimal, mean_VF_list, iterations]

            #save data to pickle for deliverable generation
            if not os.path.isdir('./gridworld_data/'):
                os.makedirs('./gridworld_data/')
            filename = 'value_iteration_data.pkl'
            with open('gridworld_data/' + filename, 'wb') as f:
                pickle.dump(data, f)

        #Argument to run Policy Iteration
        if args.policy_iteration:

            V_optimal, policy_optimal, mean_VF_list, iterations = PolicyIteration(
                args, P, R, V, policy, env)

            data = [V_optimal, policy_optimal, mean_VF_list, iterations]

            #save data to pickle for deliverable generation
            if not os.path.isdir('./gridworld_data/'):
                os.makedirs('./gridworld_data/')
            filename = 'policy_iteration_data.pkl'
            with open('gridworld_data/' + filename, 'wb') as f:
                pickle.dump(data, f)

        #Argument to run SARSA
        if args.SARSA:

            Q_optimal, policy_optimal, discounted_rewards = SARSA(args, Q, env)

            V_estimate = TD_Zero(args, V, policy_optimal, env)

            data = [Q_optimal, policy_optimal, discounted_rewards, V_estimate]

            #save data to pickle for deliverable generation
            if not os.path.isdir('./gridworld_data/'):
                os.makedirs('./gridworld_data/')
            filename = 'SARSA_data_alpha=' + str(
                args.alpha) + '_epsilon=' + str(args.epsilon) + '.pkl'
            with open('gridworld_data/' + filename, 'wb') as f:
                pickle.dump(data, f)

        #Argument to run Q learning
        if args.q_learning:

            Q_optimal, policy_optimal, discounted_rewards = Q_learning(
                args, Q, env)

            V_estimate = TD_Zero(args, V, policy_optimal, env)

            data = [Q_optimal, policy_optimal, discounted_rewards, V_estimate]

            #save data to pickle for deliverable generation
            if not os.path.isdir('./gridworld_data/'):
                os.makedirs('./gridworld_data/')
            filename = 'Q_Learning_data_alpha=' + str(
                args.alpha) + '_epsilon=' + str(args.epsilon) + '.pkl'
            with open('gridworld_data/' + filename, 'wb') as f:
                pickle.dump(data, f)

    #Argument to initialize pendulum environment
    if args.pendulum:

        env = discrete_pendulum.Pendulum()

        #Initializations
        Q = np.zeros((env.num_states, env.num_actions))
        V = np.zeros(env.num_states)  #state value function

        #Argument to run SARSA
        if args.SARSA:

            Q_optimal, policy_optimal, discounted_rewards = SARSA(args, Q, env)

            V_estimate = TD_Zero(args, V, policy_optimal, env)

            data = [Q_optimal, policy_optimal, discounted_rewards, V_estimate]

            #save data to pickle for deliverable generation
            if not os.path.isdir('./pendulum_data/'):
                os.makedirs('./pendulum_data/')
            filename = 'SARSA_data_alpha=' + str(
                args.alpha) + '_epsilon=' + str(args.epsilon) + '.pkl'
            with open('pendulum_data/' + filename, 'wb') as f:
                pickle.dump(data, f)

        #Argument to run Q learning
        if args.q_learning:

            Q_optimal, policy_optimal, discounted_rewards = Q_learning(
                args, Q, env)

            V_estimate = TD_Zero(args, V, policy_optimal, env)

            data = [Q_optimal, policy_optimal, discounted_rewards, V_estimate]

            #save data to pickle for deliverable generation
            if not os.path.isdir('./pendulum_data/'):
                os.makedirs('./pendulum_data/')
            filename = 'Q_Learning_data_alpha=' + str(
                args.alpha) + '_epsilon=' + str(args.epsilon) + '.pkl'
            with open('pendulum_data/' + filename, 'wb') as f:
                pickle.dump(data, f)
Example No. 14
             new_action):
    # Fill in this function
    return Q_table


# -------------------- #
#   Create the Task    #
# -------------------- #
# Task Parameters
task_name = short_hallway
action_error_prob = .1
pit_reward = -500
task = gridworld.GridWorld(task_name,
                           action_error_prob=action_error_prob,
                           rewards={
                               '*': 50,
                               'moved': -1,
                               'hit-wall': -1,
                               'X': pit_reward
                           })
task.get_max_reward()

# ---------------- #
#   Run the Task   #
# ---------------- #
# Algorithm Parameters
alpha = .5
epsilon = .1
gamma = .99
state_count = task.num_states
action_count = task.num_actions
episode_count = 250
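
The snippet above leaves the Q-update as a placeholder ("Fill in this function"). A minimal tabular one-step SARSA update, matching the call made in Example No. 22 (update_Q_SARSA(Q_table, alpha, gamma, state, action, reward, new_state, new_action)), could look like the sketch below; it is the textbook rule, not the original assignment's solution, and assumes Q_table is a NumPy array indexed as Q_table[state, action]:

def update_Q_SARSA(Q_table, alpha, gamma, state, action, reward, new_state,
                   new_action):
    # One-step SARSA: move Q(s, a) toward the immediate reward plus the
    # discounted value of the action actually taken in the next state.
    td_target = reward + gamma * Q_table[new_state, new_action]
    Q_table[state, action] += alpha * (td_target - Q_table[state, action])
    return Q_table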
Example No. 15
def run_qlearning():
    gw = gridworld.GridWorld()
    plt = plot.Plot()

    qlearn.q_learning(gw, plt)
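Example No. 16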
def run_experiment(size):
    print "size", size
    rows = size
    cols = size

    #reward = [[0,0,0,-1,0],[0,-1,0,-1,0],[1,-1,0,0,0]] #true expert reward
    reward = np.reshape([np.random.randint(-10,10) for _ in range(rows*cols)],(rows,cols)) #true expert reward
    terminals = [] #no terminals, you can change this if you want
    gamma = 0.9 #discount factor for mdp
    grid = gridworld.GridWorld(reward, terminals, gamma) #create grid world
    #print "expert reward"
    #util.print_reward(grid)
    pi_star, V_star = mdp_solver.policy_iteration(grid) #solve for expert policy
    #print pi_star
    #print "expert policy"
    #util.print_policy(grid, pi_star)
    #print "expert value function"
    #util.print_grid(grid, np.reshape(V_star, (grid.rows, grid.cols)))
    Q_star = mdp_solver.calc_qvals(grid, pi_star, V_star, gamma)
    #print "expert Q-values"
    #print Q_star

    #give optimal action in each (non-terminal) state as demonstration
    #we can test giving demonstrations in some but not all states, or even noisy demonstrations to see what happens if we want
    demo = [(state, np.argmax(Q_star[state,:])) for state in range(grid.num_states) if state not in terminals]
    #print "demonstration", demo


    ####### gradient descent starting from random guess at expert's reward
    reward_guess = np.reshape([np.random.randint(-10,10) for _ in range(grid.num_states)],(grid.rows,grid.cols))

    #create new mdp with reward_guess as reward
    mdp = gridworld.GridWorld(reward_guess, terminals, gamma) #create markov chain

    start = timeit.default_timer()
    num_steps = 10
    c = 0.5 #we should experiment with step sizes
    print "----- gradient descent ------"
    for step in range(num_steps):
        #calculate optimal policy for current estimate of reward
        pi, V = mdp_solver.policy_iteration(mdp)
        #print "new policy"
        #print pi_star
        #calculate Q values for current estimate of reward
        Q = mdp_solver.calc_qvals(mdp, pi, V, gamma)
        #print "new Qvals"
        #print log-likelihood
        #print "log-likelihood posterior", birl.demo_log_likelihood(demo, Q)

        step_size = c / np.sqrt(step + 1)
        #print "stepsize", step_size 
        #calculate gradient of posterior wrt reward
        grad = birl.calc_reward_gradient(demo, mdp, mdp.R, eta=1.0)
        #update reward
        R_new = mdp.R + step_size * grad
        #print "new reward"
        #print R_new
        #update mdp with new reward 
        mdp.set_reward(R_new)
    stop = timeit.default_timer()
    #print "recovered reward"
    #util.print_reward(mdp)
    pi, V = mdp_solver.policy_iteration(mdp)
    #print "resulting optimal policy"
    #util.print_policy(mdp, pi)
    print "policy difference"
    #print np.linalg.norm(pi_star - pi)
    runtime = stop - start
    print "runtime for size", size, "=", runtime
    f = open("../results/runtime_size" + str(size) + ".txt", "w")
    f.write(str(runtime))
    f.close()
Example No. 17
def test_probabilities_gridworld(size=5):
    """
    Check transition-probabilities for GridWorld
    """
    check_zero_probabilities(gridworld.GridWorld(size))
Example No. 18
plt.plot(avg_causality_importance, label='C + IS')
plt.plot(ADAM_avg_base, '--', label='ADAM: Base Model')
plt.plot(ADAM_avg_baseline_importance, '--', label='ADAM: BS + IS')
plt.plot(ADAM_avg_baseline_causality, '--', label='ADAM: BS + C')
plt.plot(ADAM_avg_baseline_causality_importance,
         '--',
         label='ADAM: BS + C + IS')
plt.plot(ADAM_avg_causality_importance, '', label='ADAM: C + IS')
plt.xlabel('Simulation Steps (10 Episodes/Step)')
plt.ylabel('Total Reward')
plt.title('Learning Curve Comparison (SGD vs ADAM): 10,000 Episodes')
plt.legend(bbox_to_anchor=(.975, 1.0), loc='upper left')
plt.savefig('./generated_results/SGD_ADAM_learning_curve_10000_episodes.png')
plt.show()

env = gridworld.GridWorld()

weights = base1_weights.detach().numpy()
policy = []
for s in range(env.num_states):
    policy.append(np.argmax(weights[s]))

policy = np.reshape(np.asarray(policy), (5, 5))

grid_x = [-0.5, 0.5, 1.5, 2.5, 3.5, 4.5]
grid_y = [-0.5, 0.5, 1.5, 2.5, 3.5, 4.5]

fig = plt.figure()
plt.imshow(policy, cmap='coolwarm')
plt.colorbar()
plt.xticks(grid_x)
Example No. 19
    def __init__(self, width, height, **kwargs):
        self._displayer = gridworld_displayer.PyGameDisplayer(width, height)
        self._gridworld = gridworld.GridWorld(width, height, **kwargs)
        self.prev_state = None
        self.state = self._gridworld.get_state()
Example No. 20
        yield l[idx:idx + n]


############ end of helper functions ########### main starts here ##############

if __name__ == '__main__':
    import gridworld
    from value_iteration import ValueIteration

    #setting env
    X, Y = 5, 5
    #setting reward
    grid_shape = [X, Y]
    reward = np.full(np.prod(grid_shape), 0.0)
    #setting expert
    env = gridworld.GridWorld(grid_shape, reward)
    gamma = 0.9
    #0.99,0.95,0.90,0.85,0.80
    #traj =[[20, 21, 22, 23, 24, 19, 14, 9, 4], [20, 15, 10, 11, 6, 7, 2, 3, 4], [20, 15, 10, 5, 6, 7, 2, 3, 4], [20, 15, 10, 11, 6, 7, 2, 3, 4], [20, 21, 22, 23, 24, 19, 14, 9, 4], [20, 21, 16, 11, 12, 13, 8, 3, 4], [20, 15, 16, 17, 12, 7, 2, 3, 4], [20, 15, 10, 5, 6, 7, 8, 3, 4], [20, 15, 16, 11, 6, 7, 8, 9, 4], [20, 21, 16, 11, 6, 7, 8, 9, 4], [20, 15, 16, 17, 12, 13, 14, 9, 4], [20, 21, 22, 17, 12, 7, 8, 3, 4], [20, 15, 16, 17, 12, 13, 8, 3, 4], [20, 21, 16, 11, 6, 1, 2, 3, 4], [20, 21, 16, 17, 12, 7, 2, 3, 4], [20, 15, 10, 5, 6, 7, 2, 3, 4], [20, 15, 16, 11, 6, 7, 8, 3, 4], [20, 15, 16, 11, 6, 7, 2, 3, 4], [20, 21, 16, 11, 12, 7, 2, 3, 4], [20, 15, 10, 5, 0, 1, 2, 3, 4], [20, 21, 22, 17, 18, 13, 14, 9, 4], [20, 15, 10, 5, 6, 1, 2, 3, 4], [20, 21, 22, 17, 18, 19, 14, 9, 4], [20, 21, 22, 17, 12, 7, 2, 3, 4], [20, 15, 10, 11, 12, 7, 2, 3, 4], [20, 15, 16, 11, 6, 7, 8, 3, 4], [20, 21, 22, 23, 24, 19, 14, 9, 4], [20, 21, 16, 17, 18, 19, 14, 9, 4], [20, 15, 10, 11, 12, 7, 2, 3, 4], [20, 21, 22, 23, 18, 13, 14, 9, 4], [20, 21, 16, 17, 18, 19, 14, 9, 4], [20, 15, 16, 17, 12, 13, 8, 3, 4], [20, 15, 10, 5, 0, 1, 2, 3, 4], [20, 15, 16, 11, 6, 7, 8, 9, 4], [20, 15, 10, 11, 12, 13, 8, 9, 4], [20, 15, 16, 17, 18, 13, 14, 9, 4], [20, 21, 22, 17, 12, 13, 8, 3, 4], [20, 15, 16, 17, 18, 13, 8, 3, 4], [20, 21, 22, 17, 12, 7, 8, 3, 4], [20, 21, 22, 23, 18, 19, 14, 9, 4], [20, 15, 16, 17, 12, 13, 8, 3, 4], [20, 21, 16, 11, 6, 1, 2, 3, 4], [20, 15, 16, 11, 12, 13, 8, 9, 4], [20, 15, 10, 5, 0, 1, 2, 3, 4], [20, 21, 16, 11, 12, 13, 8, 3, 4], [20, 21, 22, 17, 18, 13, 14, 9, 4], [20, 15, 10, 11, 6, 7, 2, 3, 4], [20, 21, 22, 17, 18, 19, 14, 9, 4], [20, 15, 16, 11, 12, 13, 8, 3, 4], [20, 21, 22, 17, 12, 7, 2, 3, 4], [20, 21, 22, 17, 18, 19, 14, 9, 4], [20, 21, 16, 11, 6, 1, 2, 3, 4], [20, 21, 22, 17, 12, 13, 8, 3, 4], [20, 15, 16, 17, 18, 13, 8, 9, 4], [20, 15, 16, 11, 6, 7, 2, 3, 4], [20, 15, 10, 11, 12, 7, 8, 3, 4], [20, 15, 10, 5, 0, 1, 2, 3, 4], [20, 21, 16, 11, 12, 13, 8, 3, 4], [20, 15, 10, 11, 6, 1, 2, 3, 4], [20, 15, 10, 5, 6, 7, 8, 3, 4], [20, 21, 16, 11, 6, 7, 2, 3, 4], [20, 15, 10, 11, 6, 7, 8, 9, 4], [20, 21, 22, 23, 18, 19, 14, 9, 4], [20, 15, 10, 5, 0, 1, 2, 3, 4], [20, 21, 16, 17, 18, 19, 14, 9, 4], [20, 21, 16, 17, 12, 7, 2, 3, 4], [20, 21, 22, 23, 18, 13, 8, 3, 4], [20, 21, 16, 11, 6, 1, 2, 3, 4], [20, 15, 16, 17, 12, 13, 14, 9, 4], [20, 15, 16, 17, 18, 19, 14, 9, 4], [20, 21, 22, 17, 12, 13, 14, 9, 4], [20, 15, 10, 5, 0, 1, 2, 3, 4], [20, 15, 10, 11, 6, 7, 8, 9, 4], [20, 21, 22, 23, 18, 13, 8, 9, 4], [20, 21, 16, 17, 12, 13, 14, 9, 4], [20, 21, 16, 17, 18, 19, 14, 9, 4], [20, 15, 10, 11, 6, 7, 8, 3, 4], [20, 21, 16, 17, 18, 19, 14, 9, 4], [20, 21, 22, 17, 18, 13, 8, 9, 4], [20, 21, 16, 17, 12, 7, 2, 3, 4], [20, 15, 10, 11, 12, 13, 14, 9, 4], [20, 15, 16, 17, 18, 19, 14, 9, 4], [20, 21, 16, 17, 12, 7, 8, 9, 4], [20, 15, 16, 17, 18, 13, 8, 3, 4], [20, 21, 16, 11, 12, 7, 2, 3, 4], [20, 21, 16, 11, 12, 13, 8, 3, 4], [20, 15, 16, 11, 6, 7, 2, 3, 4], [20, 21, 22, 17, 18, 19, 14, 9, 4], [20, 21, 16, 11, 6, 7, 2, 3, 4], [20, 21, 22, 17, 12, 13, 8, 9, 4], [20, 21, 22, 17, 12, 7, 8, 9, 4], [20, 21, 22, 17, 18, 13, 8, 3, 4], [20, 21, 16, 11, 12, 7, 8, 9, 4], [20, 21, 22, 17, 12, 13, 14, 9, 4], [20, 15, 16, 17, 18, 19, 14, 9, 4], [20, 15, 16, 17, 12, 7, 2, 3, 4], [20, 15, 16, 11, 6, 1, 2, 3, 4], [20, 15, 16, 17, 18, 13, 8, 9, 4], [20, 21, 22, 17, 12, 7, 8, 9, 4], [20, 15, 10, 5, 6, 7, 2, 3, 4]]
    traj = [[20, 21, 22, 23, 18, 13, 8, 9, 4],
            [20, 21, 16, 17, 18, 13, 14, 9,
             4], [20, 15, 10, 11, 12, 7, 2, 3, 4],
            [20, 15, 16, 17, 12, 13, 8, 3, 4],
            [20, 21, 16, 11, 12, 7, 8, 3, 4], [20, 15, 16, 17, 12, 7, 8, 3, 4],
            [20, 21, 22, 23, 24, 19, 14, 9, 4], [20, 15, 10, 5, 6, 7, 2, 3, 4],
            [20, 21, 16, 11, 6, 7, 2, 3, 4], [20, 21, 22, 23, 18, 13, 8, 9, 4],
            [20, 21, 22, 23, 24, 19, 14, 9, 4],
            [20, 21, 16, 17, 18, 13, 14, 9, 4],
            [20, 15, 10, 11, 12, 13, 14, 9,
             4], [20, 21, 16, 11, 6, 1, 2, 3, 4],
            [20, 15, 16, 11, 12, 13, 14, 9, 4], [20, 15, 10, 5, 6, 1, 2, 3, 4],
Example No. 21
### HELPER CODE ####

### INITIALIZE GRID ###

# Create the grid for Problem 2.
grid = ['..,..', '..,..', 'o.,..', '.?,.*']

# Create the Task
# Task Parameters

task = gridworld.GridWorld(grid,
                           terminal_markers={'*', '?'},
                           rewards={
                               '.': -1,
                               '*': 50,
                               '?': 5,
                               ',': -50,
                               'o': -1
                           })

# Algorithm Parameters
gamma = .75
state_count = task.num_states
action_count = task.num_actions
row_count = len(grid)
col_count = len(grid[0])

# -------------- #
#   Make Plots   #
# -------------- #
Example No. 22
def execute_configuration(config=DEFAULT_CONFIG,
                          row_index=0,
                          column_index=0,
                          height=1,
                          width=1):
    task = gridworld.GridWorld(TASK_MAP[config['task_name']],
                               action_error_prob=config['action_error_prob'],
                               rewards={
                                   '*': 50,
                                   'moved': -1,
                                   'hit-wall': -1,
                                   'X': config['pit_reward']
                               })
    task.get_max_reward()

    # Loop over some number of episodes
    episode_reward_set = np.zeros(
        (config['rep_count'], config['episode_count']))
    for rep_iter in range(config['rep_count']):

        # Initialize the Q table
        Q_table = np.zeros((task.num_states, task.num_actions))

        # Loop until the episode is done
        for episode_iter in range(config['episode_count']):

            # Start the task
            task.reset()
            state = task.observe()
            action = policy(state, Q_table, task.num_actions,
                            config['epsilon'])
            episode_reward_list = []
            task_iter = 0

            # Loop until done -- check when do we get the final state reward?
            while True:
                task_iter = task_iter + 1
                new_state, reward = task.perform_action(action)
                new_action = policy(new_state, Q_table, task.num_actions,
                                    config['epsilon'])

                # Update the Q_table.
                if config['method'] == 'sarsa':
                    Q_table = update_Q_SARSA(Q_table, config['alpha'],
                                             config['gamma'], state, action,
                                             reward, new_state, new_action)
                elif config['method'] == 'qlearning':
                    Q_table = update_Q_Learning(Q_table, config['alpha'],
                                                config['gamma'], state, action,
                                                reward, new_state)
                else:
                    sys.exit(
                        "Unrecognized algorithm %s. Consider adding support?" %
                        config['method'])

                # store the data
                episode_reward_list.append(reward)

                # stop if at goal/else update for the next iteration
                if task.is_terminal(
                        state) or task_iter > config['episode_max_length']:
                    episode_reward_set[rep_iter, episode_iter] = np.sum(
                        episode_reward_list)
                    break
                else:
                    state = new_state
                    action = new_action

    add_plot(config, Q_table, episode_reward_set, row_index, column_index,
             width, height)
Example No. 23
    print(grid.display_grid())
    # while game still in progress
    for i in range(num_of_steps):
        q_value = model.predict(grid.state.reshape(1, input_size),
                                batch_size=1)
        # take action with highest Q-value
        action = (np.argmax(q_value))
        print('Move #: %s; Taking action: %s' % (i, action))
        grid.agent_move(action)
        grid.display_grid()
        reward = grid.get_reward()
        total_reward += reward
    print("Max steps reached, total reward: {}".format(total_reward))


if __name__ == "__main__":
    if len(sys.argv) > 1:
        height = int(sys.argv[1])
        env = g.GridWorld(height)
    else:
        height = 3
        env = g.GridWorld()

    num_of_steps = 14
    for index in range(5):
        model = model_init(height)
        f = training_easy(env, model, 3, height, num_of_steps)
        test_training(model)
        plt.plot(f[0], f[1])
        plt.show()
Example No. 24
### HELPER CODE ####

### INITIALIZE GRID ###

# Create the grid for Problem 2.
grid = ['..,..', '..,..', 'o.,..', '.?,.*']

# Create the Task
# Task Parameters
action_error_prob = .2

task = gridworld.GridWorld(grid,
                           action_error_prob=action_error_prob,
                           terminal_markers={'*', '?'},
                           rewards={
                               '.': -1,
                               '*': 50,
                               '?': 5,
                               ',': -50,
                               'o': -1
                           })

# Algorithm Parameters
gamma = .75
state_count = task.num_states
action_count = task.num_actions
row_count = len(grid)
col_count = len(grid[0])

# -------------- #
#   Make Plots   #
# -------------- #
Example No. 25
        for j in range(width):
            if (isInt):
                sys.stdout.write("%6s" %
                                 str('%d' % printArray[(i * height) + j]) +
                                 " ")
            else:
                sys.stdout.write("%6s" %
                                 str('%02.2f' % printArray[(i * height) + j]) +
                                 " ")
        sys.stdout.write("\n\n")
    sys.stdout.flush()


if __name__ == "__main__":

    env = gridworld.GridWorld(map_name='8x8')

    # Generate policy from Q3.2.1
    gamma = 0.9
    Vs, n_iter = rl.value_iteration(env, gamma)
    policy = rl.policy_from_value_function(env, Vs, gamma)

    T = env.generateTransitionMatrices()

    # Q3.3.5
    # Set R_max and l1 as you want.
    R_max = 1
    l1 = 0.5
    R = irl_lp(policy, T, gamma, R_max, l1)

    printGridWorld("IRL-generated Rewards", R, 8, 8, False)
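Example No. 26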
import numpy as np
import mdp_solver
import gridworld
import util
import birl

#gradient descent on reward
reward = [[0, 0, 0], [0, -1, 0], [1, -1, 0]]
terminals = [6]
gamma = 0.9
simple_world = gridworld.GridWorld(reward, terminals, gamma)
print "reward"
util.print_reward(simple_world)
pi_star, V_star = mdp_solver.policy_iteration(simple_world)
print "optimal policy"
util.print_policy(simple_world, pi_star)
Q_star = mdp_solver.calc_qvals(simple_world, pi_star, V_star, gamma)
print "q-vals"
print Q_star

#give optimal action in each state as demonstration
demo = [(state, np.argmax(Q_star[state, :]))
        for state in range(simple_world.num_states)]
print(demo)

#compute the gradient of R_guess
#TODO get an actual guess and update it towards real R
num_states = simple_world.num_states
num_actions = simple_world.num_actions

print "gradient"
Example No. 27
import numpy as np
import mdp_solver
import gridworld
import util
import birl_optimized as birl

##test script for running gradient descent for bayesian inverse reinforcement learning
##domain is a simple grid world (see gridworld.py)
##TODO I haven't incorporated a prior so this really is more of a maximum likelihood rather than bayesian irl algorithm

reward = [[0, 0, 0, -1, 0, 0, 0], [0, -1, 0, -1, 0, -1, 0],
          [0, -1, 0, -1, 0, -1, 0], [1, -1, 0, 0, 0, -1,
                                     0]]  #true expert reward
terminals = [21]  #no terminals, you can change this if you want
gamma = 0.95  #discount factor for mdp
grid = gridworld.GridWorld(reward, terminals, gamma)  #create grid world
print "expert reward"
util.print_reward(grid)
pi_star, V_star = mdp_solver.policy_iteration(grid)  #solve for expert policy
print pi_star
print "expert policy"
util.print_policy(grid, pi_star)
print "expert value function"
util.print_grid(grid, np.reshape(V_star, (grid.rows, grid.cols)))
Q_star = mdp_solver.calc_qvals(grid, pi_star, V_star, gamma)
print "expert Q-values"
print Q_star

#give optimal action in each (non-terminal) state as demonstration
#we can test giving demonstrations in some but not all states, or even noisy demonstrations to see what happens if we want
demo = [(state, np.argmax(Q_star[state, :]))
Example No. 28
    for j in range(X.shape[1]):
      if not text is None:
        v = text[int(X[i, j])]
      else:
        v = X[i, j]
        factor = 10.0 * dec
        v = math.trunc(v * factor) / factor
      ax.text(j, i, v, ha="center", va="center", color="w")
  
  plt.savefig(f"{title}.png")
  # plt.show()
  plt.close()

if __name__ == "__main__":
  mapname = "8x8"
  env = gridworld.GridWorld(map_name=mapname)
  gw, gh = int(mapname[0]), int(mapname[-1])

  # Play around with these values if you want!
  gamma = 0.9
  alpha = 0.05
  n = 4
  action_names = ['L', 'D', 'R', 'U']
  
  # Q3.2.1
  print(f"\n** q3.2.1 value iteration")
  V_vi, n_iter = value_iteration(env, gamma)
  plot(V_vi.reshape(gw, gh), title='value_iteration')
  print(f"value iteration converged after {n_iter} steps")
  policy = policy_from_value_function(env, V_vi, gamma)
  plot(policy.reshape(gw, gh), title='policy_from_value_iteration', text=action_names)