def main():

    env = GridWorld(3, 4)

    #Define the state matrix
    state_matrix = np.zeros((3,4))
    state_matrix[0, 3] = 1
    state_matrix[1, 3] = 1
    state_matrix[1, 1] = -1
    print("State Matrix:")
    print(state_matrix)

    #Define the reward matrix
    reward_matrix = np.full((3,4), -0.04)
    reward_matrix[0, 3] = 1
    reward_matrix[1, 3] = -1
    print("Reward Matrix:")
    print(reward_matrix)

    #Define the transition matrix
    transition_matrix = np.array([[0.8, 0.1, 0.0, 0.1],
                                  [0.1, 0.8, 0.1, 0.0],
                                  [0.0, 0.1, 0.8, 0.1],
                                  [0.1, 0.0, 0.1, 0.8]])

    state_action_matrix = np.random.random((4,12))
    print("State-Action Matrix:")
    print(state_action_matrix)

    env.setStateMatrix(state_matrix)
    env.setRewardMatrix(reward_matrix)
    env.setTransitionMatrix(transition_matrix)

    utility_matrix = np.zeros((3,4))
    print("Utility Matrix:")
    print(utility_matrix)

    gamma = 0.999
    alpha = 0.001 #constant step size
    beta_matrix = np.zeros((4,12))
    tot_epoch = 300000
    print_epoch = 1000

    for epoch in range(tot_epoch):
        #Reset and return the first observation
        observation = env.reset(exploring_starts=True)
        for step in range(1000):
            #Estimating the action through Softmax
            col = observation[1] + (observation[0]*4)
            action_array = state_action_matrix[:, col]
            action_distribution = softmax(action_array)
            action = np.random.choice(4, 1, p=action_distribution)
            #To enable the beta parameter, uncomment the line below
            #and pass beta_matrix=beta_matrix to the update_actor call
            #beta_matrix[action,col] += 1 #increment the counter
            #Move one step in the environment and get obs and reward
            new_observation, reward, done = env.step(action)
            utility_matrix, delta = update_critic(utility_matrix, observation, 
                                                  new_observation, reward, alpha, gamma)
            state_action_matrix = update_actor(state_action_matrix, observation, 
                                               action, delta, beta_matrix=None)
            observation = new_observation
            if done: break

        if(epoch % print_epoch == 0):
            print("")
            print("Utility matrix after " + str(epoch+1) + " iterations:") 
            print(utility_matrix)
            print("")
            print("State-Action matrix after " + str(epoch+1) + " iterations:") 
            print(state_action_matrix)
    #Time to check the utility matrix obtained
    print("Utility matrix after " + str(tot_epoch) + " iterations:")
    print(utility_matrix)
    print("State-Action matrix after " + str(tot_epoch) + " iterations:")
    print(state_action_matrix)
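
The actor-critic example above relies on softmax, update_critic, and update_actor, which are defined elsewhere. A minimal sketch of these helpers is given below, assuming a standard TD(0) critic and a preference-table actor nudged by the TD error; the signatures follow the calls above, but the bodies are illustrative rather than the original implementation.

import numpy as np

def softmax(x):
    """Numerically stable softmax over a 1D array of action preferences."""
    z = x - np.max(x)
    e = np.exp(z)
    return e / np.sum(e)

def update_critic(utility_matrix, observation, new_observation, reward, alpha, gamma):
    """TD(0) update of the state-value table; returns the table and the TD error delta."""
    u = utility_matrix[observation[0], observation[1]]
    u_t1 = utility_matrix[new_observation[0], new_observation[1]]
    delta = reward + gamma * u_t1 - u
    utility_matrix[observation[0], observation[1]] += alpha * delta
    return utility_matrix, delta

def update_actor(state_action_matrix, observation, action, delta, beta_matrix=None):
    """Move the preference of the taken action in the direction of the TD error.
    If a beta_matrix of visit counters is passed, 1/count is used as a decaying step size."""
    col = observation[1] + (observation[0] * 4)
    beta = 1.0 if beta_matrix is None else 1.0 / beta_matrix[action, col]
    state_action_matrix[action, col] += beta * delta
    return state_action_matrix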
def main():
    world = GridWorld()
    q_table = np.zeros([len(world.available_actions()), 7, 10])
    q_table = train(world, q_table)
    moves = evaluate(world, q_table)
    print 'Moves: ' + str(moves)
    print 'Steps: ' + str(len(moves))
def init_or():
    '''Init the OR boolean environment

    @return the environment gridworld object
    '''
    env = GridWorld(5, 5)
    #Define the state matrix
    state_matrix = np.array([[1.0, 0.0, 0.0, 0.0, 1.0],
                             [0.0, 0.0, 0.0, 0.0, 0.0],
                             [0.0, 0.0, 0.0, 0.0, 0.0],
                             [0.0, 0.0, 0.0, 0.0, 0.0],
                             [1.0, 0.0, 0.0, 0.0, 1.0]])
    #Define the index matrix
    index_matrix = np.array([[(4,0), (4,1), (4,2), (4,3), (4,4)],
                             [(3,0), (3,1), (3,2), (3,3), (3,4)],
                             [(2,0), (2,1), (2,2), (2,3), (2,4)],
                             [(1,0), (1,1), (1,2), (1,3), (1,4)],
                             [(0,0), (0,1), (0,2), (0,3), (0,4)]])
    #Define the reward matrix
    reward_matrix = np.array([[1.0, 0.0, 0.0, 0.0, 1.0],
                              [0.0, 0.0, 0.0, 0.0, 0.0],
                              [0.0, 0.0, 0.0, 0.0, 0.0],
                              [0.0, 0.0, 0.0, 0.0, 0.0],
                              [-1.0, 0.0, 0.0, 0.0, 1.0]])
    #Define the transition matrix
    transition_matrix = np.array([[0.8, 0.1, 0.0, 0.1],
                                  [0.1, 0.8, 0.1, 0.0],
                                  [0.0, 0.1, 0.8, 0.1],
                                  [0.1, 0.0, 0.1, 0.8]])
    env.setStateMatrix(state_matrix)
    env.setIndexMatrix(index_matrix)
    env.setRewardMatrix(reward_matrix)
    env.setTransitionMatrix(transition_matrix)
    return env, np.zeros((5,5))
Example #4
def q_learning(q_tables,
               gamma=GAMMA,
               alpha=0.001,
               number_of_episodes=10000,
               max_step_number=1000):
    policy_list = []  # contains the final approximate optimal policy
    env = GridWorld()
    actions = ['up', 'down', 'left', 'right']
    indexes_actions = {-4: 0, 4: 1, -1: 2, 1: 3}
    rewards = 0
    for episode in range(number_of_episodes):
        obs = env.reset()
        number = 0  # the number of steps in one episode which is no more than max_step_number
        while True:  # one episode
            action = epsilon_greedy(obs, q_tables)  # action = A
            action_index = indexes_actions[action]
            next_obs, reward, done, _ = env.step(
                action)  # next_obs = S', reward = R
            rewards += reward
            q_tables[obs][
                action_index] = q_tables[obs][action_index] + alpha * (
                    reward + gamma * max(q_tables[next_obs]) -
                    q_tables[obs][action_index])
            obs = next_obs
            number += 1
            if done == 1 or number == max_step_number:  # reach final state or max number step
                break
    for row in range(len(q_tables)):
        policy_list.append(actions[np.argmax(q_tables[row])])
    performance = rewards / number_of_episodes
    optimal_policy = np.array(policy_list).reshape(4, 4)
    return q_tables, optimal_policy, performance
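
The epsilon_greedy helper used above is not shown. Judging from indexes_actions, it returns a state-index offset in {-4, 4, -1, 1} (up, down, left, right on a flattened 4x4 grid). A possible sketch, with the exploration rate as an assumed module-level constant:

import numpy as np

EPSILON = 0.1  # assumed exploration rate
OFFSETS = [-4, 4, -1, 1]  # up, down, left, right as state-index offsets on a 4x4 grid

def epsilon_greedy(obs, q_tables, epsilon=EPSILON):
    """Return a random offset with probability epsilon, otherwise the greedy one for state obs."""
    if np.random.rand() < epsilon:
        return OFFSETS[np.random.randint(len(OFFSETS))]
    return OFFSETS[int(np.argmax(q_tables[obs]))]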
def main():
    env = GridWorld(3, 4)
    state_matrix = np.zeros((3, 4))
    state_matrix[0, 3] = 1
    state_matrix[1, 3] = 1
    state_matrix[1, 1] = -1
    print state_matrix

    reward_matrix = np.full((3, 4), -0.04)
    reward_matrix[0, 3] = 1
    reward_matrix[1, 3] = -1
    print reward_matrix

    transition_matrix = np.array([[0.8, 0.1, 0.0, 0.1], [0.1, 0.8, 0.1, 0.0],
                                  [0.0, 0.1, 0.8, 0.1], [0.1, 0.0, 0.1, 0.8]])
    state_action_matrix = np.random.random((4, 12))
    print "State Action matrix"
    print state_action_matrix

    env.setStateMatrix(state_matrix)
    env.setRewardMatrix(reward_matrix)
    env.setTransitionMatrix(transition_matrix)

    utility_matrix = np.zeros((3, 4))
    print "utility matrix"
    print utility_matrix

    gamma, alpha, tot_epoch, print_epoch = 0.999, 0.1, 30000, 1000

    for epoch in range(tot_epoch):
        observation = env.reset(exploring_starts=True)
        for step in range(1000):
            col = observation[1] + (4 * observation[0])

            # Sending Action to Environment
            action_array = state_action_matrix[:, col]
            action_distribution = softmax(action_array)
            action = np.random.choice(4, 1, p=action_distribution)

            new_observation, reward, done = env.step(action)

            # Update Critic
            utility_matrix, delta = update_critic(utility_matrix, alpha,
                                                  observation, new_observation,
                                                  reward, gamma)
            # Update Actor
            state_action_matrix = update_actor(state_action_matrix,
                                               observation,
                                               action,
                                               delta,
                                               beta_matrix=None)

            observation = new_observation
            if done: break

    print "final utility matrix"
    print utility_matrix
    print "final state action matrix"
    print state_action_matrix
Example #6
def _run(FLAGS, model_cls):
    logger = logging.getLogger('Trainer_%s' % model_cls.__name__)
    logger.setLevel(logging.INFO)
    file_handler = logging.FileHandler('%s.log' % model_cls.__name__)
    file_handler.setLevel(logging.INFO)
    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.INFO)
    formatter = logging.Formatter('[%(asctime)s] ## %(message)s')
    file_handler.setFormatter(formatter)
    stream_handler.setFormatter(formatter)
    logger.addHandler(file_handler)
    logger.addHandler(stream_handler)

    hparams = tf.contrib.training.HParams(**COMMON_HPARAMS.values())
    hparams.set_hparam('batch_size', FLAGS.bs)
    hparams.set_hparam('n_steps', FLAGS.stp)
    hparams.set_hparam('n_dims', FLAGS.dims)
    hparams.set_hparam('n_info_dims', FLAGS.info_dims)
    hparams.set_hparam('n_att_dims', FLAGS.att_dims)
    hparams.set_hparam('max_epochs', FLAGS.epochs)
    hparams.set_hparam('checkpoint', FLAGS.ckpt)
    hparams.set_hparam('n_heads', FLAGS.heads)
    hparams.set_hparam('n_selfatt_dims', FLAGS.selfatt_dims)

    assert hparams.n_dims == hparams.n_info_dims + hparams.n_att_dims, "`n_dims` should be equal to the sum of `n_info_dims` and `n_att_dims`"
    assert hparams.n_dims == hparams.n_heads * hparams.n_selfatt_dims, "`n_dims` should be equal to the product of `n_heads` and `n_selfatt_dims`"

    name_size = 'SZ%d-STP%d' % (FLAGS.sz, FLAGS.stp)
    config_size = Config(size=FLAGS.sz, max_steps=FLAGS.stp)

    for name_std, config_std in CONFIG_STDS.iteritems():
        for name_drop, config_drop in CONFIG_DROPS.iteritems():
            for name_direction, config_direction in CONFIG_DIRECTIONS.iteritems():
                config = Config()
                config.add('base', 'base', CONFIG_BASE)
                config.add('size', name_size, config_size)
                config.add('direction', name_direction, config_direction)
                config.add('drop', name_drop, config_drop)
                config.add('std', name_std, config_std)
                gridworld = GridWorld(name=config.get_name(),
                                      **config.get_kwargs())

                for seed in GRIDWORLD_SEEDS:
                    data_dir = '%s-SEED%d' % (config.get_name(), seed)
                    gridworld.load(data_dir,
                                   seed=seed,
                                   splitting_seed=SPLITTING_SEED)

                    dataset_name = config.get_name()
                    for shuffling_seed in SHUFFLING_SEEDS:
                        dataset = Dataset(dataset_name,
                                          os.path.join(BASE_DIR, data_dir),
                                          shuffling_seed=shuffling_seed)
                        model = model_cls(dataset,
                                          hparams,
                                          gridworld,
                                          seed=MODEL_SEED)
                        Trainer(model, logger)()
Example #7
 def __init__(self, **args):
     start = args.get('start', Windy.Start)
     goal = args.get('goal', Windy.Goal)
     GridWorld.__init__(self,
                        Windy.Columns,
                        Windy.Rows,
                        start=start,
                        goal=goal)
     self.wind = args.get('wind', Windy.Wind)
Example #8
def example_1():
    #example 1
    height = 6
    width = 2
    start = [5, 0]
    goals = ([5, 0])
    walls = ([2, 1], [2, 2], [2, 3], [3, 1], [3, 2], [3, 3])
    cliffs = ([1, 1], [1, 2], [1, 3])
    env = GridWorld(height, width, False, False, start, goals, walls, cliffs)
    env.render(mode='simple_render')
Example #9
def example_3():
    #example 3
    height = 3
    width = 3
    start = [0, 0]
    goals = ([2, 2])
    walls = None
    cliffs = None
    env = GridWorld(height, width, False, False, start, goals, walls, cliffs)
    env.render(mode='simple_render')
def init_env():
    '''Init the XOR boolean environment

    @return the environment gridworld object
    '''
    env = GridWorld(5, 5)
    #Define the state matrix
    state_matrix = np.array([[1.0, 0.0, 0.0, 0.0, 1.0],
                             [0.0, 0.0, 0.0, 0.0, 0.0],
                             [0.0, 0.0, 0.0, 0.0, 0.0],
                             [0.0, 0.0, 0.0, 0.0, 0.0],
                             [1.0, 0.0, 0.0, 0.0, 1.0]])
    #Define the index matrix
    index_matrix = np.array([[(4,0), (4,1), (4,2), (4,3), (4,4)],
                             [(3,0), (3,1), (3,2), (3,3), (3,4)],
                             [(2,0), (2,1), (2,2), (2,3), (2,4)],
                             [(1,0), (1,1), (1,2), (1,3), (1,4)],
                             [(0,0), (0,1), (0,2), (0,3), (0,4)]])
    #Define the reward matrix
    reward_matrix = np.array([[1.0, 0.0, 0.0, 0.0, -1.0],
                              [0.0, 0.0, 0.0, 0.0, 0.0],
                              [0.0, 0.0, 0.0, 0.0, 0.0],
                              [0.0, 0.0, 0.0, 0.0, 0.0],
                              [-1.0, 0.0, 0.0, 0.0, 1.0]])
    #Define the transition matrix
    transition_matrix = np.array([[0.8, 0.1, 0.0, 0.1],
                                  [0.1, 0.8, 0.1, 0.0],
                                  [0.0, 0.1, 0.8, 0.1],
                                  [0.1, 0.0, 0.1, 0.8]])
    env.setStateMatrix(state_matrix)
    env.setIndexMatrix(index_matrix)
    env.setRewardMatrix(reward_matrix)
    env.setTransitionMatrix(transition_matrix)
    return env
Example #11
    def test_move_dir(self):
        grid = '   \n   \n   '
        gw = GridWorld(grid)

        start = (1,1)
        # N, E, S, W
        tests = [(0, (1,0)), (1, (2,1)), (2, (1,2)), (3, (0,1))]

        for dir, end in tests:
            e, _, _ = gw.move_dir(start, dir)
            self.assertEqual(e, end)
Example #12
def configure_gridworld() -> Tuple[Domain, Task]:
    domain = GridWorld(10,
                       7,
                       agent_x_start=0,
                       agent_y_start=3,
                       wind=True,
                       wind_strengths=[0, 0, 0, 1, 1, 1, 2, 1, 1, 0],
                       stochasticity=stochasticity)
    domain.place_exit(7, 3)
    task = ReachExit(domain)
    return domain, task
Example #13
def create_env():
    """
    Creates the environment for the experiments

    :return: the environment
    """
    # Create the environment as a 3x4 grid
    env = GridWorld(3, 4)
    # Define the state matrix
    state_matrix = np.zeros((3, 4))
    state_matrix[0, 3] = 1
    state_matrix[1, 3] = 1
    state_matrix[1, 1] = -1
    # Define the reward matrix
    # The reward is -0.04 for all states except the terminal ones
    reward_matrix = np.full((3, 4), -0.04)
    reward_matrix[0, 3] = 1
    reward_matrix[1, 3] = -1
    # Define the transition probability matrix for the actions
    transition_matrix = np.array([[0.8, 0.1, 0.0, 0.1],
                                  [0.1, 0.8, 0.1, 0.0],
                                  [0.0, 0.1, 0.8, 0.1],
                                  [0.1, 0.0, 0.1, 0.8]])
    # Configure and return the environment
    env.setStateMatrix(state_matrix)
    env.setRewardMatrix(reward_matrix)
    env.setTransitionMatrix(transition_matrix)
    return env
def main():
	env = GridWorld(3, 4)
	state_matrix = np.zeros((3, 4))
	state_matrix[0, 3] = 1
	state_matrix[1, 3] = 1
	state_matrix[1, 1] = -1

	reward_matrix = np.full((3,4), -0.04)
	reward_matrix[0, 3] = 1
	reward_matrix[1, 3] = -1

	transition_matrix = np.array([[0.8, 0.1, 0.0, 0.1], [0.1, 0.8, 0.1, 0.0], [0.0, 0.1, 0.8, 0.1], [0.1, 0.0, 0.1, 0.8]])
	policy_matrix = np.array([[1, 1, 1, -1], [0, np.NaN, 0, -1], [0, 3, 3, 3]])

	env.setStateMatrix(state_matrix)
	env.setRewardMatrix(reward_matrix)
	env.setTransitionMatrix(transition_matrix)

	state_action_matrix = np.zeros((4,12))
	visit_counter_matrix = np.zeros((4, 12))

	utility_matrix = np.zeros((3, 4))
	gamma, alpha, tot_epoch, print_epoch = 0.999, 0.001, 500000, 1000

	for epoch in range(tot_epoch):
		epsilon = return_decayed_value(0.1, epoch, decay_step=100000)
		observation = env.reset(exploring_starts=True)
		is_starting = True

		for step in range(1000):
			action = return_epsilon_greedy_action(policy_matrix, observation, epsilon=0.1)
			if is_starting:
				action = np.random.randint(0, 4)
				is_starting = False

			new_observation, reward, done = env.step(action)
			new_action = int(policy_matrix[new_observation[0]][new_observation[1]])

			state_action_matrix = update_state_action_matrix(state_action_matrix, reward, gamma, observation, new_observation, action, new_action, visit_counter_matrix)
			policy_matrix = update_policy(policy_matrix, state_action_matrix, observation)
			visit_counter_matrix = update_visit_counter(visit_counter_matrix, observation, action)

			observation = new_observation
			if done: break

		if epoch % print_epoch == 0:
			print "state action and policy matrices after %d iterations: " %(epoch)
			print state_action_matrix
			print "best policy after %d iterations: " %(epoch)
			print_policy(policy_matrix)
			print "##################################"

	print "final state action matrix: ", state_action_matrix
	print "final policy matrix: ", policy_matrix
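
The SARSA example above depends on a few helpers that are not shown. Sketches of the three central ones are below; the decaying step size alpha = 1/(1+visits) is an assumption of this sketch, and update_policy, update_visit_counter, and print_policy are omitted here.

import numpy as np

def return_decayed_value(starting_value, global_step, decay_step):
    """Exponentially decay a value: starting_value * 0.1**(global_step / decay_step)."""
    return starting_value * np.power(0.1, global_step / decay_step)

def return_epsilon_greedy_action(policy_matrix, observation, epsilon=0.1):
    """Follow the policy matrix with probability 1-epsilon, otherwise pick a random action."""
    tot_actions = int(np.nanmax(policy_matrix) + 1)
    action = int(policy_matrix[observation[0], observation[1]])
    if np.random.uniform(0, 1) <= epsilon:
        action = np.random.randint(low=0, high=tot_actions)
    return action

def update_state_action_matrix(state_action_matrix, reward, gamma, observation,
                               new_observation, action, new_action, visit_counter_matrix):
    """SARSA update: Q(s,a) += alpha * (r + gamma*Q(s',a') - Q(s,a)),
    with alpha = 1/(1+visits) as a decaying step size (an assumption of this sketch)."""
    col = observation[1] + (observation[0] * 4)
    col_t1 = new_observation[1] + (new_observation[0] * 4)
    q = state_action_matrix[action, col]
    q_t1 = state_action_matrix[new_action, col_t1]
    alpha = 1.0 / (1.0 + visit_counter_matrix[action, col])
    state_action_matrix[action, col] += alpha * (reward + gamma * q_t1 - q)
    return state_action_matrix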
Example #15
def main():
    env = GridWorld(3, 4)
    state_matrix = np.zeros((3, 4))
    state_matrix[0, 3] = 1
    state_matrix[1, 3] = 1
    state_matrix[1, 1] = -1
    print state_matrix

    reward_matrix = np.full((3, 4), -0.04)
    reward_matrix[0, 3] = 1
    reward_matrix[1, 3] = -1
    print reward_matrix

    transition_matrix = np.array([[0.8, 0.1, 0.0, 0.1], [0.1, 0.8, 0.1, 0.0],
                                  [0.0, 0.1, 0.8, 0.1], [0.1, 0.0, 0.1, 0.8]])
    policy_matrix = np.array([[1, 1, 1, -1], [0, np.NaN, 0, -1], [0, 3, 3, 3]])
    trace_matrix = np.zeros((3, 4))

    env.setStateMatrix(state_matrix)
    env.setRewardMatrix(reward_matrix)
    env.setTransitionMatrix(transition_matrix)

    utility_matrix = np.zeros((3, 4))
    gamma, alpha, tot_epoch, print_epoch, lambda_ = 0.999, 0.1, 30000, 1000, 0.5

    for epoch in range(tot_epoch):
        observation = env.reset(exploring_starts=True)
        for step in range(1000):
            action = policy_matrix[observation[0]][observation[1]]
            new_observation, reward, done = env.step(action)

            delta = reward + gamma * utility_matrix[new_observation[0]][
                new_observation[1]] - utility_matrix[observation[0]][
                    observation[1]]
            trace_matrix[observation[0]][observation[1]] += 1

            utility_matrix = update_utility_matrix(utility_matrix, alpha,
                                                   delta, trace_matrix)
            trace_matrix = update_eligibility_matrix(trace_matrix, gamma,
                                                     lambda_)

            observation = new_observation
            if done: break
        if epoch % print_epoch == 0:
            print "utility matrix after %d iterations: " % (epoch)
            print utility_matrix

    print "final utility matrix: ", utility_matrix
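
The two TD(lambda) helpers called above are small enough to sketch in full; both take the NumPy arrays used in the example. This mirrors the accumulating-traces update, where the trace increment itself happens in the main loop.

def update_utility_matrix(utility_matrix, alpha, delta, trace_matrix):
    """TD(lambda): shift every state's value by alpha*delta weighted by its eligibility trace."""
    return utility_matrix + alpha * delta * trace_matrix

def update_eligibility_matrix(trace_matrix, gamma, lambda_):
    """Decay all the eligibility traces by gamma*lambda after each step."""
    return trace_matrix * gamma * lambda_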
Example #16
def question_3_1_a(printing=True):
    if printing:
        print(
            'a) Develop a state graph representation for this search problem, and'
        )
        print(
            'develop a step() method for finding the next legal steps this problem,'
        )
        print('i.e. for generating successor nodes (vertices).')
        print()

    obstacle_coords = [(2, 2), (2, 3), (2, 4), (2, 5), (2, 6), (3, 6), (4, 6),
                       (5, 6), (6, 6), (7, 6), (7, 3), (7, 4), (7, 5)]
    env_config = {
        'nrow': 9,
        'ncol': 9,
        'obstacle_coords': obstacle_coords,
        'start_coord': (8, 0),
        'goal_coord': (0, 8),
        'cost_map': None
    }
    env = GridWorld(env_config)
    if printing: print([i for i in dir(env) if '__' not in i], '\n')
    for k, v in vars(env).items():
        if k == 'cfg': continue
        if printing: print(k, v, '\n')
    return env
Example #17
 def test_parse(self):
     grid = ' #P\nG #'
     gw = GridWorld(grid)
     self.assertEqual(gw.grid[1][0], '#')
     self.assertEqual(gw.grid[2][0], 'P')
     self.assertEqual(gw.grid[0][1], 'G')
     self.assertEqual(gw.grid[1][1], ' ')
Example #18
class AnnRunner(object):
    """Wraps up the gross reality of running a ``print'' using the printer simulation (controlled by a neural network)"""

    camera_size = 3

    def __init__(self, ideal_grid_path, cell_size, units_per_cell=10):
        """Sets up all the pieces needed to perform a print with the simulated 3d printer (controlled by the neural network). Takes in a path to a ``goal'' or ``ideal'' grid, and constructs the GridWorld based on the dimensions of that goal grid. Understands both a ``camera'', which observes the actual world (around the print head) and an ``ideal camera'' which observes the same location but based on the ``goal grid''
        """

        ideal_grid = Grid(path=ideal_grid_path, scale=cell_size)
        self.ideal_grid = ideal_grid
        self.gridworld = GridWorld(ideal_grid.width, ideal_grid.height, cell_size)
        self.gridworld.set_ideal_grid(ideal_grid)
        self.printer = Printer(10, 10, 9, self.gridworld, units_per_cell) #TODO: shouldn't be giving location values here when it's determined somewhere else. that smells a lot
        self.camera = Camera(self.gridworld.grid, self.printer, self.camera_size)
        self.ideal_camera = Camera(self.gridworld.ideal_grid, self.printer, self.camera_size)

    def run(self, n, iterations=10000):
        """Runs a simulated print run with the printer simulation (controlled by an ANN). Starts the printer in the location provided by the ideal grid spec
        """

        #set the printer location to the starting position as defined by the ideal_grid spec
        self.printer.set_position_on_grid(*self.gridworld.get_starting_position())
        for i in xrange(iterations):
            self.printer.setPenDown()
            actual = self.camera.all_cell_values()
            ideal = self.ideal_camera.all_cell_values()
            pattern = [i - a for i,a in zip(actual, ideal)]
            result = n.propagate(pattern)
            result = [int(round(x)) for x in result]
            result = ''.join(map(str, result))
            self.printer.set_printer_direction(self.get_velocity(result[:2]), self.get_velocity(result[2:]))
            self.printer.simulate()
            self.update()
        return (self.ideal_grid, self.gridworld.grid)

    def update(self):
        return

    def get_velocity(self, instruction):
        """Translates between the output of the neural network and direction instructions for the printer. leftright and updown are translated separately"""
        if instruction == "10":
            return -1
        elif instruction == "01":
            return 1
        else:
            return 0
def main():
    env = GridWorld(3, 4)
    state_matrix = np.zeros((3, 4))
    state_matrix[0, 3] = 1
    state_matrix[1, 3] = 1
    state_matrix[1, 1] = -1
    print state_matrix

    reward_matrix = np.full((3,4), -0.04)
    reward_matrix[0, 3] = 1
    reward_matrix[1, 3] = -1
    print reward_matrix

    transition_matrix = np.array([[0.8, 0.1, 0.0, 0.1], [0.1, 0.8, 0.1, 0.0], [0.0, 0.1, 0.8, 0.1], [0.1, 0.0, 0.1, 0.8]])
    state_action_matrix = np.random.random((4, 12))
    print "State Action matrix"
    print state_action_matrix

    env.setStateMatrix(state_matrix)
    env.setRewardMatrix(reward_matrix)
    env.setTransitionMatrix(transition_matrix)

    utility_matrix = np.zeros((3, 4))
    print "utility matrix"
    print utility_matrix

    gamma, alpha, tot_epoch, print_epoch = 0.999, 0.1, 30000, 1000

    for epoch in range(tot_epoch):
        observation = env.reset(exploring_starts=True)
        for step in range(1000):
            col = observation[1] + (4 * observation[0])

            # Sending Action to Environment
            action_array = state_action_matrix[:, col]
            action_distribution = softmax(action_array)
            action = np.random.choice(4, 1, p=action_distribution)

            new_observation, reward, done = env.step(action)

            # Update Critic
            utility_matrix, delta = update_critic(utility_matrix, alpha, observation, new_observation, reward, gamma)
            # Update Actor
            state_action_matrix = update_actor(state_action_matrix, observation, action, delta, beta_matrix=None)

            observation = new_observation
            if done: break

    print "final utility matrix"
    print utility_matrix
    print "final state action matrix"
    print state_action_matrix
Example #20
class GridWrapper(Game):
    def __init__(self, size, frame_stack_size=1, render=False):
        self.size = size
        self.new_game()

    def possible_actions(self):
        num_actions = 4
        actions = []
        for i in range(num_actions):
            new_action = [0] * num_actions
            new_action[i] = 1
            actions.append(new_action)
        return actions

    def perform_action(self, action):
        action_idx = np.argmax(action)
        self.game.perform_action(action_idx)

    def get_state(self):
        state, reward, terminal = self.game.get_state()
        for i in range(len(state)):
            for j in range(len(state[i])):
                state[i][j] = [state[i][j]]
        return state, reward, terminal

    def get_score(self):
        _, _, terminal = self.game.get_state()
        score = 0
        if terminal:
            score += 1
        score -= self.game.actions_taken * 0
        return score

    def goal_reached(self):
        return self.game.goal_reached

    def actions_taken(self):
        return self.game.actions_taken

    def new_game(self):
        self.game = GridWorld(self.size)
        self.min_moves = self.game.min_remaining_moves()

    def generate_states(self):
        return self.game.generate_states()
Example #21
 def __init__(self, ideal_grid, units_per_cell=10):
     self.gridworld = GridWorld(ideal_grid.width, ideal_grid.height, ideal_grid.gridsize)
     self.gridworld.set_ideal_grid(ideal_grid)
     self.printer = Printer(10, 10, 9, 1, self.gridworld, units_per_cell) #TODO: shouldn't be giving location values here when it's determined somewhere else. that smells a lot
     self.camera = Camera(self.gridworld.grid, self.printer, self.camera_size)
     self.ideal_grid = self.gridworld.ideal_grid
     self.ideal_camera = Camera(self.gridworld.ideal_grid, self.printer, self.camera_size)
     width = self.gridworld.width() * self.gridworld.gridsize()
     height = self.gridworld.height() * self.gridworld.gridsize()
Example #22
def load_gridworld(filename):
    grid = []
    with open(filename, newline='') as f:
        reader = csv.reader(f)
        for row in reader:
            grid_row = []
            for col in row:
                grid_row.append(int(col))
            grid.append(grid_row)
    return GridWorld(grid)
Example #23
    def __init__(self, ideal_grid_path, cell_size, units_per_cell=10):
        """Sets up all the pieces needed to perform a print with the simulated 3d printer (controlled by the neural network). Takes in a path to a ``goal'' or ``ideal'' grid, and constructs the GridWorld based on the dimensions of that goal grid. Understands both a ``camera'', which observes the actual world (around the print head) and an ``ideal camera'' which observes the same location but based on the ``goal grid''
        """

        ideal_grid = Grid(path=ideal_grid_path, scale=cell_size)
        self.ideal_grid = ideal_grid
        self.gridworld = GridWorld(ideal_grid.width, ideal_grid.height, cell_size)
        self.gridworld.set_ideal_grid(ideal_grid)
        self.printer = Printer(10, 10, 9, self.gridworld, units_per_cell) #TODO: shouldn't be giving location values here when it's determined somewhere else. that smells a lot
        self.camera = Camera(self.gridworld.grid, self.printer, self.camera_size)
        self.ideal_camera = Camera(self.gridworld.ideal_grid, self.printer, self.camera_size)
Example #24
    def __init__(self, ideal_grid):
        self.gridworld = GridWorld(ideal_grid.width, ideal_grid.height, ideal_grid.gridsize)
        self.gridworld.set_ideal_grid(ideal_grid)
        self.printer = VirtualPrinter(10, 10, 9, 1, pygame.color.Color("darkorange"), self.gridworld)
        self.camera = VisualCamera(self.gridworld, self.printer, 3)
        self.ideal_camera = Camera(self.gridworld.ideal_grid, self.printer, 3)

        #gui stuff
        pygame.init()
        width = self.gridworld.width() * self.gridworld.gridsize()
        height = self.gridworld.height() * self.gridworld.gridsize()
        self.window = pygame.display.set_mode((width, height))
Example #25
def random_play(n_steps):
    #env from example_3
    height = 3
    width = 3
    start = [0, 0]
    goals = ([2, 2])
    walls = None
    cliffs = None
    env = GridWorld(height, width, False, False, start, goals, walls, cliffs)

    #random actions over n_steps:
    env.reset()
    for step in range(n_steps):
        action = env.action_space_sample()
        new_state, reward, done = env.step(action)
        print("Step:", step, ", Action:", action, ", New state:",
              env.get_obs(), ", Done:", done, ", Reward:", reward)
        env.render(mode='episode')
Example #26
def main(args):
    if args.verbose:
        logging.basicConfig(level=logging.INFO)
    elif args.debug:
        logging.basicConfig(level=logging.DEBUG)
    # initializations
    gridworld = GridWorld(args.size, args.interval, args.obstacles, args.vision, args.phase)
    logging.info("Generated grid world!")
    logging.info("Visuals created")
    mc = MonteCarlo(gridworld, mode=args.method)
    logging.info("Initialized Monte Carlo method")

    mc.run()
Example #27
def main():
    env = GridWorld(3, 4)
    state_matrix = np.zeros((3, 4))
    state_matrix[0, 3] = 1
    state_matrix[1, 3] = 1
    state_matrix[1, 1] = -1
    print state_matrix

    reward_matrix = np.full((3,4), -0.04)
    reward_matrix[0, 3] = 1
    reward_matrix[1, 3] = -1
    print reward_matrix

    transition_matrix = np.array([[0.8, 0.1, 0.0, 0.1], [0.1, 0.8, 0.1, 0.0], [0.0, 0.1, 0.8, 0.1], [0.1, 0.0, 0.1, 0.8]])
    policy_matrix = np.array([[1, 1, 1, -1], [0, np.NaN, 0, -1], [0, 3, 3, 3]])
    trace_matrix = np.zeros((3, 4))


    env.setStateMatrix(state_matrix)
    env.setRewardMatrix(reward_matrix)
    env.setTransitionMatrix(transition_matrix)

    utility_matrix = np.zeros((3, 4))
    gamma, alpha, tot_epoch, print_epoch, lambda_ = 0.999, 0.1, 30000, 1000, 0.5

    for epoch in range(tot_epoch):
        observation = env.reset(exploring_starts=True)
        for step in range(1000):
            action = policy_matrix[observation[0]][observation[1]]
            new_observation, reward, done = env.step(action)

            delta = reward + gamma * utility_matrix[new_observation[0]][new_observation[1]] - utility_matrix[observation[0]][observation[1]]
            trace_matrix[observation[0]][observation[1]] += 1

            utility_matrix = update_utility_matrix(utility_matrix, alpha, delta, trace_matrix)
            trace_matrix = update_eligibility_matrix(trace_matrix, gamma, lambda_)

            observation = new_observation
            if done: break
        if epoch % print_epoch == 0:
            print "utility matrix after %d iterations: " %(epoch)
            print utility_matrix

    print "final utility matrix: ", utility_matrix
Example #28
def generate_data(FLAGS):
    name_size = 'SZ%d-STP%d' % (FLAGS.sz, FLAGS.stp)
    config_size = Config(size=FLAGS.sz, max_steps=FLAGS.stp)

    for name_std, config_std in CONFIG_STDS.iteritems():
        for name_drop, config_drop in CONFIG_DROPS.iteritems():
            for name_direction, config_direction in CONFIG_DIRECTIONS.iteritems():
                config = Config()
                config.add('base', 'base', CONFIG_BASE)
                config.add('size', name_size, config_size)
                config.add('direction', name_direction, config_direction)
                config.add('drop', name_drop, config_drop)
                config.add('std', name_std, config_std)
                gridworld = GridWorld(name=config.get_name(),
                                      **config.get_kwargs())

                for seed in GRIDWORLD_SEEDS:
                    data_dir = '%s-SEED%d' % (config.get_name(), seed)
                    gridworld.generate(data_dir=data_dir,
                                       seed=seed,
                                       splitting_seed=SPLITTING_SEED)
Example #29
 def test_norm_wind(self):
     env = GridWorld()
     state = env.reset()
     for _ in range(4):
         state, _, _ = env.step(0)  # move right
     self.assertTrue(np.array_equal(state, np.array([4, 4])))
     for _ in range(2):
         state, _, _ = env.step(0)  # move right
     self.assertTrue(np.array_equal(state, np.array([6, 6])))
     state, _, _ = env.step(3)  # move down
     self.assertTrue(np.array_equal(state, np.array([6, 6])))
     for _ in range(5):
         state, _, _ = env.step(0)  # move right
     self.assertTrue(np.array_equal(state, np.array([9, 6])))
     for _ in range(4):
         state, _, _ = env.step(3)  # move down
     for _ in range(2):
         state, _, done = env.step(2)  # move left
     self.assertTrue(done)
Example #30
def init_nand(bias=True):
    '''Init the boolean environment

    @return the environment gridworld object
    '''
    env = GridWorld(5, 5)
    #Define the state matrix
    state_matrix = np.array([[1.0, 0.0, 0.0, 0.0, 1.0],
                             [0.0, 0.0, 0.0, 0.0, 0.0],
                             [0.0, 0.0, 0.0, 0.0, 0.0],
                             [0.0, 0.0, 0.0, 0.0, 0.0],
                             [1.0, 0.0, 0.0, 0.0, 1.0]])
    #Define the index matrix
    index_matrix = np.array([[(4,0), (4,1), (4,2), (4,3), (4,4)],
                             [(3,0), (3,1), (3,2), (3,3), (3,4)],
                             [(2,0), (2,1), (2,2), (2,3), (2,4)],
                             [(1,0), (1,1), (1,2), (1,3), (1,4)],
                             [(0,0), (0,1), (0,2), (0,3), (0,4)]])
    #Define the reward matrix
    reward_matrix = np.array([[1.0, 0.0, 0.0, 0.0, -1.0],
                              [0.0, 0.0, 0.0, 0.0, 0.0],
                              [0.0, 0.0, 0.0, 0.0, 0.0],
                              [0.0, 0.0, 0.0, 0.0, 0.0],
                              [1.0, 0.0, 0.0, 0.0, 1.0]])
    #Define the transition matrix
    transition_matrix = np.array([[0.8, 0.1, 0.0, 0.1],
                                  [0.1, 0.8, 0.1, 0.0],
                                  [0.0, 0.1, 0.8, 0.1],
                                  [0.1, 0.0, 0.1, 0.8]])
    env.setStateMatrix(state_matrix)
    env.setIndexMatrix(index_matrix)
    env.setRewardMatrix(reward_matrix)
    env.setTransitionMatrix(transition_matrix)
    if bias:
        return env, np.random.uniform(-1, 1, 3)
    else:
        return env, np.random.uniform(-1, 1, 2)
Example #31
def run_instance(param):
	# runs sim with given parameters for different controllers and different trials and writes to results directory 

	# init environment 
	if param.env_name in 'gridworld':
		env = GridWorld(param)
	elif param.env_name in 'citymap':
		env = CityMap(param)
	else:
		exit('env_name not recognized: ' + param.env_name)


	# run sim 
	for i_trial in range(param.n_trials):

		# init datasets
		if param.make_dataset_on:
			print('   making dataset...')
			train_dataset, test_dataset = datahandler.make_dataset(env)
			datahandler.write_dataset(env, train_dataset, test_dataset)
		print('   loading dataset...')
		datahandler.load_dataset(env)

		# initial condition
		s0 = env.get_s0()

		for controller_name in param.controller_names:
			controller = Controller(param,env,controller_name)
		
			# sim 
			sim_result = sim(param,env,controller,s0)
		
			# write results
			case_count = len(glob.glob('../current_results/*')) + 1
			results_dir = '../current_results/sim_result_{}'.format(case_count)
			datahandler.write_sim_result(sim_result, results_dir)
	return 
Example #32
    def test_move(self):
        grid = ' #P\nG #'
        gw = GridWorld(grid, move_value=-1, die_value=-20, win_value=10)

        step_tests = [
            # move into wall
            ((0,0), (1,0), (0,0), -1, False),
            # move to free field
            ((0,0), (1,1), (1,1), -1, False),
            # move to goal
            ((0,0), (0,1), (0,1), 10, True),
            # die penalty
            ((0,0), (2,0), (2,0), -20, True),
            # out of bounds #1
            ((0,0), (-1,0), (0,0), -1, False),
            # out of bounds #1
            ((0,0), (10,0), (0,0), -1, False),
        ]

        for start, to, end, reward, is_terminal in step_tests:
            e, r, t = gw.move(start, to)
            self.assertEqual(e, end)
            self.assertEqual(r, reward)
            self.assertEqual(t, is_terminal)
Example #33
    def __init__(self, ideal_grid=None, ideal_grid_path=None):
        """ Set pygame stuff up for running the simulation."""

        assert ideal_grid or ideal_grid_path, "must provide at least one ideal grid"

        self.gridworld = GridWorld(ideal_grid.width, ideal_grid.height, ideal_grid.gridsize)
        self.gridworld.set_ideal_grid(ideal_grid)
        self.printer = VirtualPrinter(10, 10, 9, 1, pygame.color.Color("darkorange"), self.gridworld)
        self.camera = VisualCamera(self.gridworld, self.printer, 3)
        self.ideal_camera = Camera(self.gridworld.ideal_grid, self.printer, 3)
        
        #gui stuff
        pygame.init()
        width = self.gridworld.width() * self.gridworld.gridsize()
        height = self.gridworld.height() * self.gridworld.gridsize()
        self.window = pygame.display.set_mode((width, height))
Example #34
    def __init__(self):
        """ Set pygame stuff up for running the simulation."""

        pygame.init()
        grid = GridWorld(20, 20, 30)
        ideal_grid = Grid(20, 20, 30)
        ideal_grid.grid = [[1 if x <= 10 else 0 for x in range(20)] for _ in range(20)]
        grid.set_ideal_grid(ideal_grid)
        width = grid.width() * grid.gridsize()
        height = grid.height() * grid.gridsize()
        self.grid = grid
        self.window = pygame.display.set_mode((width, height))
        self.printer = VirtualPrinter(10, 10, 9, 1, pygame.color.Color("darkorange"), grid)
        self.camera = VisualCamera(self.grid, self.printer, 3)
        self.grid.draw(self.window)
Example #35
class AnnRunner:

    def __init__(self, ideal_grid):
        self.gridworld = GridWorld(ideal_grid.width, ideal_grid.height, ideal_grid.gridsize)
        self.gridworld.set_ideal_grid(ideal_grid)
        self.printer = VirtualPrinter(10, 10, 9, 1, pygame.color.Color("darkorange"), self.gridworld)
        self.camera = VisualCamera(self.gridworld, self.printer, 3)
        self.ideal_camera = Camera(self.gridworld.ideal_grid, self.printer, 3)

        #gui stuff
        pygame.init()
        width = self.gridworld.width() * self.gridworld.gridsize()
        height = self.gridworld.height() * self.gridworld.gridsize()
        self.window = pygame.display.set_mode((width, height))

    def run(self, n):
        self.printer.position = Vector(270, 150)
        while True:
            self.printer.setPenDown()
            actual = self.camera.camera.all_cell_values()
            ideal = self.ideal_camera.all_cell_values()
            pattern = [i - a for i,a in zip(actual, ideal)]
            result = n.propagate(pattern)
            result = [int(round(x)) for x in result]
            result = ''.join(map(str, result))
            self.printer.v = Vector(self.get_velocity(result[:2]), self.get_velocity(result[2:]))
            self.printer.simulate(1)
            self.redraw()
            pygame.display.update()

    def get_velocity(self, instruction):
        if instruction == "10":
            return -100
        elif instruction == "01":
            return 100
        else:
            return 0

    def redraw(self):
        self.gridworld.draw(self.window)
        self.printer.draw(self.window)
        self.camera.draw(self.window)
Example #36
def main():
    grid = ''
    with open("grid.lay","r") as file:
        grid = file.read()

    eps = 0.2
    episodes = 10000

    random.seed(1)
    gw = GridWorld(grid)
    Q = SARSA(gw, episodes=episodes, eps=eps)
    # plotQ(Q, gw, f'SARSA after {episodes} episodes')
    plotPolicy(Q, gw, f'SARSA: greedy-policy after {episodes} episodes')

    random.seed(1)
    Q = QLearning(gw, episodes=episodes, eps=eps)
    # plotQ(Q, gw, f'Q-Learning after {episodes} episodes')
    plotPolicy(Q, gw, f'Q-Learning: greedy-policy after {episodes} episodes')
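
SARSA and QLearning are imported elsewhere in this example. As an illustration of what the tabular SARSA routine might look like, here is a sketch built only on the gw.move_dir(state, direction) call exercised in the test further above; the start state, the 4-direction action set, and the dict-based Q representation are assumptions, and the real module may differ.

import random
from collections import defaultdict

def SARSA(gw, episodes=10000, eps=0.2, alpha=0.1, gamma=0.99, start=(1, 1), max_steps=1000):
    """Tabular SARSA with an epsilon-greedy behaviour policy.
    Q maps (state, direction) -> value, with directions 0..3 (N, E, S, W)."""
    Q = defaultdict(float)

    def choose(state):
        if random.random() < eps:
            return random.randrange(4)
        return max(range(4), key=lambda d: Q[(state, d)])

    for _ in range(episodes):
        state = start          # assumed starting cell; the real GridWorld may provide one
        action = choose(state)
        for _ in range(max_steps):
            next_state, reward, terminal = gw.move_dir(state, action)
            next_action = choose(next_state)
            target = reward if terminal else reward + gamma * Q[(next_state, next_action)]
            Q[(state, action)] += alpha * (target - Q[(state, action)])
            state, action = next_state, next_action
            if terminal:
                break
    return Q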
Example #37
def plot_gridworld(n_rows=2,
                   n_cols=3,
                   figsize=(10, 6),
                   eps=0,
                   save_path='gridworld_demo.svg',
                   seed=42,
                   dtype='bool'):
    """Makes a picture of an expert trajectory

    :param n_rows: number of rows to put the grids in
    :param n_cols: number of columns to put the grids in
    :param figsize: figure size
    :param eps: probability of a random action por the expert
    :param save_path: path to save the result
    :param seed: random seed to set to numpy
    :param dtype: observation dtype. For checking that both dtypes work the same way
    """
    total = n_rows * n_cols
    np.random.seed(seed)
    env = GridWorld(5, 5, 3, obs_dtype=dtype)
    env.reset()
    done = False
    grids = [env.render(mode='get_grid')]
    while not done:
        action = env.get_expert_action(eps=eps)
        _, _, done, _ = env.step(action)
        grids.append(env.render(mode='get_grid'))
    if total < len(grids):
        display_ind = np.linspace(0, len(grids) - 1, total, dtype=int)
        grids = [grids[i] for i in display_ind]
    fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=figsize)
    fig.suptitle('Example of an expert trajectory')
    for r in range(n_rows):
        for c in range(n_cols):
            ind = r * n_cols + c
            ax = axes[r, c]
            ax.set_axis_off()
            if ind < len(grids):
                grid = grids[ind]
                ax.imshow(grid)
    plt.savefig(save_path)
Example #38
def first_visit_monte_carlo_evaluate(gamma=GAMMA, number_of_episodes=100000):
    env = GridWorld()
    policy = Get_Action()
    values = np.zeros(16)
    returns = {state: list() for state in range(16)}
    for episode in range(number_of_episodes):
        observations, _, rewards, _ = generate_one_episode(env, policy)
        observations.pop()  # exclude the sT
        G = 0
        for i, obs in enumerate(observations[::-1]):  # reverse the lists of observations and rewards
            G = gamma * G + rewards[::-1][i]
            if obs not in observations[::-1][i + 1:]:
                returns[obs].append(G)
                values[obs] = np.average(returns[obs])
            values[15] = 0
        if episode % 10000 == 0:
            print(f"In the No.{episode} the values are {values}.")
    return values
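
generate_one_episode and Get_Action are not shown. Below is a sketch under the same env interface used in the Q-learning example earlier (env.reset() -> obs, env.step(a) -> (next_obs, reward, done, info)); treating Get_Action as a uniform-random policy over the four state-index offsets is an assumption.

import numpy as np

class Get_Action:
    """Placeholder policy: pick one of the four state-index offsets uniformly at random."""
    def __call__(self, obs):
        return np.random.choice([-4, 4, -1, 1])

def generate_one_episode(env, policy, max_steps=1000):
    """Roll out one episode; observations include the terminal state, popped by the caller."""
    observations, actions, rewards = [], [], []
    obs = env.reset()
    for _ in range(max_steps):
        observations.append(obs)
        action = policy(obs)
        actions.append(action)
        obs, reward, done, _ = env.step(action)
        rewards.append(reward)
        if done:
            break
    observations.append(obs)  # the terminal state sT
    return observations, actions, rewards, len(rewards)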
Example #39
def main():
    cost_map = []
    cost_map.append([1, 1,  1,  5,  5,  5,  5, 1, None])
    cost_map.append([1, 1,  1,  5,  5,  5,  5, 1, 1])
    cost_map.append([1, 1, 10, 10, 10, 10, 10, 1, 1])
    cost_map.append([1, 1,  1, 10, 10, 10, 10, 1, 1])
    cost_map.append([1, 1,  1,  1,  1, 10, 10, 1, 1])
    cost_map.append([1, 1,  1,  1,  1, 10, 10, 1, 1])
    cost_map.append([1, 1,  1,  1, 10, 10, 10, 1, 1])
    cost_map.append([1, 1,  1, 10, 10, 10, 10, 1, 1])
    cost_map.append([0, 1,  1,  1,  1,  1,  1, 1, 1])
    env_config = {'nrow': 9, 'ncol': 9, 'obstacle_coords': [],
        'start_coord': (8, 0), 'goal_coord': (0, 8), 'cost_map': cost_map}
    env = GridWorld(env_config)
    start = env.start_state
    goal = env.goal_state

    question_3_2_a(start, goal, env)
    question_3_2_b(start, goal, env)
    question_3_2_c(start, goal, env)
    question_3_2_d_and_e(start, goal, env)
    question_3_2_f(start, goal, env)
Example #40
class AnnRunner(object):

    camera_size = 3

    def __init__(self, ideal_grid, units_per_cell=10):
        self.gridworld = GridWorld(ideal_grid.width, ideal_grid.height, ideal_grid.gridsize)
        self.gridworld.set_ideal_grid(ideal_grid)
        self.printer = Printer(10, 10, 9, 1, self.gridworld, units_per_cell) #TODO: shouldn't be giving location values here when it's determined somewhere else. that smells a lot
        self.camera = Camera(self.gridworld.grid, self.printer, self.camera_size)
        self.ideal_grid = self.gridworld.ideal_grid
        self.ideal_camera = Camera(self.gridworld.ideal_grid, self.printer, self.camera_size)
        width = self.gridworld.width() * self.gridworld.gridsize()
        height = self.gridworld.height() * self.gridworld.gridsize()

    def run(self, n, iterations=10000):
        self.printer.set_position_on_grid(self.ideal_grid.starting_point[0], self.ideal_grid.starting_point[1])
        for i in xrange(iterations):
            self.printer.setPenDown()
            actual = self.camera.all_cell_values()
            ideal = self.ideal_camera.all_cell_values()
            pattern = [i - a for i,a in zip(actual, ideal)]
            result = n.propagate(pattern)
            result = [int(round(x)) for x in result]
            result = ''.join(map(str, result))
            self.printer.v = Vector(self.get_velocity(result[:2]), self.get_velocity(result[2:]))
            self.printer.simulate()
            self.update()
        return (self.ideal_grid, self.gridworld.grid)

    def update(self):
        return

    def get_velocity(self, instruction):
        if instruction == "10":
            return -1
        elif instruction == "01":
            return 1
        else:
            return 0
Example #41
 def setUp(self):
     self.gridworld = GridWorld(20, 20, 10) 
     self.printer = VirtualPrinter(0, 0, 10, 1, pygame.color.Color("darkorange"), self.gridworld)
     self.grid = self.gridworld.grid
     self.camera = Camera(self.grid, self.printer, 3)
Example #42
class TestCamera(unittest.TestCase):
    def setUp(self):
        self.gridworld = GridWorld(20, 20, 10) 
        self.printer = VirtualPrinter(0, 0, 10, 1, pygame.color.Color("darkorange"), self.gridworld)
        self.grid = self.gridworld.grid
        self.camera = Camera(self.grid, self.printer, 3)

    def test_camera_has_correct_values_at_init(self):
        self.assertIs(self.camera.grid, self.gridworld.grid)
        self.assertIs(self.camera.printer, self.printer)
        self.assertEqual(self.camera.n, 3)
        self.assertEqual(self.camera.cell_width, self.gridworld.gridsize())

    def test_num_cells_in_view_isnt_wrong(self):
        #one cell
        self.printer.position = Vector(15, 15)
        self.assertEqual(self.camera.num_cells_in_view(Vector(1, 1)), 1)
        #two cell
        self.printer.position = Vector(15, 12)
        self.assertEqual(self.camera.num_cells_in_view(Vector(1, 1)), 2)
        #red cell
        self.printer.position = Vector(12, 12)
        self.assertEqual(self.camera.num_cells_in_view(Vector(1, 1)), 4)
        #blue cell

    def test_cells_in_view_independent_of_camera_size(self):
        local_camera = Camera(self.grid, self.printer, 4)
        self.printer.position = Vector(30, 30)
        self.assertEqual(local_camera.num_cells_in_view(Vector(1, 1)), 1)
        #two cell
        self.printer.position = Vector(30, 32)
        self.assertEqual(local_camera.num_cells_in_view(Vector(1, 1)), 2)
        #red cell
        self.printer.position = Vector(32, 32)
        self.assertEqual(local_camera.num_cells_in_view(Vector(1, 1)), 4)

    def test_cells_have_same_result_for_cells_in_view(self):
        #one cell
        self.printer.position = Vector(15, 15)
        self.assertEqual(self.camera.num_cells_in_view(Vector(3, 3)), 1)
        #two cell
        self.printer.position = Vector(15, 12)
        self.assertEqual(self.camera.num_cells_in_view(Vector(3, 3)), 2)
        #red cell
        self.printer.position = Vector(12, 12)
        self.assertEqual(self.camera.num_cells_in_view(Vector(3, 3)), 4)
        #blue cell
    
    def test_region_aligned(self):
        self.printer.position = Vector(15, 15)
        self.grid.set_loc_val(1, 1, 3)
        self.assertEqual(self.camera.percent_in_view(Vector(1, 1)), 3)

    def test_region_over_two_cells_horizontally_aligned(self):
        self.printer.position = Vector(15, 20)
        self.grid.set_loc_val(1, 1, 1)
        self.grid.set_loc_val(1, 2, 0)
        self.assertEqual(self.camera.percent_in_view(Vector(1, 1)), 0.5)

    def test_region_over_two_cells_vertically_aligned(self):
        self.printer.position = Vector(20, 15)
        self.grid.set_loc_val(1, 1, 1)
        self.grid.set_loc_val(2, 1, 0)
        self.assertEqual(self.camera.percent_in_view(Vector(1, 1)), 0.5)
        self.printer.position = Vector(22, 15)
        self.grid.set_loc_val(1, 1, 1)
        self.grid.set_loc_val(2, 1, 0)
        self.assertEqual(self.camera.percent_in_view(Vector(1, 1)), 0.7)

    def test_region_over_four_cells(self):
        self.printer.position = Vector(20, 20)
        self.grid.set_loc_val(1, 1, 1)
        self.grid.set_loc_val(2, 1, 0)
        self.grid.set_loc_val(1, 2, 0)
        self.grid.set_loc_val(2, 2, 0)
        self.assertEqual(self.camera.percent_in_view(Vector(1, 1)), 0.25)

    def test_region_over_four_cells_with_arbitrary_camera_size(self):
        localcamera = Camera(self.grid, self.printer, 5)
        self.printer.position = Vector(20, 20)
        self.grid.set_loc_val(1, 1, 1)
        self.grid.set_loc_val(2, 1, 0)
        self.grid.set_loc_val(1, 2, 0)
        self.grid.set_loc_val(2, 2, 0)
        self.assertEqual(self.camera.percent_in_view(Vector(1, 1)), 0.25)
def main():

    env = GridWorld(3, 4)

    #Define the state matrix
    state_matrix = np.zeros((3,4))
    state_matrix[0, 3] = 1
    state_matrix[1, 3] = 1
    state_matrix[1, 1] = -1
    print("State Matrix:")
    print(state_matrix)

    #Define the reward matrix
    reward_matrix = np.full((3,4), -0.04)
    reward_matrix[0, 3] = 1
    reward_matrix[1, 3] = -1
    print("Reward Matrix:")
    print(reward_matrix)

    #Define the transition matrix
    transition_matrix = np.array([[0.8, 0.1, 0.0, 0.1],
                                  [0.1, 0.8, 0.1, 0.0],
                                  [0.0, 0.1, 0.8, 0.1],
                                  [0.1, 0.0, 0.1, 0.8]])

    #Define the policy matrix
    #This is the optimal policy for world with reward=-0.04
    policy_matrix = np.array([[1,      1,  1,  -1],
                              [0, np.NaN,  0,  -1],
                              [0,      3,  3,   3]])
    print("Policy Matrix:")
    print(policy_matrix)

    env.setStateMatrix(state_matrix)
    env.setRewardMatrix(reward_matrix)
    env.setTransitionMatrix(transition_matrix)

    utility_matrix = np.zeros((3,4))
    #init with 1.0e-10 to avoid division by zero
    running_mean_matrix = np.full((3,4), 1.0e-10) 
    gamma = 0.999
    tot_epoch = 50000
    print_epoch = 1000

    for epoch in range(tot_epoch):
        #Starting a new episode
        episode_list = list()
        #Reset and return the first observation and reward
        observation = env.reset(exploring_starts=False)
        for _ in range(1000):
            #Take the action from the action matrix
            action = policy_matrix[observation[0], observation[1]]
            #Move one step in the environment and get obs and reward
            observation, reward, done = env.step(action)
            #Append the visit in the episode list
            episode_list.append((observation, reward))
            if done: break
        #The episode is finished, now estimating the utilities
        counter = 0
        #Checkup to identify if it is the first visit to a state
        checkup_matrix = np.zeros((3,4))
        #This loop is the implementation of First-Visit MC.
        #For each state stored in the episode list check if it
        #is the first visit and then estimate the return.
        for visit in episode_list:
            observation = visit[0]
            row = observation[0]
            col = observation[1]
            reward = visit[1]
            if(checkup_matrix[row, col] == 0):
                return_value = get_return(episode_list[counter:], gamma)
                running_mean_matrix[row, col] += 1
                utility_matrix[row, col] += return_value
                checkup_matrix[row, col] = 1
            counter += 1
        if(epoch % print_epoch == 0):
            print("")
            print("Utility matrix after " + str(epoch+1) + " iterations:") 
            print(utility_matrix / running_mean_matrix)
    #Time to check the utility matrix obtained
    print("Utility matrix after " + str(tot_epoch) + " iterations:")
    print(utility_matrix / running_mean_matrix)
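
get_return is the only helper this Monte Carlo prediction example relies on. A minimal sketch, computing the discounted return of an episode tail made of the (observation, reward) pairs appended above:

import numpy as np

def get_return(state_list, gamma):
    """Discounted return of a list of (observation, reward) tuples."""
    return_value = 0
    for counter, visit in enumerate(state_list):
        reward = visit[1]
        return_value += reward * np.power(gamma, counter)
    return return_value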
def main():

    env = GridWorld(3, 4)

    #Define the state matrix
    state_matrix = np.zeros((3,4))
    state_matrix[0, 3] = 1
    state_matrix[1, 3] = 1
    state_matrix[1, 1] = -1
    print("State Matrix:")
    print(state_matrix)

    #Define the reward matrix
    reward_matrix = np.full((3,4), -0.04)
    reward_matrix[0, 3] = 1
    reward_matrix[1, 3] = -1
    print("Reward Matrix:")
    print(reward_matrix)

    #Define the transition matrix
    transition_matrix = np.array([[0.8, 0.1, 0.0, 0.1],
                                  [0.1, 0.8, 0.1, 0.0],
                                  [0.0, 0.1, 0.8, 0.1],
                                  [0.1, 0.0, 0.1, 0.8]])

    #Random policy
    policy_matrix = np.random.randint(low=0, high=4, size=(3, 4)).astype(np.float32)
    policy_matrix[1,1] = np.NaN #NaN for the obstacle at (1,1)
    policy_matrix[0,3] = policy_matrix[1,3] = -1 #No action for the terminal states

    #Set the matrices in the world
    env.setStateMatrix(state_matrix)
    env.setRewardMatrix(reward_matrix)
    env.setTransitionMatrix(transition_matrix)

    state_action_matrix = np.random.random_sample((4,12)) # Q
    #init with 1.0e-10 to avoid division by zero
    running_mean_matrix = np.full((4,12), 1.0e-10) 
    gamma = 0.999
    tot_epoch = 500000
    print_epoch = 3000

    for epoch in range(tot_epoch):
        #Starting a new episode
        episode_list = list()
        #Reset and return the first observation and reward
        observation = env.reset(exploring_starts=True)
        #action = np.random.choice(4, 1)
        #action = policy_matrix[observation[0], observation[1]]
        #episode_list.append((observation, action, reward))
        is_starting = True
        for _ in range(1000):
            #Take the action from the action matrix
            action = policy_matrix[observation[0], observation[1]]
            #If the episode just started then it is
            #necessary to choose a random action (exploring starts)
            if(is_starting): 
                action = np.random.randint(0, 4)
                is_starting = False      
            #Move one step in the environment and get obs and reward
            new_observation, reward, done = env.step(action)
            #Append the visit in the episode list
            episode_list.append((observation, action, reward))
            observation = new_observation
            if done: break
        #The episode is finished, now estimating the utilities
        counter = 0
        #Checkup to identify if it is the first visit to a state
        checkup_matrix = np.zeros((4,12))
        #This loop is the implementation of First-Visit MC.
        #For each state stored in the episode list check if it
        #is the first visit and then estimate the return.
        for visit in episode_list:
            observation = visit[0]
            action = visit[1]
            col = observation[1] + (observation[0]*4)
            row = action
            if(checkup_matrix[row, col] == 0):
                return_value = get_return(episode_list[counter:], gamma)
                running_mean_matrix[row, col] += 1
                state_action_matrix[row, col] += return_value
                checkup_matrix[row, col] = 1
            counter += 1
        #Policy Update
        policy_matrix = update_policy(episode_list, 
                                      policy_matrix, 
                                      state_action_matrix/running_mean_matrix)
        #Printing
        if(epoch % print_epoch == 0):
            print("")
            print("State-Action matrix after " + str(epoch+1) + " iterations:") 
            print(state_action_matrix / running_mean_matrix)
            print("Policy matrix after " + str(epoch+1) + " iterations:") 
            print(policy_matrix)
            print_policy(policy_matrix)
    #Time to check the utility matrix obtained
    print("Utility matrix after " + str(tot_epoch) + " iterations:")
    print(state_action_matrix / running_mean_matrix)
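
#NOTE: update_policy() and print_policy() are called above but are not
#included in this excerpt. The sketches below are assumptions that match the
#call signatures and the action encoding used throughout these examples
#(0=UP, 1=RIGHT, 2=DOWN, 3=LEFT, -1=terminal, NaN=obstacle); the original
#implementations may differ in detail.

def update_policy(episode_list, policy_matrix, state_action_matrix):
    '''Greedy policy improvement: for each state visited in the episode,
    select the action with the highest estimated Q value.'''
    for visit in episode_list:
        observation = visit[0]
        col = observation[1] + (observation[0]*4)
        if(policy_matrix[observation[0], observation[1]] != -1):
            policy_matrix[observation[0], observation[1]] = \
                np.argmax(state_action_matrix[:, col])
    return policy_matrix

def print_policy(policy_matrix):
    '''Print the policy using arrows, with * for terminal states
    and # for obstacles.'''
    policy_string = ""
    for row in range(policy_matrix.shape[0]):
        for col in range(policy_matrix.shape[1]):
            value = policy_matrix[row, col]
            if np.isnan(value): policy_string += " #  "
            elif value == -1: policy_string += " *  "
            elif value == 0: policy_string += " ^  "
            elif value == 1: policy_string += " >  "
            elif value == 2: policy_string += " v  "
            elif value == 3: policy_string += " <  "
        policy_string += "\n"
    print(policy_string)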
def main():
    tot_generations = 100
    tot_episodes = 100
    tot_steps = 14 #a good choice is: (world_rows+world_cols)*2
    population_size = 100
    elite_size = 10
    mutation_rate = 0.10
    gene_set = [0, 1, 2, 3]
    chromosome_size = 12

    #Define the world dimension
    world_rows = 3
    world_columns = 4
    env = GridWorld(world_rows, world_columns)

    mean_fitness_list = list()
    max_fitness_list = list()
    min_fitness_list = list()

    #Define the state matrix
    state_matrix = np.zeros((3,4))
    state_matrix[0, 3] = 1
    state_matrix[1, 3] = 1
    state_matrix[1, 1] = -1
    print("State Matrix:")
    print(state_matrix)

    #Define the reward matrix
    reward_matrix = np.full((3,4), -0.04)
    reward_matrix[0, 3] = 1
    reward_matrix[1, 3] = -1
    print("Reward Matrix:")
    print(reward_matrix)

    #Define the transition matrix
    transition_matrix = np.array([[0.8, 0.1, 0.0, 0.1],
                                  [0.1, 0.8, 0.1, 0.0],
                                  [0.0, 0.1, 0.8, 0.1],
                                  [0.1, 0.0, 0.1, 0.8]])

    env.setStateMatrix(state_matrix)
    env.setRewardMatrix(reward_matrix)
    env.setTransitionMatrix(transition_matrix)

    #Init a random population
    population_matrix = return_random_population(population_size, 
                                                 chromosome_size,
                                                 gene_set = gene_set)
    print("Population matrix shape: " + str(population_matrix.shape))

    #Main iteration loop
    for generation in range(tot_generations):
        #The fitness value for each individual is stored in np.array
        fitness_array = np.zeros((population_size))
        for chromosome_index in range(population_size):
          for episode in range(tot_episodes):
            #Reset and return the first observation
            observation = env.reset(exploring_starts=True)
            for step in range(tot_steps):
              #Estimating the action for that state
              col = observation[1] + (observation[0]*world_columns)
              action = population_matrix[chromosome_index,:][col]
              #Taking the action and observing the new state and reward
              observation, reward, done = env.step(action)
              #Accumulating the fitness for this individual
              fitness_array[chromosome_index] += reward
              if done: break

        #Printing and saving Fitness information
        max_fitness_list.append(np.amax(fitness_array))
        mean_fitness_list.append(np.mean(fitness_array))
        min_fitness_list.append(np.amin(fitness_array))
        print("Generation: " + str(generation+1))
        print("Fitness Mean: " + str(np.mean(fitness_array)))
        print("Fitness STD: " + str(np.std(fitness_array)))
        print("Fitness Max: " + str(np.amax(fitness_array))
              + " at index " + str(np.argmax(fitness_array)))
        print("Fitness Min: " + str(np.amin(fitness_array))
              + " at index " + str(np.argmin(fitness_array)))
        print("Optimal Policy:")
        print(" >  >  >  *  ^  #  ^  *  ^  <  <  <")
        for i in range(int(fitness_array.shape[0]/10)):
            print("Fitness " + str(i) + " ..... " + str(fitness_array[i]))
            print(return_chromosome_string(population_matrix[i,:]))        
        print("")

        #Roulette wheel selection (enabled here; an alternative truncated
        #selection is shown, commented out, below)
        population_matrix, fitness_array = \
            return_roulette_selected_population(population_matrix,                                                  
                                                fitness_array,
                                                population_size)
        population_matrix, fitness_array = \
            return_best_worst_population(population_matrix, fitness_array)
        #Alternative: truncated selection. To use it, uncomment the lines
        #below and comment out the roulette wheel selection above.
        #population_matrix, fitness_array = \
        #    return_truncated_population(population_matrix,
        #                                fitness_array,
        #                                new_size=int(population_size/2))
        population_matrix = return_crossed_population(population_matrix, 
                                                      population_size, 
                                                      elite=elite_size)
        population_matrix = return_mutated_population(population_matrix,
                                                      gene_set=gene_set,
                                                      mutation_rate=mutation_rate, 
                                                      elite=elite_size)

    #If you have matplotlib installed it saves an image of
    #the fitness/generation plot
    try:
        import matplotlib.pyplot as plt
        print("Using matplotlib to show the fitness/generation plot...")
        array = np.arange(1, tot_generations+1, dtype='int32')
        plt.plot(array, mean_fitness_list,  color='red', marker='o', markersize=6, markevery=10, label='Mean')
        plt.plot(array, max_fitness_list, color='blue', marker='^', markersize=6, markevery=10, label='Max')
        #plt.plot(array, min_fitness_list, color='black', marker='v', markersize=6, markevery=10, label='Min')
        plt.legend(loc='upper center', bbox_to_anchor=(0.5, 1.05), ncol=3, fancybox=True, shadow=True)
        #plt.xlim((0,tot_generations))
        #plt.ylim((-100,+100))
        plt.ylabel('Fitness', fontsize=15)
        plt.xlabel('Generation', fontsize=15)
        print("Saving the image in './fitness.jpg'...")
        plt.savefig("./fitness.jpg", dpi=500)
        #plt.show()
    except ImportError:
        print("Please install matplotlib if you want to see the fitness/generation plot.")
        pass #matplotlib is not installed, skip the plot
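
#NOTE: the genetic-algorithm helpers used above (return_random_population,
#return_roulette_selected_population, return_best_worst_population,
#return_crossed_population, return_mutated_population, return_chromosome_string)
#are defined elsewhere in the original script. Two of them are sketched below,
#assuming a chromosome is a row of integer genes (one action per grid cell);
#the original implementations may differ.

def return_random_population(population_size, chromosome_size, gene_set):
    '''Return a (population_size x chromosome_size) matrix of random genes.'''
    return np.random.choice(gene_set, size=(population_size, chromosome_size))

def return_mutated_population(population_matrix, gene_set, mutation_rate, elite=0):
    '''Mutate each gene with probability mutation_rate, leaving the first
    `elite` individuals (assumed to be the best ones) untouched.'''
    mutated_matrix = np.copy(population_matrix)
    for i in range(elite, mutated_matrix.shape[0]):
        for j in range(mutated_matrix.shape[1]):
            if np.random.uniform(0.0, 1.0) < mutation_rate:
                mutated_matrix[i, j] = np.random.choice(gene_set)
    return mutated_matrix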
def main():

    env = GridWorld(3, 4)

    #Define the state matrix
    state_matrix = np.zeros((3,4))
    state_matrix[0, 3] = 1
    state_matrix[1, 3] = 1
    state_matrix[1, 1] = -1
    print("State Matrix:")
    print(state_matrix)

    #Define the reward matrix
    reward_matrix = np.full((3,4), -0.04)
    reward_matrix[0, 3] = 1
    reward_matrix[1, 3] = -1
    print("Reward Matrix:")
    print(reward_matrix)

    #Define the transition matrix
    transition_matrix = np.array([[0.8, 0.1, 0.0, 0.1],
                                  [0.1, 0.8, 0.1, 0.0],
                                  [0.0, 0.1, 0.8, 0.1],
                                  [0.1, 0.0, 0.1, 0.8]])

    #Define the policy matrix
    #This is the optimal policy for world with reward=-0.04
    policy_matrix = np.array([[1,      1,  1,  -1],
                              [0, np.NaN,  0,  -1],
                              [0,      3,  3,   3]])
    print("Policy Matrix:")
    print(policy_matrix)

    env.setStateMatrix(state_matrix)
    env.setRewardMatrix(reward_matrix)
    env.setTransitionMatrix(transition_matrix)

    utility_matrix = np.zeros((3,4))
    gamma = 0.999
    alpha = 0.1 #constant step size
    tot_epoch = 300000
    print_epoch = 1000

    for epoch in range(tot_epoch):
        #Reset and return the first observation
        observation = env.reset(exploring_starts=False)
        for step in range(1000):
            #Take the action from the action matrix
            action = policy_matrix[observation[0], observation[1]]
            #Move one step in the environment and get obs and reward
            new_observation, reward, done = env.step(action)
            utility_matrix = update_utility(utility_matrix, observation, 
                                            new_observation, reward, alpha, gamma)
            observation = new_observation
            #print(utility_matrix)
            if done: break

        if(epoch % print_epoch == 0):
            print("")
            print("Utility matrix after " + str(epoch+1) + " iterations:") 
            print(utility_matrix)
    #Time to check the utility matrix obtained
    print("Utility matrix after " + str(tot_epoch) + " iterations:")
    print(utility_matrix)
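
#NOTE: update_utility() is not shown in this excerpt. The sketch below is an
#assumption that matches the call signature used above; it performs the
#standard TD(0) update U(s) <- U(s) + alpha*[r + gamma*U(s') - U(s)].

def update_utility(utility_matrix, observation, new_observation, reward, alpha, gamma):
    '''Return the utility matrix updated with the TD(0) rule.'''
    u = utility_matrix[observation[0], observation[1]]
    u_t1 = utility_matrix[new_observation[0], new_observation[1]]
    utility_matrix[observation[0], observation[1]] += \
        alpha * (reward + gamma * u_t1 - u)
    return utility_matrix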

#In this example I will use the class gridworld to generate a 3x4 world
#in which the cleaning robot will move. In this example the robot follows
#a policy which is optimal, reaching the terminal state (+1) with high probability.

import numpy as np
from gridworld import GridWorld

env = GridWorld(3, 4)

#Define the state matrix
state_matrix = np.zeros((3,4))
state_matrix[0, 3] = 1
state_matrix[1, 3] = 1
state_matrix[1, 1] = -1

#Define the reward matrix
reward_matrix = np.full((3,4), -0.04)
reward_matrix[0, 3] = 1
reward_matrix[1, 3] = -1

#Define the transition matrix
transition_matrix = np.array([[0.8, 0.1, 0.0, 0.1],
                              [0.1, 0.8, 0.1, 0.0],
                              [0.0, 0.1, 0.8, 0.1],
                              [0.1, 0.0, 0.1, 0.8]])
Example #48
import random  #needed for random.shuffle below; the import is missing from this excerpt

def move(agent):
    #NOTE: this fragment begins mid-file; the function signature is inferred
    #from the call in schedule() below, and the next line is an assumption.
    old_position = agent.position  #assumed attribute
    hood = moore_neighborhood(radius=4, center=old_position)
    random.shuffle(hood)
    for location in hood:
        if agent.world.is_empty(location):
            return location
    return old_position
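
#NOTE: moore_neighborhood() is not included in this fragment. The sketch
#below assumes a location is an (x, y) tuple and ignores the torus
#wrap-around (which TorusGrid presumably handles); the original helper may differ.

def moore_neighborhood(radius, center):
    '''Return the cells within Chebyshev distance `radius` of `center`,
    excluding the center itself.'''
    cx, cy = center
    hood = []
    for dx in range(-radius, radius + 1):
        for dy in range(-radius, radius + 1):
            if (dx, dy) != (0, 0):
                hood.append((cx + dx, cy + dy))
    return hood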

def schedule():
    for agent in myagents:
        move(agent)

def run(maxiter):
    for ct in range(maxiter):
        myobserver.off()
        schedule()
        myobserver.on()

#create a grid and a world based on this grid
mygrid = TorusGrid(shape=params['world_shape'])
myworld = GridWorld(topology=mygrid)
#create agents, located in `myworld`, and then set display suggestions
myagents = myworld.create_agents(AgentType=Agent, number=params['n_agents'])
for agent in myagents:
    agent.display(shape='circle', fillcolor='red', shapesize=(0.25,0.25))
#add an observer
myobserver = GridWorldGUI(myworld)
#run the simulation by repeatedly executing the schedule
run(maxiter=params['maxiter'])

#dw
Example #49
class Generator:
    movement_constant = 3

    aquire_data = True

    def __init__(self, ideal_grid=None, ideal_grid_path=None):
        """ Set pygame stuff up for running the simulation."""

        assert ideal_grid or ideal_grid_path, "must provide at least one ideal grid"

        self.gridworld = GridWorld(ideal_grid.width, ideal_grid.height, ideal_grid.gridsize)
        self.gridworld.set_ideal_grid(ideal_grid)
        self.printer = VirtualPrinter(10, 10, 9, 1, pygame.color.Color("darkorange"), self.gridworld)
        self.camera = VisualCamera(self.gridworld, self.printer, 3)
        self.ideal_camera = Camera(self.gridworld.ideal_grid, self.printer, 3)
        
        #gui stuff
        pygame.init()
        width = self.gridworld.width() * self.gridworld.gridsize()
        height = self.gridworld.height() * self.gridworld.gridsize()
        self.window = pygame.display.set_mode((width, height))

    def generate(self, outputfile):
        inputs = []
        outputs = []
        self.printer.setPenDown()
        self.printer.v = Vector(0, 0)
        self.printer.position = Vector(270, 130)
        while self.aquire_data:
            actual = self.camera.camera.all_cell_values()
            ideal = self.ideal_camera.all_cell_values()
            inputs.append([i - a for a,i in zip(actual, ideal)])
            outputs.append([self.printer.v.x, self.printer.v.y])
            self.act_and_refresh()
        outputs = [[self.encode(x) + self.encode(y)] for x,y in outputs]
        self.aquire_data = True
        with open(outputfile, 'w') as output:
            writer = csv.writer(output)
            writer.writerow(camera_headers + output_headers)
            for inval, outval in zip(inputs, outputs):
                writer.writerow(inval + outval)

    def encode(self, velocity):
        if velocity >= 100:
            return "01"
        elif velocity <= -100:
            return "10"
        else:
            return "00"

    def act_and_refresh(self):
        self.act_on_key_input()
        self.printer.simulate(1)
        self.redraw()
        pygame.display.update()

    def act_on_key_input(self):
        for event in pygame.event.get(pygame.KEYUP):
            if event.key == pygame.K_p:
                self.print_all_camera_values()
        keys = pygame.key.get_pressed()
        if keys[pygame.K_LEFT]:
            self.printer.v = Vector(-100, 0)
        if keys[pygame.K_RIGHT]:
            self.printer.v = Vector(100, 0)
        if keys[pygame.K_UP]:
            self.printer.v = Vector(0, -100)
        if keys[pygame.K_DOWN]:
            self.printer.v = Vector(0, 100)
        if keys[pygame.K_SPACE]:
            self.printer.v = Vector(0, 0)
        if keys[pygame.K_q]:
            self.aquire_data = False

    def redraw(self):
        self.gridworld.draw(self.window)
        self.printer.draw(self.window)
        self.camera.draw(self.window)
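
#Hypothetical usage of the Generator class above (assuming `grid` is an
#already-built ideal grid object exposing the width/height/gridsize
#attributes required by __init__):
#
#    generator = Generator(ideal_grid=grid)
#    generator.generate("training_data.csv")  #drive the printer with the
#                                             #arrow keys; press Q to stop
#                                             #recording and write the CSV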
def main():

    env = GridWorld(3, 4)

    #Define the state matrix
    state_matrix = np.zeros((3,4))
    state_matrix[0, 3] = 1
    state_matrix[1, 3] = 1
    state_matrix[1, 1] = -1
    print("State Matrix:")
    print(state_matrix)

    #Define the reward matrix
    reward_matrix = np.full((3,4), -0.04)
    reward_matrix[0, 3] = 1
    reward_matrix[1, 3] = -1
    print("Reward Matrix:")
    print(reward_matrix)

    #Define the transition matrix
    transition_matrix = np.array([[0.8, 0.1, 0.0, 0.1],
                                  [0.1, 0.8, 0.1, 0.0],
                                  [0.0, 0.1, 0.8, 0.1],
                                  [0.1, 0.0, 0.1, 0.8]])

    #Define the policy matrix
    #This is the optimal policy for world with reward=-0.04
    policy_matrix = np.array([[1,      1,  1,  -1],
                              [0, np.NaN,  0,  -1],
                              [0,      3,  3,   3]])
    print("Policy Matrix:")
    print(policy_matrix)

    #Define and print the eligibility trace matrix
    trace_matrix = np.zeros((3,4))
    print("Trace Matrix:")
    print(trace_matrix)

    env.setStateMatrix(state_matrix)
    env.setRewardMatrix(reward_matrix)
    env.setTransitionMatrix(transition_matrix)

    utility_matrix = np.zeros((3,4))
    gamma = 0.999 #discount rate
    alpha = 0.1 #constant step size
    lambda_ = 0.5 #decaying factor
    tot_epoch = 300000
    print_epoch = 100

    for epoch in range(tot_epoch):
        #Reset and return the first observation
        observation = env.reset(exploring_starts=True)        
        for step in range(1000):
            #Take the action from the action matrix
            action = policy_matrix[observation[0], observation[1]]
            #Move one step in the environment and get obs and reward
            new_observation, reward, done = env.step(action)
            #Estimate the error delta (Target - OldEstimate)
            delta = reward + gamma * utility_matrix[new_observation[0], new_observation[1]] - \
                                     utility_matrix[observation[0], observation[1]]
            #Adding +1 in the trace matrix for the state visited
            trace_matrix[observation[0], observation[1]] += 1
            #Update the utility matrix
            utility_matrix = update_utility(utility_matrix, trace_matrix, alpha, delta)
            #Update the trace matrix (decaying)
            trace_matrix = update_eligibility(trace_matrix, gamma, lambda_)
            observation = new_observation
            if done: break #return

        if(epoch % print_epoch == 0):
            print("")
            print("Utility matrix after " + str(epoch+1) + " iterations:") 
            print(utility_matrix)
            print(trace_matrix)
    #Time to check the utility matrix obtained
    print("Utility matrix after " + str(tot_epoch) + " iterations:")
    print(utility_matrix)
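
#NOTE: update_utility() and update_eligibility() are not shown in this
#excerpt. The sketches below are assumptions that match the call signatures
#above: the standard TD(lambda) update with accumulating traces.

def update_utility(utility_matrix, trace_matrix, alpha, delta):
    '''Update every state proportionally to its eligibility trace:
    U(s) <- U(s) + alpha * delta * e(s)'''
    utility_matrix += alpha * delta * trace_matrix
    return utility_matrix

def update_eligibility(trace_matrix, gamma, lambda_):
    '''Decay all the traces: e(s) <- gamma * lambda * e(s)'''
    trace_matrix = trace_matrix * gamma * lambda_
    return trace_matrix
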
import numpy as np
from gridworld import GridWorld

def get_return(state_list, gamma):
	#NOTE: the imports and the function header above were missing from this
	#excerpt; the name and signature are inferred from the call
	#get_return(episode_list[counter:], gamma) used in the examples above.
	counter, return_value = 0, 0
	for visit in state_list:
		return_value += visit[2] * np.power(gamma, counter)
		counter += 1
	return return_value

def update_policy_matrix(episode_list, policy_matrix, state_action_matrix):
	for visit in episode_list:
		observation = visit[0]
		col = observation[1] + (observation[0] * 4)
		if policy_matrix[observation[0], observation[1]] != -1:
			policy_matrix[observation[0], observation[1]] = np.argmax(state_action_matrix[:, col])
	return policy_matrix

if __name__ == "__main__":
	env = GridWorld(3, 4)
	state_matrix = np.zeros((3,4))
	state_matrix[0, 3] = 1
	state_matrix[1, 3] = 1
	state_matrix[1, 1] = -1

	reward_matrix = np.full((3, 4), -0.04)
	reward_matrix[0, 3] = 1
	reward_matrix[1, 3] = -1

	transition_matrix = np.array([[0.8, 0.1, 0.0, 0.1],
								[0.1, 0.8, 0.1, 0.0],
								[0.0, 0.1, 0.8, 0.1],
								[0.1, 0.0, 0.1, 0.8]])

Example #52
        plt.ylabel("Return")
        plt.ylim(-4, 1)
        plt.plot(returns)
        plt.plot(estimates)
        plt.legend(['Returns', 'Estimate'])

        pp = PdfPages('./plots/qplot.pdf')
        pp.savefig(fig)
        plt.close()
        pp.close()

    return returns, estimates, q


if __name__ == '__main__':
    env = GridWorld()
    mdp = GridWorld_MDP()

    U, pi, Ustart = policy_iteration(mdp, plot=True)
    print(pi)
    for x in range(env.num_states):
        print("{} : {}".format(env.state2loc[x], U[x]))
    print("_________________")
    vret, vest, v = td_learning(env,
                                pi,
                                gamma=1.,
                                alpha=0.1,
                                episodes=2000,
                                plot=True)
    for x in range(env.num_states):
        print("{} : {}".format(env.state2loc[x], v[x]))
Example #53
from gridworld import GridWorld

sim = GridWorld(60, 40, 10)

sim.set_cell(10, 10, (255, 255, 255))

sim.end
Example #54
File: main.py  Project: pprp/52RL
def train(cfg):
    # env = gym.make("FrozenLake-v0", is_slippery=False)  # 0 left, 1 down, 2 right, 3 up
    # env = FrozenLakeWapper(env)

    # env = gym.make("CliffWalking-v0")  # 0 up, 1 right, 2 down, 3 left
    # env = CliffWalkingWapper(env)

    gridmap = [
        'SFFFFH',
        'HHHFFH',
        'FFFFFH',
        'FFFFFH',
        'FHHHHH',
        'FFFFFG']
    env = GridWorld(gridmap)

    agent = QLearning(
        obs_dim=env.observation_space.n,
        action_dim=env.action_space.n,
        learning_rate=cfg.policy_lr,
        gamma=cfg.gamma,
        epsilon_start=cfg.epsilon_start, epsilon_end=cfg.epsilon_end, epsilon_decay=cfg.epsilon_decay)

    render = True  # whether to render the GUI
    rewards = []  # record the reward of every episode
    MA_rewards = []  # record the moving-average reward
    steps = []  # record the steps of every episode

    for i_episode in range(1, cfg.max_episodes+1):
        ep_reward = 0  # cumulative reward of this episode
        ep_steps = 0  # number of steps taken in this episode
        obs = env.reset()  # reset the environment and start a new episode
        while True:
            action = agent.sample(obs)  # select an action with the algorithm
            next_obs, reward, done, _ = env.step(action)  # one interaction with the environment
            # train the Q-learning agent
            agent.learn(obs, action, reward, next_obs, done)  # the next action is not needed

            obs = next_obs  # store the observation for the next step
            ep_reward += reward
            ep_steps += 1  # count the steps
            if render:
                env.render()  # render a new frame
            if done:
                break
        steps.append(ep_steps)
        rewards.append(ep_reward)
        # compute the moving average of the reward
        if i_episode == 1:
            MA_rewards.append(ep_reward)
        else:
            MA_rewards.append(
                0.9*MA_rewards[-1]+0.1*ep_reward)
        print('Episode %s: steps = %s , reward = %.1f, explore = %.2f' % (i_episode, ep_steps,
                                                                          ep_reward, agent.epsilon))
        # render every 20 episodes to check the progress
        if i_episode % 20 == 0:
            render = True
        else:
            render = False
    agent.save()  # training finished, save the model

    output_path = os.path.dirname(__file__)+"/result/"
    # check whether the result folder exists
    if not os.path.exists(output_path):
        os.mkdir(output_path)
    np.save(output_path+"rewards_train.npy", rewards)
    np.save(output_path+"MA_rewards_train.npy", MA_rewards)
    np.save(output_path+"steps_train.npy", steps)
def main():

    env = GridWorld(3, 4)

    #Define the state matrix
    state_matrix = np.zeros((3,4))
    state_matrix[0, 3] = 1
    state_matrix[1, 3] = 1
    state_matrix[1, 1] = -1
    print("State Matrix:")
    print(state_matrix)

    #Define the reward matrix
    reward_matrix = np.full((3,4), -0.04)
    reward_matrix[0, 3] = 1
    reward_matrix[1, 3] = -1
    print("Reward Matrix:")
    print(reward_matrix)

    #Define the transition matrix
    transition_matrix = np.array([[0.8, 0.1, 0.0, 0.1],
                                  [0.1, 0.8, 0.1, 0.0],
                                  [0.0, 0.1, 0.8, 0.1],
                                  [0.1, 0.0, 0.1, 0.8]])

    #Random policy
    policy_matrix = np.random.randint(low=0, high=4, size=(3, 4)).astype(np.float32)
    policy_matrix[1,1] = np.NaN #NaN for the obstacle at (1,1)
    policy_matrix[0,3] = policy_matrix[1,3] = -1 #No action for the terminal states
    print("Policy Matrix:")
    print(policy_matrix)
    print_policy(policy_matrix)

    #Adversarial exploration policy
    #exploratory_policy_matrix = np.array([[2,      3, 2, -1],
                                          #[2, np.NaN, 1, -1],
                                          #[1,      1, 1,  0]])

    exploratory_policy_matrix = np.array([[1,      1, 1, -1],
                                          [0, np.NaN, 0, -1],
                                          [0,      1, 0,  3]])

    print("Exploratory Policy Matrix:")
    print(exploratory_policy_matrix)
    print_policy(exploratory_policy_matrix)

    env.setStateMatrix(state_matrix)
    env.setRewardMatrix(reward_matrix)
    env.setTransitionMatrix(transition_matrix)

    state_action_matrix = np.zeros((4,12))
    visit_counter_matrix = np.zeros((4,12))
    gamma = 0.999
    alpha = 0.001 #constant step size
    tot_epoch = 5000000
    print_epoch = 1000

    for epoch in range(tot_epoch):
        #Reset and return the first observation
        observation = env.reset(exploring_starts=True)
        epsilon = return_decayed_value(0.1, epoch, decay_step=50000)
        is_starting = True
        for step in range(1000):
            #Take the action from the target policy matrix
            #action = policy_matrix[observation[0], observation[1]]
            #Take the action from the exploratory policy using epsilon-greedy.
            #Note that the decayed epsilon computed above is only printed;
            #a fixed epsilon of 0.001 is used for the action selection here.
            action = return_epsilon_greedy_action(exploratory_policy_matrix, observation, epsilon=0.001)
            if(is_starting): 
                action = np.random.randint(0, 4)
                is_starting = False  
            #Move one step in the environment and get obs and reward
            new_observation, reward, done = env.step(action)
            #Updating the state-action matrix
            state_action_matrix = update_state_action(state_action_matrix, visit_counter_matrix, observation, new_observation, 
                                                      action, reward, alpha, gamma)
            #Updating the policy
            policy_matrix = update_policy(policy_matrix, state_action_matrix, observation)
            #Increment the visit counter
            visit_counter_matrix = update_visit_counter(visit_counter_matrix, observation, action)
            observation = new_observation
            if done: break

        if(epoch % print_epoch == 0):
            print("")
            print("Epsilon: " + str(epsilon))
            print("State-Action matrix after " + str(epoch+1) + " iterations:") 
            print(state_action_matrix)
            print("Policy matrix after " + str(epoch+1) + " iterations:") 
            print_policy(policy_matrix)

    #Time to check the utility matrix obtained
    print("State-Action matrix after " + str(tot_epoch) + " iterations:")
    print(state_action_matrix)
    print("Policy matrix after " + str(tot_epoch) + " iterations:")
    print_policy(policy_matrix)
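
#NOTE: the helper functions used in the last example (return_decayed_value,
#return_epsilon_greedy_action, update_state_action, update_visit_counter,
#update_policy) are not included in this excerpt. The sketches below are
#assumptions that match the call signatures above; the exact implementations
#(e.g. the decay schedule) may differ.

def return_decayed_value(starting_value, global_step, decay_step):
    '''Exponentially decay starting_value as global_step grows.'''
    return starting_value * np.power(0.1, global_step / decay_step)

def return_epsilon_greedy_action(policy_matrix, observation, epsilon=0.1):
    '''With probability epsilon pick a random action, otherwise
    follow the given policy matrix.'''
    tot_actions = 4
    if np.random.uniform(0.0, 1.0) < epsilon:
        return np.random.randint(0, tot_actions)
    return int(policy_matrix[observation[0], observation[1]])

def update_state_action(state_action_matrix, visit_counter_matrix, observation,
                        new_observation, action, reward, alpha, gamma):
    '''Q-learning update of the state-action matrix:
    Q(s,a) <- Q(s,a) + alpha*[r + gamma*max_a' Q(s',a') - Q(s,a)].
    The visit counter could be used for a per-pair step size (1/N(s,a))
    instead of the constant alpha.'''
    col = observation[1] + (observation[0]*4)
    col_t1 = new_observation[1] + (new_observation[0]*4)
    q = state_action_matrix[action, col]
    q_t1 = np.max(state_action_matrix[:, col_t1])
    state_action_matrix[action, col] += alpha * (reward + gamma * q_t1 - q)
    return state_action_matrix

def update_visit_counter(visit_counter_matrix, observation, action):
    '''Increment the counter of the visited state-action pair.'''
    col = observation[1] + (observation[0]*4)
    visit_counter_matrix[action, col] += 1.0
    return visit_counter_matrix

def update_policy(policy_matrix, state_action_matrix, observation):
    '''Greedy improvement of the policy in the visited state.'''
    col = observation[1] + (observation[0]*4)
    if policy_matrix[observation[0], observation[1]] != -1:
        policy_matrix[observation[0], observation[1]] = \
            np.argmax(state_action_matrix[:, col])
    return policy_matrix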