Example #1
def create_env():
    """
    Создает среду для экспериментов

    :return: среду
    """
    # Создаем среду в виде сетки 3x4
    env = GridWorld(3, 4)
    # Define the state matrix
    state_matrix = np.zeros((3, 4))
    state_matrix[0, 3] = 1
    state_matrix[1, 3] = 1
    state_matrix[1, 1] = -1
    # Define the reward matrix
    # The reward is -0.04 for every state except the terminal ones
    reward_matrix = np.full((3, 4), -0.04)
    reward_matrix[0, 3] = 1
    reward_matrix[1, 3] = -1
    # Define the transition (action probability) matrix
    transition_matrix = np.array([[0.8, 0.1, 0.0, 0.1],
                                  [0.1, 0.8, 0.1, 0.0],
                                  [0.0, 0.1, 0.8, 0.1],
                                  [0.1, 0.0, 0.1, 0.8]])
    # Configure and return the environment
    env.setStateMatrix(state_matrix)
    env.setRewardMatrix(reward_matrix)
    env.setTransitionMatrix(transition_matrix)
    return env
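A minimal usage sketch for create_env(), assuming the interface the other examples in this listing rely on: reset(exploring_starts=...) returns the start observation and step(action) returns (observation, reward, done). The random policy below is purely illustrative.

import numpy as np

def random_rollout(max_steps=100):
    # Illustrative only: roll out one episode with uniformly random actions
    # using the environment built by create_env() above.
    env = create_env()
    observation = env.reset(exploring_starts=True)
    total_reward = 0.0
    for step in range(max_steps):
        action = np.random.randint(4)  # four possible moves
        observation, reward, done = env.step(action)
        total_reward += reward
        if done:
            break
    return total_reward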
Example #2
 def test_parse(self):
     grid = ' #P\nG #'
     gw = GridWorld(grid)
     self.assertEqual(gw.grid[1][0], '#')
     self.assertEqual(gw.grid[2][0], 'P')
     self.assertEqual(gw.grid[0][1], 'G')
     self.assertEqual(gw.grid[1][1], ' ')
Example #3
def init_or():
    '''Init the OR boolean environment

    @return the environment gridworld object
    '''
    env = GridWorld(5, 5)
    #Define the state matrix
    state_matrix = np.array([[1.0, 0.0, 0.0, 0.0, 1.0],
                             [0.0, 0.0, 0.0, 0.0, 0.0],
                             [0.0, 0.0, 0.0, 0.0, 0.0],
                             [0.0, 0.0, 0.0, 0.0, 0.0],
                             [1.0, 0.0, 0.0, 0.0, 1.0]])
    #Define the index matrix
    index_matrix = np.array([[(4,0), (4,1), (4,2), (4,3), (4,4)],
                             [(3,0), (3,1), (3,2), (3,3), (3,4)],
                             [(2,0), (2,1), (2,2), (2,3), (2,4)],
                             [(1,0), (1,1), (1,2), (1,3), (1,4)],
                             [(0,0), (0,1), (0,2), (0,3), (0,4)]])
    #Define the reward matrix
    reward_matrix = np.array([[1.0, 0.0, 0.0, 0.0, 1.0],
                              [0.0, 0.0, 0.0, 0.0, 0.0],
                              [0.0, 0.0, 0.0, 0.0, 0.0],
                              [0.0, 0.0, 0.0, 0.0, 0.0],
                              [-1.0, 0.0, 0.0, 0.0, 1.0]])
    #Define the transition matrix
    transition_matrix = np.array([[0.8, 0.1, 0.0, 0.1],
                                  [0.1, 0.8, 0.1, 0.0],
                                  [0.0, 0.1, 0.8, 0.1],
                                  [0.1, 0.0, 0.1, 0.8]])
    env.setStateMatrix(state_matrix)
    env.setIndexMatrix(index_matrix)
    env.setRewardMatrix(reward_matrix)
    env.setTransitionMatrix(transition_matrix)
    return env, np.zeros((5,5))
Example #4
def main():
    world = GridWorld()
    q_table = np.zeros([len(world.available_actions()), 7, 10])
    q_table = train(world, q_table)
    moves = evaluate(world, q_table)
    print('Moves: ' + str(moves))
    print('Steps: ' + str(len(moves)))
Example #5
def q_learning(q_tables,
               gamma=GAMMA,
               alpha=0.001,
               number_of_episodes=10000,
               max_step_number=1000):
    policy_list = []  # contains the final approximate optimal policy
    env = GridWorld()
    actions = ['up', 'down', 'left', 'right']
    indexes_actions = {-4: 0, 4: 1, -1: 2, 1: 3}
    rewards = 0
    for episode in range(number_of_episodes):
        obs = env.reset()
        number = 0  # the number of steps in one episode which is no more than max_step_number
        while True:  # one episode
            action = epsilon_greedy(obs, q_tables)  # action = A
            action_index = indexes_actions[action]
            next_obs, reward, done, _ = env.step(
                action)  # next_obs = S', reward = R
            rewards += reward
            q_tables[obs][
                action_index] = q_tables[obs][action_index] + alpha * (
                    reward + gamma * max(q_tables[next_obs]) -
                    q_tables[obs][action_index])
            obs = next_obs
            number += 1
            if done == 1 or number == max_step_number:  # reach final state or max number step
                break
    for row in range(len(q_tables)):
        policy_list.append(actions[np.argmax(q_tables[row])])
    performance = rewards / number_of_episodes
    optimal_policy = np.array(policy_list).reshape(4, 4)
    return q_tables, optimal_policy, performance
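The epsilon_greedy helper used above is not shown in this example. Below is a minimal sketch under the assumption that observations index the 16 flattened grid cells and that actions are the position offsets -4, 4, -1, 1, matching the indexes_actions dictionary above; EPSILON is an assumed constant.

import numpy as np

EPSILON = 0.1  # assumed exploration rate

def epsilon_greedy(obs, q_tables, epsilon=EPSILON):
    # With probability epsilon pick a random move offset, otherwise the
    # offset whose Q-value is highest for the current observation.
    offsets = [-4, 4, -1, 1]  # same order as indexes_actions above
    if np.random.rand() < epsilon:
        return int(np.random.choice(offsets))
    return offsets[int(np.argmax(q_tables[obs]))]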
Example #6
def question_3_1_a(printing=True):
    if printing:
        print('a) Develop a state graph representation for this search problem, and')
        print('develop a step() method for finding the next legal steps in this problem,')
        print('i.e. for generating successor nodes (vertices).')
        print()

    obstacle_coords = [(2, 2), (2, 3), (2, 4), (2, 5), (2, 6), (3, 6), (4, 6),
                       (5, 6), (6, 6), (7, 6), (7, 3), (7, 4), (7, 5)]
    env_config = {
        'nrow': 9,
        'ncol': 9,
        'obstacle_coords': obstacle_coords,
        'start_coord': (8, 0),
        'goal_coord': (0, 8),
        'cost_map': None
    }
    env = GridWorld(env_config)
    if printing: print([i for i in dir(env) if '__' not in i], '\n')
    for k, v in vars(env).items():
        if k == 'cfg': continue
        if printing: print(k, v, '\n')
    return env
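The printed question asks for a step() method that generates the legal successor nodes. A sketch is given below; it assumes only the nrow, ncol and obstacle_coords fields of env_config above, not the real GridWorld API.

def legal_steps(coord, env_config):
    # Return the neighbouring cells reachable in one move (up, down, left,
    # right) that stay on the grid and avoid the obstacle coordinates.
    row, col = coord
    successors = []
    for d_row, d_col in [(-1, 0), (1, 0), (0, -1), (0, 1)]:
        r, c = row + d_row, col + d_col
        if 0 <= r < env_config['nrow'] and 0 <= c < env_config['ncol'] \
                and (r, c) not in env_config['obstacle_coords']:
            successors.append((r, c))
    return successors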
Example #7
def _run(FLAGS, model_cls):
    logger = logging.getLogger('Trainer_%s' % model_cls.__name__)
    logger.setLevel(logging.INFO)
    file_handler = logging.FileHandler('%s.log' % model_cls.__name__)
    file_handler.setLevel(logging.INFO)
    stream_handler = logging.StreamHandler()
    stream_handler.setLevel(logging.INFO)
    formatter = logging.Formatter('[%(asctime)s] ## %(message)s')
    file_handler.setFormatter(formatter)
    stream_handler.setFormatter(formatter)
    logger.addHandler(file_handler)
    logger.addHandler(stream_handler)

    hparams = tf.contrib.training.HParams(**COMMON_HPARAMS.values())
    hparams.set_hparam('batch_size', FLAGS.bs)
    hparams.set_hparam('n_steps', FLAGS.stp)
    hparams.set_hparam('n_dims', FLAGS.dims)
    hparams.set_hparam('n_info_dims', FLAGS.info_dims)
    hparams.set_hparam('n_att_dims', FLAGS.att_dims)
    hparams.set_hparam('max_epochs', FLAGS.epochs)
    hparams.set_hparam('checkpoint', FLAGS.ckpt)
    hparams.set_hparam('n_heads', FLAGS.heads)
    hparams.set_hparam('n_selfatt_dims', FLAGS.selfatt_dims)

    assert hparams.n_dims == hparams.n_info_dims + hparams.n_att_dims, "`n_dims` should be equal to the sum of `n_info_dims` and `n_att_dims`"
    assert hparams.n_dims == hparams.n_heads * hparams.n_selfatt_dims, "`n_dims` should be equal to the product of `n_heads` and `n_selfatt_dims`"

    name_size = 'SZ%d-STP%d' % (FLAGS.sz, FLAGS.stp)
    config_size = Config(size=FLAGS.sz, max_steps=FLAGS.stp)

    for name_std, config_std in CONFIG_STDS.items():
        for name_drop, config_drop in CONFIG_DROPS.items():
            for name_direction, config_direction in CONFIG_DIRECTIONS.items():
                config = Config()
                config.add('base', 'base', CONFIG_BASE)
                config.add('size', name_size, config_size)
                config.add('direction', name_direction, config_direction)
                config.add('drop', name_drop, config_drop)
                config.add('std', name_std, config_std)
                gridworld = GridWorld(name=config.get_name(),
                                      **config.get_kwargs())

                for seed in GRIDWORLD_SEEDS:
                    data_dir = '%s-SEED%d' % (config.get_name(), seed)
                    gridworld.load(data_dir,
                                   seed=seed,
                                   splitting_seed=SPLITTING_SEED)

                    dataset_name = config.get_name()
                    for shuffling_seed in SHUFFLING_SEEDS:
                        dataset = Dataset(dataset_name,
                                          os.path.join(BASE_DIR, data_dir),
                                          shuffling_seed=shuffling_seed)
                        model = model_cls(dataset,
                                          hparams,
                                          gridworld,
                                          seed=MODEL_SEED)
                        Trainer(model, logger)()
Example #8
def main():
    env = GridWorld(3, 4)
    state_matrix = np.zeros((3, 4))
    state_matrix[0, 3] = 1
    state_matrix[1, 3] = 1
    state_matrix[1, 1] = -1
    print(state_matrix)

    reward_matrix = np.full((3, 4), -0.04)
    reward_matrix[0, 3] = 1
    reward_matrix[1, 3] = -1
    print(reward_matrix)

    transition_matrix = np.array([[0.8, 0.1, 0.0, 0.1], [0.1, 0.8, 0.1, 0.0],
                                  [0.0, 0.1, 0.8, 0.1], [0.1, 0.0, 0.1, 0.8]])
    state_action_matrix = np.random.random((4, 12))
    print "State Action matrix"
    print state_action_matrix

    env.setStateMatrix(state_matrix)
    env.setRewardMatrix(reward_matrix)
    env.setTransitionMatrix(transition_matrix)

    utility_matrix = np.zeros((3, 4))
    print "utility matrix"
    print utility_matrix

    gamma, alpha, tot_epoch, print_epoch = 0.999, 0.1, 30000, 1000

    for epoch in range(tot_epoch):
        observation = env.reset(exploring_starts=True)
        for step in range(1000):
            col = observation[1] + (4 * observation[0])

            # Sending Action to Environment
            action_array = state_action_matrix[:, col]
            action_distribution = softmax(action_array)
            action = np.random.choice(4, 1, p=action_distribution)

            new_observation, reward, done = env.step(action)

            # Update Critic
            utility_matrix, delta = update_critic(utility_matrix, alpha,
                                                  observation, new_observation,
                                                  reward, gamma)
            # Update Actor
            state_action_matrix = update_actor(state_action_matrix,
                                               observation,
                                               action,
                                               delta,
                                               beta_matrix=None)

            observation = new_observation
            if done: break

    print "final utility matrix"
    print utility_matrix
    print "final state action matrix"
    print state_action_matrix
Example #9
def load_gridworld(filename):
    grid = []
    with open(filename, newline='') as f:
        reader = csv.reader(f)
        for row in reader:
            grid_row = []
            for col in row:
                grid_row.append(int(col))
            grid.append(grid_row)
    return GridWorld(grid)
Example #10
def example_1():
    #example 1
    height = 6
    width = 2
    start = [5, 0]
    goals = ([5, 0])
    walls = ([2, 1], [2, 2], [2, 3], [3, 1], [3, 2], [3, 3])
    cliffs = ([1, 1], [1, 2], [1, 3])
    env = GridWorld(height, width, False, False, start, goals, walls, cliffs)
    env.render(mode='simple_render')
Example #11
def example_3():
    #example 3
    height = 3
    width = 3
    start = [0, 0]
    goals = ([2, 2])
    walls = None
    cliffs = None
    env = GridWorld(height, width, False, False, start, goals, walls, cliffs)
    env.render(mode='simple_render')
Example #12
    def test_move_dir(self):
        grid = '   \n   \n   '
        gw = GridWorld(grid)

        start = (1,1)
        # N, E, S, W
        tests = [(0, (1,0)), (1, (2,1)), (2, (1,2)), (3, (0,1))]

        for dir, end in tests:
            e, _, _ = gw.move_dir(start, dir)
            self.assertEqual(e, end)
Example #13
def configure_gridworld() -> Tuple[Domain, Task]:
    domain = GridWorld(10,
                       7,
                       agent_x_start=0,
                       agent_y_start=3,
                       wind=True,
                       wind_strengths=[0, 0, 0, 1, 1, 1, 2, 1, 1, 0],
                       stochasticity=stochasticity)
    domain.place_exit(7, 3)
    task = ReachExit(domain)
    return domain, task
Example #14
def main(args):
    if args.verbose:
        logging.basicConfig(level=logging.INFO)
    elif args.debug:
        logging.basicConfig(level=logging.DEBUG)
    # initializations
    gridworld = GridWorld(args.size, args.interval, args.obstacles, args.vision, args.phase)
    logging.info("Generated grid world!")
    logging.info("Visuals created")
    mc = MonteCarlo(gridworld, mode=args.method)
    logging.info("Initialized Monte Carlo method")

    mc.run()
Example #15
def main(args):
    os.makedirs(args.output_dir, exist_ok=True)

    for k in tqdm.trange(args.count):
        g = GridWorld(args.max_size, args.max_size)
        size = 2 * random.randint(args.min_size // 2, args.max_size // 2) + 1
        if size < args.max_size:
            pad = (args.max_size - size) // 2
            g._fill_rect(1, 1, pad, args.max_size)
            g._fill_rect(args.max_size - pad + 1, 1, args.max_size,
                         args.max_size)
            g._fill_rect(1, 1, args.max_size, pad)
            g._fill_rect(1, args.max_size - pad + 1, args.max_size,
                         args.max_size)
        else:
            pad = 0

        wall_count = random.randint(1, args.wall_count)
        for _ in range(wall_count):
            is_vert = random.random() > 0.5
            wall_coord = random.randint(2, size - 1)
            wall_len = random.randint(2, size - 2)
            wall_start = random.randint(1, size - wall_len)
            if is_vert:
                g.add_vertical_wall(pad + wall_coord, pad + wall_start,
                                    pad + wall_start + wall_len - 1)
            else:
                g.add_horizontal_wall(pad + wall_coord, pad + wall_start,
                                      pad + wall_start + wall_len - 1)

        connected_component = list(check_connectivity(g))
        random.shuffle(connected_component)

        start_count = random.randint(1, args.start_count)
        for _ in range(start_count):
            g.add_start(*connected_component.pop())

        goal_count = random.randint(1, args.goal_count)
        for _ in range(goal_count):
            g.add_goal(*connected_component.pop())

        trap_count = random.randint(0, args.trap_count)
        for _ in range(trap_count):
            g.add_trap(*connected_component.pop())

        g.save(
            os.path.join(
                args.output_dir,
                "grid{0:03d}_{1}x{1}_w{2}_s{3}_g{4}_t{5}.pkl".format(
                    k, size, wall_count, start_count, goal_count, trap_count)))
Example #16
def main():
    env = GridWorld(3, 4)
    state_matrix = np.zeros((3, 4))
    state_matrix[0, 3] = 1
    state_matrix[1, 3] = 1
    state_matrix[1, 1] = -1
    print(state_matrix)

    reward_matrix = np.full((3, 4), -0.04)
    reward_matrix[0, 3] = 1
    reward_matrix[1, 3] = -1
    print(reward_matrix)

    transition_matrix = np.array([[0.8, 0.1, 0.0, 0.1], [0.1, 0.8, 0.1, 0.0],
                                  [0.0, 0.1, 0.8, 0.1], [0.1, 0.0, 0.1, 0.8]])
    policy_matrix = np.array([[1, 1, 1, -1], [0, np.NaN, 0, -1], [0, 3, 3, 3]])
    trace_matrix = np.zeros((3, 4))

    env.setStateMatrix(state_matrix)
    env.setRewardMatrix(reward_matrix)
    env.setTransitionMatrix(transition_matrix)

    utility_matrix = np.zeros((3, 4))
    gamma, alpha, tot_epoch, print_epoch, lambda_ = 0.999, 0.1, 30000, 1000, 0.5

    for epoch in range(tot_epoch):
        observation = env.reset(exploring_starts=True)
        for step in range(1000):
            action = policy_matrix[observation[0]][observation[1]]
            new_observation, reward, done = env.step(action)

            delta = reward + gamma * utility_matrix[new_observation[0]][
                new_observation[1]] - utility_matrix[observation[0]][
                    observation[1]]
            trace_matrix[observation[0]][observation[1]] += 1

            utility_matrix = update_utility_matrix(utility_matrix, alpha,
                                                   delta, trace_matrix)
            trace_matrix = update_eligibility_matrix(trace_matrix, gamma,
                                                     lambda_)

            observation = new_observation
            if done: break
        if epoch % print_epoch == 0:
            print "utility matrix after %d iterations: " % (epoch)
            print utility_matrix

    print "final utility matrix: ", utility_matrix
Example #17
def main():
    grid = ''
    with open("grid.lay","r") as file:
        grid = file.read()

    eps = 0.2
    episodes = 10000

    random.seed(1)
    gw = GridWorld(grid)
    Q = SARSA(gw, episodes=episodes, eps=eps)
    # plotQ(Q, gw, f'SARSA after {episodes} episodes')
    plotPolicy(Q, gw, f'SARSA: greedy-policy after {episodes} episodes')

    random.seed(1)
    Q = QLearning(gw, episodes=episodes, eps=eps)
    # plotQ(Q, gw, f'Q-Learning after {episodes} episodes')
    plotPolicy(Q, gw, f'Q-Learning: greedy-policy after {episodes} episodes')
Example #18
def random_play(n_steps):
    #env from example_3
    height = 3
    width = 3
    start = [0, 0]
    goals = ([2, 2])
    walls = None
    cliffs = None
    env = GridWorld(height, width, False, False, start, goals, walls, cliffs)

    #random actions over n_steps:
    env.reset()
    for step in range(n_steps):
        action = env.action_space_sample()
        new_state, reward, done = env.step(action)
        print("Step:", step, ", Action:", action, ", New state:",
              env.get_obs(), ", Done:", done, ", Reward:", reward)
        env.render(mode='episode')
Example #19
 def test_norm_wind(self):
     env = GridWorld()
     state = env.reset()
     for _ in range(4):
         state, _, _ = env.step(0)  # move right
     self.assertTrue(np.array_equal(state, np.array([4, 4])))
     for _ in range(2):
         state, _, _ = env.step(0)  # move right
     self.assertTrue(np.array_equal(state, np.array([6, 6])))
     state, _, _ = env.step(3)  # move down
     self.assertTrue(np.array_equal(state, np.array([6, 6])))
     for _ in range(5):
         state, _, _ = env.step(0)  # move right
     self.assertTrue(np.array_equal(state, np.array([9, 6])))
     for _ in range(4):
         state, _, _ = env.step(3)  # move down
     for _ in range(2):
         state, _, done = env.step(2)  # move left
     self.assertTrue(done)
Example #20
def first_visit_monte_carlo_evaluate(gamma=GAMMA, number_of_episodes=100000):
    env = GridWorld()
    policy = Get_Action()
    values = np.zeros(16)
    returns = {state: list() for state in range(16)}
    for episode in range(number_of_episodes):
        observations, _, rewards, _ = generate_one_episode(env, policy)
        observations.pop()  # exclude the sT
        G = 0
        # iterate over the observations (and rewards) in reverse order
        for i, obs in enumerate(observations[::-1]):
            G = gamma * G + rewards[::-1][i]
            if obs not in observations[::-1][i + 1:]:
                returns[obs].append(G)
                values[obs] = np.average(returns[obs])
            values[15] = 0
        if episode % 10000 == 0:
            print(f"In the No.{episode} the values are {values}.")
    return values
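generate_one_episode and Get_Action are referenced above but not defined in this excerpt. The sketch below assumes env.step(action) returns (next_obs, reward, done, info) as in Example #5 and that the policy object is callable; adjust it to the real interfaces.

def generate_one_episode(env, policy, max_steps=1000):
    # Roll out one episode and collect observations, actions and rewards.
    # The terminal observation is appended last, mirroring the pop() above.
    observations, actions, rewards = [], [], []
    obs = env.reset()
    for _ in range(max_steps):
        observations.append(obs)
        action = policy(obs)
        next_obs, reward, done, _ = env.step(action)
        actions.append(action)
        rewards.append(reward)
        obs = next_obs
        if done:
            break
    observations.append(obs)
    return observations, actions, rewards, len(rewards)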
Example #21
def plot_gridworld(n_rows=2,
                   n_cols=3,
                   figsize=(10, 6),
                   eps=0,
                   save_path='gridworld_demo.svg',
                   seed=42,
                   dtype='bool'):
    """Makes a picture of an expert trajectory

    :param n_rows: number of rows to put the grids in
    :param n_cols: number of columns to put the grids in
    :param figsize: figure size
    :param eps: probability of a random action for the expert
    :param save_path: path to save the result
    :param seed: random seed to set to numpy
    :param dtype: observation dtype. For checking that both dtypes work the same way
    """
    total = n_rows * n_cols
    np.random.seed(seed)
    env = GridWorld(5, 5, 3, obs_dtype=dtype)
    env.reset()
    done = False
    grids = [env.render(mode='get_grid')]
    while not done:
        action = env.get_expert_action(eps=eps)
        _, _, done, _ = env.step(action)
        grids.append(env.render(mode='get_grid'))
    if total < len(grids):
        display_ind = np.linspace(0, len(grids) - 1, total, dtype=int)
        grids = [grids[i] for i in display_ind]
    fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=figsize)
    fig.suptitle('Example of an expert trajectory')
    for r in range(n_rows):
        for c in range(n_cols):
            ind = r * n_cols + c
            ax = axes[r, c]
            ax.set_axis_off()
            if ind < len(grids):
                grid = grids[ind]
                ax.imshow(grid)
    plt.savefig(save_path)
Example #22
def generate_data(FLAGS):
    name_size = 'SZ%d-STP%d' % (FLAGS.sz, FLAGS.stp)
    config_size = Config(size=FLAGS.sz, max_steps=FLAGS.stp)

    for name_std, config_std in CONFIG_STDS.items():
        for name_drop, config_drop in CONFIG_DROPS.items():
            for name_direction, config_direction in CONFIG_DIRECTIONS.items():
                config = Config()
                config.add('base', 'base', CONFIG_BASE)
                config.add('size', name_size, config_size)
                config.add('direction', name_direction, config_direction)
                config.add('drop', name_drop, config_drop)
                config.add('std', name_std, config_std)
                gridworld = GridWorld(name=config.get_name(),
                                      **config.get_kwargs())

                for seed in GRIDWORLD_SEEDS:
                    data_dir = '%s-SEED%d' % (config.get_name(), seed)
                    gridworld.generate(data_dir=data_dir,
                                       seed=seed,
                                       splitting_seed=SPLITTING_SEED)
Example #23
def main():
    cost_map = []
    cost_map.append([1, 1,  1,  5,  5,  5,  5, 1, None])
    cost_map.append([1, 1,  1,  5,  5,  5,  5, 1, 1])
    cost_map.append([1, 1, 10, 10, 10, 10, 10, 1, 1])
    cost_map.append([1, 1,  1, 10, 10, 10, 10, 1, 1])
    cost_map.append([1, 1,  1,  1,  1, 10, 10, 1, 1])
    cost_map.append([1, 1,  1,  1,  1, 10, 10, 1, 1])
    cost_map.append([1, 1,  1,  1, 10, 10, 10, 1, 1])
    cost_map.append([1, 1,  1, 10, 10, 10, 10, 1, 1])
    cost_map.append([0, 1,  1,  1,  1,  1,  1, 1, 1])
    env_config = {'nrow': 9, 'ncol': 9, 'obstacle_coords': [],
        'start_coord': (8, 0), 'goal_coord': (0, 8), 'cost_map': cost_map}
    env = GridWorld(env_config)
    start = env.start_state
    goal = env.goal_state

    question_3_2_a(start, goal, env)
    question_3_2_b(start, goal, env)
    question_3_2_c(start, goal, env)
    question_3_2_d_and_e(start, goal, env)
    question_3_2_f(start, goal, env)
Example #24
def run_instance(param):
	# runs sim with given parameters for different controllers and different trials and writes to results directory 

	# init environment 
	if param.env_name in 'gridworld':
		env = GridWorld(param)
	elif param.env_name in 'citymap':
		env = CityMap(param)
	else:
		exit('env_name not recognized: {}'.format(param.env_name))


	# run sim 
	for i_trial in range(param.n_trials):

		# init datasets
		if param.make_dataset_on:
			print('   making dataset...')
			train_dataset, test_dataset = datahandler.make_dataset(env)
			datahandler.write_dataset(env, train_dataset, test_dataset)
		print('   loading dataset...')
		datahandler.load_dataset(env)

		# initial condition
		s0 = env.get_s0()

		for controller_name in param.controller_names:
			controller = Controller(param,env,controller_name)
		
			# sim 
			sim_result = sim(param,env,controller,s0)
		
			# write results
			case_count = len(glob.glob('../current_results/*')) + 1
			results_dir = '../current_results/sim_result_{}'.format(case_count)
			datahandler.write_sim_result(sim_result, results_dir)
	return 
Example #25
def init_nand(bias=True):
    '''Init the NAND boolean environment

    @return the environment gridworld object
    '''
    env = GridWorld(5, 5)
    #Define the state matrix
    state_matrix = np.array([[1.0, 0.0, 0.0, 0.0, 1.0],
                             [0.0, 0.0, 0.0, 0.0, 0.0],
                             [0.0, 0.0, 0.0, 0.0, 0.0],
                             [0.0, 0.0, 0.0, 0.0, 0.0],
                             [1.0, 0.0, 0.0, 0.0, 1.0]])
    #Define the index matrix
    index_matrix = np.array([[(4,0), (4,1), (4,2), (4,3), (4,4)],
                             [(3,0), (3,1), (3,2), (3,3), (3,4)],
                             [(2,0), (2,1), (2,2), (2,3), (2,4)],
                             [(1,0), (1,1), (1,2), (1,3), (1,4)],
                             [(0,0), (0,1), (0,2), (0,3), (0,4)]])
    #Define the reward matrix
    reward_matrix = np.array([[1.0, 0.0, 0.0, 0.0, -1.0],
                              [0.0, 0.0, 0.0, 0.0, 0.0],
                              [0.0, 0.0, 0.0, 0.0, 0.0],
                              [0.0, 0.0, 0.0, 0.0, 0.0],
                              [1.0, 0.0, 0.0, 0.0, 1.0]])
    #Define the transition matrix
    transition_matrix = np.array([[0.8, 0.1, 0.0, 0.1],
                                  [0.1, 0.8, 0.1, 0.0],
                                  [0.0, 0.1, 0.8, 0.1],
                                  [0.1, 0.0, 0.1, 0.8]])
    env.setStateMatrix(state_matrix)
    env.setIndexMatrix(index_matrix)
    env.setRewardMatrix(reward_matrix)
    env.setTransitionMatrix(transition_matrix)
    if bias:
        return env, np.random.uniform(-1, 1, 3)
    else:
        return env, np.random.uniform(-1, 1, 2)
Example #26
    def test_move(self):
        grid = ' #P\nG #'
        gw = GridWorld(grid, move_value=-1, die_value=-20, win_value=10)

        step_tests = [
            # move into wall
            ((0,0), (1,0), (0,0), -1, False),
            # move to free field
            ((0,0), (1,1), (1,1), -1, False),
            # move to goal
            ((0,0), (0,1), (0,1), 10, True),
            # die penalty
            ((0,0), (2,0), (2,0), -20, True),
            # out of bounds #1
            ((0,0), (-1,0), (0,0), -1, False),
            # out of bounds #2
            ((0,0), (10,0), (0,0), -1, False),
        ]

        for start, to, end, reward, is_terminal in step_tests:
            e, r, t = gw.move(start, to)
            self.assertEqual(e, end)
            self.assertEqual(r, reward)
            self.assertEqual(t, is_terminal)
Example #27
File: main.py  Project: pprp/52RL
def train(cfg):
    # env = gym.make("FrozenLake-v0", is_slippery=False)  # 0 left, 1 down, 2 right, 3 up
    # env = FrozenLakeWapper(env)

    # env = gym.make("CliffWalking-v0")  # 0 up, 1 right, 2 down, 3 left
    # env = CliffWalkingWapper(env)

    gridmap = [
        'SFFFFH',
        'HHHFFH',
        'FFFFFH',
        'FFFFFH',
        'FHHHHH',
        'FFFFFG']
    env = GridWorld(gridmap)

    agent = QLearning(
        obs_dim=env.observation_space.n,
        action_dim=env.action_space.n,
        learning_rate=cfg.policy_lr,
        gamma=cfg.gamma,
        epsilon_start=cfg.epsilon_start, epsilon_end=cfg.epsilon_end, epsilon_decay=cfg.epsilon_decay)

    render = True  # whether to show the GUI
    rewards = []  # record the reward of every episode
    MA_rewards = []  # record the moving-average reward
    steps = []  # record the number of steps of every episode

    for i_episode in range(1, cfg.max_episodes+1):
        ep_reward = 0  # accumulated reward of this episode
        ep_steps = 0  # number of steps taken in this episode
        obs = env.reset()  # reset the environment and start a new episode
        while True:
            action = agent.sample(obs)  # choose an action according to the algorithm
            next_obs, reward, done, _ = env.step(action)  # interact with the environment
            # train the Q-learning agent
            agent.learn(obs, action, reward, next_obs, done)  # the next action is not needed

            obs = next_obs  # keep the observation for the next step
            ep_reward += reward
            ep_steps += 1  # count the steps
            if render:
                env.render()  # render a new frame
            if done:
                break
        steps.append(ep_steps)
        rewards.append(ep_reward)
        # compute the moving-average reward
        if i_episode == 1:
            MA_rewards.append(ep_reward)
        else:
            MA_rewards.append(
                0.9*MA_rewards[-1]+0.1*ep_reward)
        print('Episode %s: steps = %s , reward = %.1f, explore = %.2f' % (i_episode, ep_steps,
                                                                          ep_reward, agent.epsilon))
        # render every 20 episodes to check progress
        if i_episode % 20 == 0:
            render = True
        else:
            render = False
    agent.save()  # training finished, save the model

    output_path = os.path.dirname(__file__)+"/result/"
    # create the output folder if it does not exist
    if not os.path.exists(output_path):
        os.mkdir(output_path)
    np.save(output_path+"rewards_train.npy", rewards)
    np.save(output_path+"MA_rewards_train.npy", MA_rewards)
    np.save(output_path+"steps_train.npy", steps)
Example #28
from gridworld import GridWorld

sim = GridWorld(60, 40, 10)

sim.set_cell(10, 10, (255, 255, 255))

sim.end
Example #29
File: my_hw5.py  Project: rpg711/cs383
        plt.ylabel("Return")
        plt.ylim(-4, 1)
        plt.plot(returns)
        plt.plot(estimates)
        plt.legend(['Returns', 'Estimate'])

        pp = PdfPages('./plots/qplot.pdf')
        pp.savefig(fig)
        plt.close()
        pp.close()

    return returns, estimates, q


if __name__ == '__main__':
    env = GridWorld()
    mdp = GridWorld_MDP()

    U, pi, Ustart = policy_iteration(mdp, plot=True)
    print(pi)
    for x in range(env.num_states):
        print("{} : {}".format(env.state2loc[x], U[x]))
    print("_________________")
    vret, vest, v = td_learning(env,
                                pi,
                                gamma=1.,
                                alpha=0.1,
                                episodes=2000,
                                plot=True)
    for x in range(env.num_states):
        print("{} : {}".format(env.state2loc[x], v[x]))
Example #30
    def go(self):

        # Subscribers to know the position
        pos_sub = rospy.Subscriber("/base_pose_ground_truth", Odometry,
                                   self.pos_callback)
        anemo_sub = rospy.Subscriber("/Anemometer/WindSensor_reading",
                                     anemometer, self.wind_callback)
        rospy.wait_for_message("/Anemometer/WindSensor_reading", anemometer,
                               rospy.Duration(5.0))

        # Probability mapping publisher
        prob_pub = rospy.Publisher("mapping_viz", OccupancyGrid, queue_size=10)
        prob_val = rospy.Publisher("max_prob_val", Float64, queue_size=10)

        # Choose between service and topic
        if self.use_service_for_gas:
            rospy.wait_for_service("odor_value")
            odor_req = rospy.ServiceProxy('odor_value', GasPosition)
        else:
            sensor_sub = rospy.Subscriber("/PID/Sensor_reading", gas_sensor,
                                          self.sensor_callback)
            rospy.wait_for_message("/PID/Sensor_reading", gas_sensor,
                                   rospy.Duration(4.0))

        # Get grid parameters
        grid = GridWorld()
        if self.adjust_tmax:
            self.got_initial_position = False
            self.x_prev = grid.xlims[1]

        # Algorithm specific parameters
        # Initializing matrices
        alpha = (1.0 / grid.M) * np.ones(grid.M)
        Sij = np.zeros(grid.M)
        beta = np.zeros(grid.M)
        gamma = (1.0 / grid.M) * np.ones(grid.M)
        decimal_shifter = 1000

        # OccupancyGrid
        prob = OccupancyGrid()
        prob.info.height = grid.m
        prob.info.width = grid.n
        prob.info.resolution = grid.res
        prob.info.origin.position.x = 0
        prob.info.origin.position.y = 0
        prob.info.origin.position.z = 0
        prob.info.map_load_time = rospy.Time.now()
        prob.header.frame_id = self.fixed_frame
        prob.data = (alpha * 100).astype(np.int8).tolist()

        self.start_time = rospy.Time.now()
        self.K = -1

        r = rospy.Rate(2)  # Might have to be changed later

        while not rospy.is_shutdown():
            if self.L is None:
                continue

            # Creating local saves at each timestep for continuously changing values
            x_pos, y_pos = self.x, self.y
            K = self.K

            # Read chemical concentration
            if self.use_service_for_gas:
                try:
                    odor_res = odor_req(x_pos, y_pos, grid.height)
                    gas_conc = odor_res.gas_conc[0]
                except rospy.ServiceException as e:
                    rospy.logerr("[mapping.py] Odor service call failed %s" %
                                 e)

            else:
                gas_conc = self.gas_conc

            # Check if detection occurs
            detection = gas_conc > self._conc_epsilon

            if not self.adjust_wind_interval(x_pos, y_pos, detect=detection):
                continue

            rospy.loginfo("x,y = [%.2f,%.2f], Gas concentration: %.2f", x_pos,
                          y_pos, gas_conc)

            # Wind values from t_L to t_K without accounting for the 'time' column of wind_history
            wind_data = np.delete(self.wind_history[self.L:K + 1], 2,
                                  1).astype(float)
            Vx, Vy = np.sum(wind_data, 0)

            beta[:] = 0
            gamma[:] = 1

            for t0 in range(self.L, K):

                tl = self.wind_history[t0][2].to_sec()
                tk = self.wind_history[K][2].to_sec()
                deviation_x = math.sqrt(tk - tl) * grid.sx
                deviation_y = math.sqrt(tk - tl) * grid.sy

                for i in range(0, grid.M):

                    deltax = x_pos - grid.xcell[i] - Vx
                    deltay = y_pos - grid.ycell[i] - Vy

                    Sij[i] = grid.res**2 * np.exp((-deltax**2)/(2*deviation_x**2)) * \
                                np.exp((-deltay**2)/(2*deviation_y**2)) /\
                                     (2*np.pi*deviation_x*deviation_y)

                try:
                    Sij /= np.sum(Sij)
                except RuntimeWarning:
                    rospy.logerr(
                        "All values of Sij = 0. sx and/or sy has to be changed"
                    )

                if detection:
                    beta = beta + Sij
                else:
                    gamma = gamma * (1 - grid.mu * Sij)

            if self.L != K:

                if detection:
                    beta /= (K - self.L)
                    alpha_k = grid.M * beta * alpha
                else:
                    alpha_k = (grid.M / np.sum(gamma)) * gamma * alpha

                alpha_k = alpha_k / np.sum(alpha_k)
                alpha = alpha_k

                # Probability map is scaled up by a factor to show the color in Rviz
                # Occupancy map supports only integers from 0-100
                prob.data = (alpha * decimal_shifter).astype(np.int8).tolist()
                prob.data = [100 if x > 100 else x for x in prob.data]

            if self.verbose:
                if self.L != 0:
                    rospy.loginfo("self.L = %d", self.L)

            prob_pub.publish(prob)
            prob_val.publish(Float64(np.max(alpha)))

            r.sleep()