Example #1
def main():
    """Run Forzen Lake Value Iteration"""
 
    if not os.path.exists('output'):
        os.makedirs('output')

    pHeads = [0.4, 0.5, 0.6]
    for p in pHeads:
        GamblerValueIteration(GamblerEnv(p_heads=p), "GamblerHProb", p)

    maxCapital = [512, 777, 1023, 1024, 1025]
    for maxCap in maxCapital:
        GamblerCapitalValue(GamblerEnv(max_capital=maxCap), "GamblerCapVal", maxCap)

    GamblerTime("GamblerTime")
    
    env405 = gym.make("FrozenLake-v0", desc=frozen_lake.generate_random_map(size=4, p=0.5))
    env408 = gym.make("FrozenLake-v0", desc=frozen_lake.generate_random_map(size=4, p=0.8))
    env805 = gym.make("FrozenLake-v0", desc=frozen_lake.generate_random_map(size=8, p=0.5))
    env808 = gym.make("FrozenLake-v0", desc=frozen_lake.generate_random_map(size=8, p=0.8))
    runQ405 = multiprocessing.Process(target=runQ, args=(env405, 5, 4, 0.5, 20000, 1000, ))
    runP405 = multiprocessing.Process(target=runP, args=(env405, 5, 4, 0.5, 20000, ))
    runV405 = multiprocessing.Process(target=runV, args=(env405, 5, 4, 0.5, 20000, ))
    runQ405.start()
    runP405.start()
    runV405.start()
    runQ408 = multiprocessing.Process(target=runQ, args=(env408, 5, 4, 0.8, 20000, 1000, ))
    runP408 = multiprocessing.Process(target=runP, args=(env408, 5, 4, 0.8, 20000, ))
    runV408 = multiprocessing.Process(target=runV, args=(env408, 5, 4, 0.8, 20000, ))
    runQ408.start()
    runP408.start()
    runV408.start()
    runQ805 = multiprocessing.Process(target=runQ, args=(env805, 5, 8, 0.5, 20000, 1000, ))
    runP805 = multiprocessing.Process(target=runP, args=(env805, 5, 8, 0.5, 20000, ))
    runV805 = multiprocessing.Process(target=runV, args=(env805, 5, 8, 0.5, 20000, ))
    runQ805.start()
    runP805.start()
    runV805.start()
    runQ808 = multiprocessing.Process(target=runQ, args=(env808, 5, 8, 0.8, 20000, 1000, ))
    runP808 = multiprocessing.Process(target=runP, args=(env808, 5, 8, 0.8, 20000, ))
    runV808 = multiprocessing.Process(target=runV, args=(env808, 5, 8, 0.8, 20000, ))
    runQ808.start()
    runP808.start()
    runV808.start()

    runQ405.join()
    runP405.join()
    runV405.join()
    runQ408.join()
    runP408.join()
    runV408.join()
    runQ805.join()
    runP805.join()
    runV805.join()
    runQ808.join()
    runP808.join()
    runV808.join()
Example #2
def find_good_maps(map_p=0.8):
    sizes = MAP_SIZES
    # sizes = [4, 8]
    seeds = range(20)
    best_maps = {}

    for size in sizes:
        smallest_lost_games_perc = float('inf')
        best_map = None
        for seed in seeds:
            print(f'Finding best maps with size {size} (seed {seed})...')
            np.random.seed(seed)
            map = generate_random_map(size=size, p=map_p)
            env = FrozenLakeEnv(desc=map)
            optimal_policy, optimal_value_function = value_iteration(
                env, theta=0.0000001, discount_factor=0.999)
            optimal_policy_flat = np.where(optimal_policy == 1)[1]
            mean_number_of_steps, lost_games_perc = score_frozen_lake(
                env, optimal_policy_flat)
            if lost_games_perc < smallest_lost_games_perc:
                smallest_lost_games_perc = lost_games_perc
                best_map = map
        best_maps[size] = {
            'lost_games_perc': smallest_lost_games_perc,
            'map': best_map
        }

    with open(f'best_maps_{map_p}.json', "wb") as f:
        f.write(json.dumps(best_maps).encode("utf-8"))
    return best_maps
Example #3
    def get_env(self):

        random_map = generate_random_map(size=self.size, p=0.8)
        env = gym.make("FrozenLake-v0", desc=random_map)
        env.reset()
        env.render()
        return env
Example #4
def test_frozenlake_dfs_map_generation():
    def frozenlake_dfs_path_exists(res):
        frontier, discovered = [], set()
        frontier.append((0, 0))
        while frontier:
            r, c = frontier.pop()
            if not (r, c) in discovered:
                discovered.add((r, c))
                directions = [(1, 0), (0, 1), (-1, 0), (0, -1)]
                for x, y in directions:
                    r_new = r + x
                    c_new = c + y
                    if r_new < 0 or r_new >= size or c_new < 0 or c_new >= size:
                        continue
                    if res[r_new][c_new] == "G":
                        return True
                    if res[r_new][c_new] not in "#H":
                        frontier.append((r_new, c_new))
        return False

    map_sizes = [5, 10, 200]
    for size in map_sizes:
        new_frozenlake = generate_random_map(size)
        assert len(new_frozenlake) == size
        assert len(new_frozenlake[0]) == size
        assert frozenlake_dfs_path_exists(new_frozenlake)
Example #5
def SecondGridWorld():
    #Create frozen lake env
    rand_map = generate_random_map(size=30, p=.8)
    env = gym.make("FrozenLake-v0")
    env.reset()

    nA, nS = env.nA, env.nS
    T = np.zeros([nA, nS, nS])
    R = np.zeros([nS, nA])

    for s in range(nS):
        for a in range(nA):
            transitions = env.P[s][a]
            for p_trans, next_s, rew, done in transitions:
                T[a, s, next_s] += p_trans
                R[s, a] = rew
            T[a, s, :] /= np.sum(T[a, s, :])

    q = mdp.QLearning(T, R, .98)
    q.run()
    qdf = pd.DataFrame(q.run_stats)
    qdf.to_csv(
        "C:/Users/wtomjack/.spyder/CS-7641-/ReinforcementLearning/frozenQ.csv")

    pi = mdp.PolicyIteration(T, R, .98)
    pi.run()
    print(len(pi.policy))

    vi = mdp.ValueIteration(T, R, .98)
    vi.run()
    print(len(vi.policy))
Example #6
def test_frozenlake_dfs_map_generation():

    def frozenlake_dfs_path_exists(res):
        frontier, discovered = [], set()
        frontier.append((0,0))
        while frontier:
            r, c = frontier.pop()
            if not (r,c) in discovered:
                discovered.add((r,c))
                directions = [(1, 0), (0, 1), (-1, 0), (0, -1)]
                for x, y in directions:
                    r_new = r + x
                    c_new = c + y
                    if r_new < 0 or r_new >= size or c_new < 0 or c_new >= size:
                        continue
                    if res[r_new][c_new] == 'G':
                        return True
                    if (res[r_new][c_new] not in '#H'):
                        frontier.append((r_new, c_new))
        return False

    map_sizes = [5, 10, 200]
    for size in map_sizes:
        new_frozenlake = generate_random_map(size)
        assert len(new_frozenlake) == size
        assert len(new_frozenlake[0]) == size
        assert frozenlake_dfs_path_exists(new_frozenlake)
Example #7
    def init_envs(self, env):
        if env == 'TH':
            self.key = 'TH'
            self.noise = 0.0
            # self.N_range = list(range(2, 6))
            print("initiaiting envs: \n")
            for N in tqdm.tqdm(self.N_range):
                state_N = tuple(range(N, -1, -1))
                # env = TohEnv(initial_state=((3, 2, 1, 0), (), ()), goal_state=((), (), (3, 2, 1, 0)), noise=0)

                env = TohEnv(initial_state=(state_N, (), ()),
                             goal_state=((), (), state_N),
                             noise=self.noise)
                self.env_list.append(env)
                self.env = self.env_list[0]

        elif env == 'FL':
            self.key = 'FL'
            self.FL_maps = {}
            # self.N_range = list(range(4,20))
            print("initiaiting envs: \n")
            for N in tqdm.tqdm(self.N_range):
                np.random.seed(777)
                self.noise = 0.8
                self.FL_maps[N] = generate_random_map(size=N, p=self.noise)
                self.env_list.append(
                    gym.make("FrozenLake-v0", desc=self.FL_maps[N]))
                self.env = self.env_list[0]
        else:
            raise KeyError
Example #8
    def __init__(self, desc=None, map_name="4x4", is_slippery=True):
        if desc is None and map_name is None:
            desc = generate_random_map()
        elif desc is None:
            desc = MAPS[map_name]
        self.desc = desc = np.asarray(desc, dtype='c')
        self.nrow, self.ncol = nrow, ncol = desc.shape
        self.reward_range = (0, 1)

        nA = 4
        nS = nrow * ncol

        isd = np.array(desc == b'S').astype('float64').ravel()
        isd /= isd.sum()

        P = {s: {a: [] for a in range(nA)} for s in range(nS)}

        def to_s(row, col):
            return row * ncol + col

        def inc(row, col, a):
            if a == LEFT:
                col = max(col - 1, 0)
            elif a == DOWN:
                row = min(row + 1, nrow - 1)
            elif a == RIGHT:
                col = min(col + 1, ncol - 1)
            elif a == UP:
                row = max(row - 1, 0)
            return row, col

        for row in range(nrow):
            for col in range(ncol):
                s = to_s(row, col)
                for a in range(4):
                    li = P[s][a]
                    letter = desc[row, col]
                    if letter in b'GH':
                        li.append((1.0, s, 0, True))
                    else:
                        newrow, newcol = inc(row, col, a)
                        newstate = to_s(newrow, newcol)
                        newletter = desc[newrow, newcol]
                        done = bytes(newletter) in b'GH'
                        rew = float(newletter == b'G')
                        li.append((1.0, newstate, rew, done))

        self.P = P
        self.isd = isd
        self.lastaction = None  # for rendering
        self.nS = nS
        self.nA = nA

        self.action_space = spaces.Discrete(self.nA)
        self.observation_space = spaces.Discrete(self.nS)

        self.seed()
        self.s = categorical_sample(self.isd, self.np_random)

        super(FL, self).__init__(nS, nA, P, isd)
Example #9
    def __init__(self, size=10, p=0.8):
        self.name = 'frozenlake'
        self.size = size
        random_map = generate_random_map(size=size, p=p)
        self.env = gym.make("FrozenLake-v0", desc=random_map, is_slippery=True)
        self.env.seed(123)
        self.env.action_space.np_random.seed(123)
        self.env._max_episode_steps = 20000
        self.prob = self.probability_matrix()
        self.rewards = self.rewards_matrix()
        self.env.render()
Example #10
def get_env(size,
            p=(1 - 1.0 / 40),
            one_hot_obs=True,
            neg_dead_rew=True,
            is_slippery=True):
    random_map = generate_random_map(size=size, p=p)
    env = gym.make("FrozenLake-v0", desc=random_map, is_slippery=is_slippery)
    if neg_dead_rew:
        env = NegativeOnDeadWrapper(env)
    if one_hot_obs:
        env = Int2OneHotWrapper(env)
    return env
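`NegativeOnDeadWrapper` and `Int2OneHotWrapper` are project-specific wrappers that are not shown in this excerpt. A minimal sketch of what the one-hot observation wrapper might look like, assuming it simply expands the discrete state index into a one-hot vector:

# Hypothetical sketch; the project's actual Int2OneHotWrapper may differ.
import gym
import numpy as np


class Int2OneHotWrapper(gym.ObservationWrapper):
    def __init__(self, env):
        super(Int2OneHotWrapper, self).__init__(env)
        n = env.observation_space.n
        self.observation_space = gym.spaces.Box(low=0.0, high=1.0, shape=(n,), dtype=np.float32)

    def observation(self, observation):
        # Expand the integer state index into a one-hot vector.
        one_hot = np.zeros(self.observation_space.shape, dtype=np.float32)
        one_hot[observation] = 1.0
        return one_hot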
Example #11
    def __init__(self):
        random_map = generate_random_map(size=8, p=0.8)
        self.env = gym.make("FrozenLake-v0", is_slippery=True, desc=random_map)
        self.env.reset()

        self.epsilon = INITIAL_EPSILON
        self.learning_rate = INITIAL_LEARNING_RATE

        self.action_space = self.env.action_space.n
        self.state_space = [self.env.observation_space.n]
        self.q_table = np.random.uniform(low=-2,
                                         high=0,
                                         size=(self.state_space +
                                               [self.action_space]))
Example #12
    def __init__(self, desc=None, map_name="4x4", is_slippery=True):
        if desc is None and map_name is None:
            desc = generate_random_map()
        elif desc is None:
            desc = copy.deepcopy(MAPS[map_name])

        self.initial_desc = copy.deepcopy(desc)

        self.map_name = map_name
        self.desc = desc = np.asarray(desc, dtype='c')
        self.nrow, self.ncol = desc.shape
        self.reward_range = (0, 1)
        self.max_episode_steps = 100 if map_name == '4x4' else 1000
        self.num_steps = 0
        self.done = False
        self.successful_attack = False

        self.lastaction_a = None
        self.lastaction_d = None
        self.lastplayer = None

        self.nA_a = 4
        self.nS_a = self.nrow * self.ncol

        self.goal = np.nan
        self.holes = []
        for row in range(self.nrow):
            for col in range(self.ncol):
                s = self.to_s(row, col)
                letter = desc[row, col]
                if letter in b'H':
                    self.holes.append(s)
                if letter in b'G':
                    self.goal = s

        # Number of defender states: the sum of (num_cells - 1)**k for k = 1..num_holes
        self.nS_d = np.sum([((self.ncol * self.nrow) - 1)**(j + 1)
                            for j in range(len(self.holes))])
        self.nA_d = len(self.holes) * 4

        self.action_space_a = spaces.Discrete(self.nA_a)
        self.observation_space_a = spaces.Discrete(self.nS_a)

        self.action_space_d = spaces.Discrete(self.nA_d)
        self.observation_space_d = spaces.Discrete(self.nS_d)

        self.seed()
        self.s_a = 0
        self.s_d = self.to_s_d()
Example #13
def run_experiments_part1():
    try:
        os.makedirs('report/images', exist_ok=True)
        print("Directory created successfully.")
    except OSError as error:
        print("Directory '%s' can not be created")
    print('STARTING EXPERIMENTS')
    env = gym.make("FrozenLake-v0")
    Frozen_Lake_Experiments(env=env, environment="FrozenLake-v0")

    random_map = generate_random_map(size=40, p=0.8)
    env = gym.make("FrozenLake-v0", desc=random_map)
    Frozen_Lake_Experiments(env=env, environment="FrozenLake-40x40")

    Taxi_Experiments()
    print('END OF EXPERIMENTS')
Example #14
def custom_frozen_lake(size=8, p=0.8, nondeterministic=False):
    """
    Create a custom-sized frozen-lake environment

    :param size: size x size lake
    :param p: probability of creating a frozen tile
    :param nondeterministic: if True, the lake is slippery (stochastic transitions)
    :return: environment

    based on:
        https://reinforcementlearning4.fun/2019/06/24/create-frozen-lake-random-maps/
    """
    random_map = generate_random_map(size=size, p=p)
    fl = gym.make('FrozenLake-v0',
                  desc=random_map,
                  is_slippery=nondeterministic)
    return fl
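A brief usage sketch of the helper above (parameter values chosen for illustration only):

# Illustrative usage of custom_frozen_lake.
env = custom_frozen_lake(size=12, p=0.9, nondeterministic=False)
state = env.reset()
state, reward, done, info = env.step(env.action_space.sample())
env.render()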
Example #15
    def __init__(self):
        random_map = generate_random_map(size=6, p=0.8)
        self.env = gym.make("FrozenLake-v0", is_slippery=True, desc=random_map)
        self.env.reset()

        self.state_space = self.env.observation_space.n
        self.action_space = self.env.action_space.n

        self.epsilon = INITIAL_EPSILON

        # Main model
        self.model = self._create_model()

        # Target network
        self.target_model = self._create_model()
        self.target_model.set_weights(self.model.get_weights())

        # An array with last n steps for training
        self.replay_memory = deque(maxlen=REPLAY_MEMORY_SIZE)

        # Used to count when to update target network with main network's weights
        self.target_update_counter = 0
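The `_create_model` helper is not included in this excerpt. A minimal sketch of such a network, assuming the Keras imports used in Example #30 (`Sequential`, `Dense`, `Adam`) and a one-hot encoded state as input:

    # Hypothetical sketch of the missing _create_model helper; the original project may differ.
    def _create_model(self):
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_space, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_space, activation='linear'))
        model.compile(loss='mse', optimizer=Adam(0.001))
        return model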
Example #16
def myFrozenLake(size=8,
                 randomMap=True,
                 slippery=False,
                 rewarding=False,
                 equiProbable=True,
                 frozenProb=0.9,
                 seed=RS):
    setMySeed(seed)
    if randomMap:
        map = generate_random_map(size, frozenProb)
        env_name = "MyFrozenLakeMap_size_{}_seed_{}-v0".format(size, seed)
        deleteEnvironment(env_name)
        joblib.dump(map, env_name)
        gym.envs.register(id=env_name,
                          entry_point='gymEnvs:MyFrozenLakeEnv',
                          kwargs={
                              'desc': map,
                              'is_slippery': slippery,
                              'rewarding': rewarding,
                              'equiProbable': equiProbable
                          },
                          max_episode_steps=size**4,
                          reward_threshold=0.78)
        return gym.make(env_name)
    else:
        env_name = "MyFrozenLakeMapCustom-v0".format(size, seed)
        deleteEnvironment(env_name)
        gym.envs.register(id=env_name,
                          entry_point='gymEnvs:MyFrozenLakeEnv',
                          kwargs={
                              'map_name': '20x20',
                              'is_slippery': slippery,
                              'rewarding': rewarding,
                              'equiProbable': equiProbable
                          },
                          max_episode_steps=1000,
                          reward_threshold=0.78)
        return gym.make(env_name)
Example #17
def getEnv(env_id='default',
           rH=0,
           rG=1,
           rF=0,
           size=4,
           map_name='4x4',
           is_slippery=True,
           render_initial=True,
           desc=None):

    if env_id in gym.envs.registry.env_specs:
        del gym.envs.registry.env_specs[env_id]

    map_desc = frozen_lake.generate_random_map(size) if not desc else desc

    register(
        id=env_id,  # name given to this new environment
        entry_point='my_env:CustomizedFrozenLake',  # env entry point
        kwargs={
            'rH': rH,
            'rG': rG,
            'rF': rF,
            'desc': map_desc,
            'map_name': map_name,
            'is_slippery': is_slippery
        }  # argument passed to the env
    )

    this_env = make(env_id)
    this_env.seed(5)

    if render_initial:
        print('--Board--')
        this_env.render()
        print('\n--Actions for Position to the Left of the Goal--')
        pprint(this_env.P[this_env.nS - 2])

    return this_env
Example #18
    def __init__(self,
                 map_size=30,
                 map_prob=0.9,
                 is_slippery=True,
                 alt_reward=True):
        desc = generate_random_map(size=map_size, p=map_prob)
        self.desc = desc = np.asarray(desc, dtype='c')
        self.nrow, self.ncol = nrow, ncol = desc.shape
        self.actions_symbols = {0: "◄", 1: "▼", 2: "►", 3: "▲"}
        self.actions_symbols2 = {0: "←", 1: "↓", 2: "→", 3: "↑"}
        self.actions_text = {0: "left", 1: "down", 2: "right", 3: "up"}

        if alt_reward:
            self.reward_range = (-100, 100)
        else:
            self.reward_range = (0, 1)

        nA = 4
        nS = nrow * ncol

        isd = np.array(desc == b'S').astype('float64').ravel()
        isd /= isd.sum()

        P = {s: {a: [] for a in range(nA)} for s in range(nS)}

        def to_s(row, col):
            return row * ncol + col

        def inc(row, col, a):
            if a == LEFT:
                col = max(col - 1, 0)
            elif a == DOWN:
                row = min(row + 1, nrow - 1)
            elif a == RIGHT:
                col = min(col + 1, ncol - 1)
            elif a == UP:
                row = max(row - 1, 0)
            return row, col

        for row in range(nrow):
            for col in range(ncol):
                s = to_s(row, col)
                for a in range(4):
                    li = P[s][a]
                    letter = desc[row, col]
                    if letter in b'GH':
                        li.append((1.0, s, 0, True))
                    else:
                        if is_slippery:
                            for b in [(a - 1) % 4, a, (a + 1) % 4]:
                                newrow, newcol = inc(row, col, b)
                                newstate = to_s(newrow, newcol)
                                newletter = desc[newrow, newcol]
                                done = bytes(newletter) in b'GH'
                                if newletter == b'G':
                                    rew = 100.0 if alt_reward else 1.0
                                elif newletter == b'H':
                                    rew = -100.0 if alt_reward else 0.0
                                else:
                                    rew = -1.0 if alt_reward else 0.0
                                li.append((1.0 / 3.0, newstate, rew, done))
                        else:
                            newrow, newcol = inc(row, col, a)
                            newstate = to_s(newrow, newcol)
                            newletter = desc[newrow, newcol]
                            done = bytes(newletter) in b'GH'
                            if newletter == b'G':
                                rew = 100.0 if alt_reward else 1.0
                            elif newletter == b'H':
                                rew = -100.0 if alt_reward else 0.0
                            else:
                                rew = -1.0 if alt_reward else 0.0
                            li.append((1.0, newstate, rew, done))

        super(FrozenLakeModified, self).__init__(nS, nA, P, isd)
Example #19
    plt.ylabel('Average Reward')
    if episode is None:
        f_name = 'base.png'
    else:
        f_name = 'output/{}-{}-episode.png'.format(step, episode)
    plt.savefig(f_name, format='png')
    plt.clf()


start = time.time()
# Environment initialization
folder = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                      'q_learning')
folder2 = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'output')

random_map = generate_random_map(size=64, p=0.95)
env = gym.wrappers.Monitor(gym.make('FrozenLake8x8-v0',
                                    desc=random_map,
                                    map_name=None),
                           folder,
                           force=True)

# Q and rewards
if parent_step is None:
    # Q = np.zeros((env.observation_space.n, env.action_space.n))
    Q = np.random.rand(env.observation_space.n, env.action_space.n) * 0.0001
else:
    Q = np.loadtxt("output/{}.csv".format(parent_step), delimiter=',')
rewards = []
iterations = []
Example #20
def learning_experiments():
    policy_iteration_times = np.zeros((1000, 10))
    n_iterations = np.zeros((1000, 10))
    for i, gamma in enumerate(np.linspace(0.1, 0.99, 10)):
        for states in range(2, 1000):
            P, R = example.forest(S=states)
            pi = mdp.mdp.PolicyIteration(P, R, gamma, max_iter=10000)
            pi.run()
            policy_iteration_times[states, i] = pi.time
            n_iterations[states, i] = pi.iter

    np.save(f'{PATH}/policy_iteration_times_forest.npy',
            policy_iteration_times)
    np.save(f'{PATH}/policy_iteration_n_iter_forest.npy', n_iterations)

    # In[96]:

    value_iteration_times = np.zeros((1000, 10, 10))
    n_iterations = np.zeros((1000, 10, 10))
    for j, epsilon in enumerate(np.linspace(0.1, 0.99, 10)):
        for i, gamma in enumerate(np.linspace(0.1, 0.99, 10)):
            for states in range(2, 1000):
                P, R = example.forest(S=states)
                pi = mdp.mdp.ValueIteration(P,
                                            R,
                                            discount=gamma,
                                            max_iter=10000,
                                            epsilon=epsilon)
                pi.run()
                value_iteration_times[states, i, j] = pi.time
                n_iterations[states, i, j] = pi.iter

    np.save(f'{PATH}/value_iteration_times_forest.npy', value_iteration_times)
    np.save(f'{PATH}/value_iteration_n_iter_forest.npy', n_iterations)

    # In[108]:

    Q_iteration_times = np.zeros((1000, 10))
    n_iterations = np.zeros((1000, 10))
    for i, gamma in enumerate(np.linspace(0.1, 0.99, 10)):
        for states in range(2, 1000):
            P, R = example.forest(S=states)
            pi = mdp.mdp.QLearning(P, R, discount=gamma, n_iter=10000)
            pi.run()
            Q_iteration_times[states, i] = pi.time
            n_iterations[states, i] = pi.mean_discrepancy

    np.save(f'{PATH}/Q_iteration_times_forest.npy', Q_iteration_times)
    np.save(f'{PATH}/Q_iteration_n_iter_forest.npy', n_iterations)

    # ## MDP 2: FrozenLake

    # In[98]:
    # In[109]:

    from gym.envs.toy_text.frozen_lake import generate_random_map

    Q_iteration_times = np.zeros((100, 10, 10))
    Q_rewards = np.zeros((100, 10, 10))

    value_n_iterations = np.zeros((100, 10, 10))
    policy_n_iterations = np.zeros((100, 10, 10))
    total_states = np.zeros(100)
    for size in range(2, 100, 5):
        for i, gamma in enumerate(np.linspace(0, 1, 10)):
            for j, epsilon in enumerate(np.linspace(0, 1, 10)):
                random_map = generate_random_map(size=size, p=0.8)
                environment = gym.make('FrozenLake-v0', desc=random_map)
                test = QLearner(0.1, gamma, epsilon, verbose=False)
                start = time.time()
                n = test.learn(50)
                Q_iteration_times[size, i, j] = time.time() - start
                Q_rewards[size, i, j] = n[-1]

    np.save(f'{PATH}/Q_iteration_times_grid.npy', Q_iteration_times)
    np.save(f'{PATH}/Q_iteration_rewards_grid.npy', Q_rewards)

    # In[106]:

    value_iteration_times = np.zeros((100, 10))
    policy_iteration_times = np.zeros((100, 10))

    value_n_iterations = np.zeros((100, 10))
    policy_n_iterations = np.zeros((100, 10))
    total_states = np.zeros(100)
    for size in range(2, 100, 5):
        for i, gamma in enumerate(np.linspace(0, 1, 10)):
            random_map = generate_random_map(size=size, p=0.8)
            environment = gym.make('FrozenLake-v0', desc=random_map)
            total_states[size] = environment.nS
            agent = BasicLearner(environment, environment.nS, environment.nA,
                                 5000, gamma)
            start = time.time()
            opt_v2, opt_policy2, value_iter = agent.value_iteration()
            value_iteration_times[size, i] = time.time() - start
            value_n_iterations[size, i] = value_iter

            start = time.time()
            opt_v2, opt_policy2, policy_iter = agent.policy_iteration()
            policy_iteration_times[size, i] = time.time() - start
            policy_n_iterations[size, i] = policy_iter

    np.save(f'{PATH}/num_states_grid.npy', total_states)
    np.save(f'{PATH}/policy_iteration_times_grid.npy', policy_iteration_times)
    np.save(f'{PATH}/value_iteration_times_grid.npy', value_iteration_times)
    np.save(f'{PATH}/value_iteration_n_iter_grid.npy', value_n_iterations)
    np.save(f'{PATH}/policy_iteration_n_iter_grid.npy', policy_n_iterations)
Example #21
File: main.py  Project: Cphrampus/CS7641
import time

import gym
import numpy as np
from gym.envs.toy_text.frozen_lake import generate_random_map
from hiive.mdptoolbox import mdp, example
import matplotlib
matplotlib.use("TKAgg")

import matplotlib.pyplot as plt
import re

if __name__ == '__main__':
    start = time.time()
    np.random.seed(0)

    # TODO probably should make multiple sizes, will make tuning ql a pain because so many iterations will be needed
    # grid world, frozen lake, small 225 states
    lake_size = 15
    random_map = generate_random_map(size=lake_size, p=0.8)

    env = gym.make("FrozenLake-v0", desc=random_map)
    env.reset()
    env.render()

    num_states = len(env.env.P)
    num_actions = len(env.env.P[0])

    transitions = np.zeros((num_actions, num_states, num_states))
    rewards = np.zeros((num_states, num_actions))

    # convert transition matrix dict of dicts of lists to rewards matrix
    # frozen lake has a mostly 0 matrix, might be worth looking at sparse if it gets really big
    for state in env.env.P:
        for action in env.env.P[state]:
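The excerpt is cut off inside the conversion loop; a possible completion, reusing the same `env.P` unpacking pattern as Examples #5 and #26 (assumed, not taken from the original file):

    # Possible completion of the conversion loop (illustrative).
    for state in env.env.P:
        for action in env.env.P[state]:
            for prob, next_state, reward, done in env.env.P[state][action]:
                transitions[action, state, next_state] += prob
                rewards[state, action] = reward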
Example #22
#    "FFFFFHFFFFFFFFFFHHFFFHFFFFFFFF",
#    "FFFFFFFFFFFHHFFFFFHHHFFHHFFFFF",
#    "HHHHFHFFFFFFFFFFHHFFFFFFFFFFFF",
#    "FFFFFHFFFFHHHFFFFFFFFFFFHFFFFF",
#    "FFFFFFFFFFFFFFFFHHFFFHFFFFFFFF",
#    "FFFFFHFFFFFFHFFFHHFFFFHHFFFFFF",
#    "FFFFFHFFFFFFFFFFHHFFFFFFFFHFFF",
#    "FFFFFFFFFFFHFFFFFFFFFFFFFFFFFF",
#    "FHHFFFHFFFFHFFFFFHFFFFHHFFFFFF",
#    "FHHFHFHFFFFFFFFFFFFFFFFFFFFFFF",
#    "FFFHFFFFFHFFFFHHFHFHFFFHHHHFFG"
#]

#env = gym.make('FrozenLake-v0')

custom_map = generate_random_map(size=30, p=0.8)

env = gym.make("FrozenLake-v0", desc=custom_map)

# from https://learning.oreilly.com/library/view/hands-on-reinforcement-learning/9781788836524/e8ad36d5-21fe-442f-8133-3cee6bf31b2e.xhtml


def value_iteration(env, gamma=1.0):
    aa = []

    value_table = np.zeros(env.observation_space.n)
    no_of_iterations = 100000
    threshold = 1e-10
    for i in range(no_of_iterations):
        print(i)
        updated_value_table = np.copy(value_table)
Example #23
def run_fl(size):
    seed_val = 42
    np.random.seed(seed_val)
    random.seed(seed_val)
    if size == 4:
        env = gym.make("FrozenLake-v0")
    else:
        seed_val = 58
        np.random.seed(seed_val)
        random.seed(seed_val)
        dim = size
        random_map = generate_random_map(size=dim, p=0.8)

        env = gym.make("FrozenLake-v0", desc=random_map)
    env.seed(seed_val)
    env.reset()
    # env.render()

    # env = gym.make('FrozenLake8x8-v0')
    env = env.unwrapped

    nA = env.action_space.n
    nS = env.observation_space.n

    best_V = ''
    best_won = -1
    best_policy = []

    gammas = [0.1, 0.3, 0.4, 0.7, 0.9, 0.99]
    epsilons = [0.1,
                0.01,
                0.001,
                0.0001,
                0.00001,
                0.000001,
                0.0000001,
                0.00000001,
                0.000000001,
                0.0000000001]

    gammas = [0.3]
    epsilons = [0.0001]

    per_won_hm = np.zeros((len(gammas), len(epsilons)))
    iters_hm = np.zeros((len(gammas), len(epsilons)))
    time_hm = np.zeros((len(gammas), len(epsilons)))

    g_cnt = 0
    e_cnt = 0
    best_e = 0
    best_g = 0
    for g in gammas:
        e_cnt = 0
        for e in epsilons:
            if g >= 0.99 and e <= 0.001:
                per_won_hm[g_cnt][e_cnt] = 0
                iters_hm[g_cnt][e_cnt] = 0
                time_hm[g_cnt][e_cnt] = 0
            else:
                start = time.time()
                V = np.zeros(nS)
                policy = np.zeros(nS)
                policy_stable = False
                it = 0

                while not policy_stable:
                    policy_evaluation(env, V, policy, nS, e, g)
                    policy_stable = policy_improvement(env, V, policy, nA, nS, g)
                    it += 1
                run_time = time.time() - start
                per_won = run_pi_episodes(env, V, policy, 10)

                per_won_hm[g_cnt][e_cnt] = per_won
                iters_hm[g_cnt][e_cnt] = it
                time_hm[g_cnt][e_cnt] = run_time * 1000
                print(g, e, it, per_won)
                if per_won > best_won:
                    best_e = e
                    best_g = g
                    best_V = V
                    best_policy = policy
                    best_won = per_won
            e_cnt += 1
        g_cnt += 1

    # Plot Percent Games Won Heatmap
    fig, ax = plt.subplots()

    im, cbar = heatmap(per_won_hm, gammas, epsilons, ax=ax,
                       cmap="YlGn", cbarlabel="% Games Won")
    texts = annotate_heatmap(im, valfmt="{x:.2f}")

    fig.tight_layout()
    plt.savefig('Images\\PI-FL-Per_Heatmap' + str(size) + '.png')
    plt.show()

    # Plot Iterations Heatmap
    fig, ax = plt.subplots()

    im, cbar = heatmap(iters_hm, gammas, epsilons, ax=ax,
                       cmap="YlGn", cbarlabel="# of Iterations to Convergence")
    texts = annotate_heatmap(im, valfmt="{x:.0f}")

    fig.tight_layout()
    plt.savefig('Images\\PI-FL-Iter_Heatmap' + str(size) + '.png')
    plt.show()

    # Plot Run time Heatmap
    fig, ax = plt.subplots()

    im, cbar = heatmap(time_hm, gammas, epsilons, ax=ax,
                       cmap="YlGn", cbarlabel="Runtime (ms)")
    texts = annotate_heatmap(im, valfmt="{x:.0f}")

    fig.tight_layout()
    plt.savefig('Images\\PI-FL-Time_Heatmap' + str(size) + '.png')
    plt.show()

    # Plot Optimal state values with directions

    plot_values(best_V, best_policy, size)

    print(best_V.reshape((size, size)))
    print(best_policy.reshape((size, size)))
    print(best_e, best_g, best_won)
Example #24
            return policy, V_ARR, V_SUM


np.random.seed(1111)

#
# Different sizes
#

N_ITERS = []
SIZE = np.arange(10, 40, 1)
TIME_ARR = []
for size in SIZE:
    print(size)
    np.random.seed(1111)
    random_map = generate_random_map(size=size, p=0.85)
    env = gym.make("FrozenLake-v0", desc=random_map, is_slippery=False)
    env.reset()
    time0 = time.time()
    policy, V_ARR, V_SUM = policy_improvement(env, discount_factor=0.99)
    time1 = time.time()
    N_ITERS.append(len(V_SUM))
    TIME_ARR.append(time1 - time0)

fig, ax = plt.subplots()
ax.plot(SIZE,
        N_ITERS,
        color='red',
        label="Number of iterations",
        linewidth=2.0,
        linestyle='-')
Example #25
import gym
from gym import utils
import sys
import numpy as np
from gym.envs.toy_text.frozen_lake import generate_random_map

m = generate_random_map(8)

m_small = [
    "SHFHFHFF",
    "FFHFFFFF",
    "FFFFFFFF",
    "FFHFFFFF",
    "FFHHHFHF",
    "FFFFFFHF",
    "HHFFFFHF",
    "FHFHFFFG",
]

m_large = [
    "SFHFFFFFFFFFFFHFHFFFFFFFFFHFFFFF",
    "FFFFFFHFFFFFFFFFHFFHFFFFFFFFFFHF",
    "HFFFFFFFHFFFFFHHFHFFFFFFFFFFFFHF",
    "FFFFFFFFFFFFFFHFFFFFFFFHFFFHHFFF",
    "HFFFFHFFFFFFHFHFFFHFHFFFFFFFHFFF",
    "HFFFFFFHFFHFFFFFFFFFFFFFFFFFFFFF",
    "FFFFFFFFFFFHHHFFHFFFFFFFFFFFFFHF",
    "FFFFFFFHHFFFFFHFFFFFFFFFFHFFFHFF",
    "FFFFFFFFFFFFHFFFFHFFFHFFFFFFFFHH",
    "HFFFFFFFFFFFFFFFHFHFFFFFFFFFFFHF",
    "HFFFFFFFFFFFFHFFHFFHFFFFFFFFHHHF",
Example #26
import mdp_copy
import numpy as np
import gym
from gym.envs.toy_text.frozen_lake import generate_random_map
from matplotlib import pyplot as plt
import seaborn as sns

ACTION_MAP = ['<', 'V', '>', '^']

if __name__ == '__main__':
    np.random.seed(300)
    grid_size = 30
    random_map = generate_random_map(size=grid_size)
    env = gym.make("FrozenLake-v0", desc=random_map)
    action_space = env.action_space.n
    observation_space = env.observation_space.n
    T = np.zeros((action_space, observation_space, observation_space))
    R = np.zeros((observation_space))
    for state in env.env.P.keys():
        choices = env.env.P[state]
        for action in choices.keys():
            outcomes = choices[action]
            for outcome in outcomes:
                prob, next_state, reward, terminal = outcome
                T[action][state][next_state] += prob
                if not terminal or state != next_state:
                    R[next_state] = reward
    R.reshape((grid_size, grid_size))

    ### 0.9 discount
Example #27
                        size=14)
            else:
                ax.text(j,
                        i,
                        mapping[actions[i, j]],
                        ha='center',
                        va='center',
                        color='k',
                        size=12)
    fig.tight_layout()
    plt.show()


if __name__ == '__main__':
    env_name = 'FrozenLake8x8-v0'
    new_map = generate_random_map(size=30, p=0.9)
    conveged_at_determined = []
    policy_average_score_determined = []
    conveged_at_stochastic = []
    policy_average_score_stochastic = []
    gamma_list = []

    gamma_range = np.arange(0.1, 1.0, 0.1)

    for i in gamma_range:

        gamma = 1 - i / 100
        gamma_list.append(gamma)
        """ deterministic """
        env1 = gym.make(env_name, is_slippery=False)
        env1.seed(3006)
Example #28
def sixteen_by_sixteen_map():
    return frozen_lake.generate_random_map(size=16)
Example #29
import numpy as np
import gym
from gym import wrappers
from gym.envs.toy_text.frozen_lake import generate_random_map
import seaborn as sns
sns.set()
import matplotlib.pyplot as plt
import random
import pandas as pd
import time

random_map = generate_random_map(size=20, p=0.8)


def run_episode(env, policy, gamma=1.0, render=False):
    """ Runs an episode and return the total reward """
    obs = env.reset()
    total_reward = 0
    step_idx = 0
    while True:
        if render:
            env.render()
        obs, reward, done, _ = env.step(int(policy[obs]))
        #print "Reward received for each action"
        #print reward
        total_reward += (gamma**step_idx * reward)
        #print "Total reward inside whileloop"
        #print total_reward
        step_idx += 1
        if done:
            break
Example #30
import statistics
import random
import numpy as np
import pandas as pd
import gym
from gym.envs.toy_text.frozen_lake import generate_random_map
import matplotlib.pyplot as plt

seed = 0
# Init PRNG
random.seed(seed)
np.random.seed(seed)

evnt = 'FrozenLake-v0'
size = 8
rndm = generate_random_map(size)
env = gym.make(evnt, desc=rndm)
env.seed(0)

import tensorflow as tf
tf.random.set_seed(seed)

from collections import deque
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam


def epsilon_greedy(model, state, epsilon, num_actions):
    greedy = np.random.uniform(0, 1)
Example #31
import gym
import numpy as np
import tensorflow as tf
from gym.envs.toy_text.frozen_lake import generate_random_map

size = 40
random_map = generate_random_map(size=size, p=0.88)
env = gym.make("FrozenLake-v0", desc=random_map)

# reward list, used to evaluate the agent's performance
rList = []

#used for experience replay
experiences = []


def add_state_to_memory(state, reward):
    mem = {"state": state, "reward": reward}

    experiences.append(mem)


def shape_reward(current_reward, current_state, done):
    if current_reward == 1:
        return current_reward

    if done and current_reward == 0:
        return -1.0

    return current_reward
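A possible way to wire these helpers into a rollout loop (illustrative only; the random action choice is a placeholder for the agent's policy):

# Illustrative rollout sketch using the helpers defined above.
state = env.reset()
done = False
while not done:
    action = env.action_space.sample()               # placeholder policy
    next_state, reward, done, _ = env.step(action)
    reward = shape_reward(reward, next_state, done)  # -1 when the agent falls into a hole
    add_state_to_memory(next_state, reward)
    state = next_state
rList.append(reward)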