Example #1
def policy_iteration():
    deltas = {}
    rewards = {}
    for size in PROBLEM_SIZES:
        P, R = forest(S=size, r1=1, r2=5, p=.1)
        pi = PolicyIteration(P, R, 0.9, max_iter=10)
        pi.run()
        delta = [pi.run_stats[i]['Error'] for i in range(len(pi.run_stats))]
        reward = [pi.run_stats[i]['Reward'] for i in range(len(pi.run_stats))]
        deltas[size] = delta
        rewards[size] = reward
        print(pi.policy)
        print(pi.S)

    # forest_plot.plot_pi_forest_convergence_size(rewards)

    deltas = {}
    rewards = {}
    for p in [.2, .1, .05, .01]:
        P, R = forest(S=10, r1=1, r2=5, p=p)
        pi = PolicyIteration(P, R, 0.9, max_iter=10)
        pi.run()
        delta = [pi.run_stats[i]['Error'] for i in range(len(pi.run_stats))]
        reward = [pi.run_stats[i]['Reward'] for i in range(len(pi.run_stats))]
        deltas[p] = delta
        rewards[p] = reward
        print(pi.policy)
        print(pi.S)

    forest_plot.plot_pi_forest_convergence_p(rewards)
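
# forest_plot is the author's own plotting module and its implementation is not part of this
# excerpt. A minimal sketch of what plot_pi_forest_convergence_p might look like is given below;
# the function body is an assumption, not the original code.
import matplotlib.pyplot as plt

def plot_pi_forest_convergence_p(rewards):
    # rewards maps each fire probability p to the per-iteration reward series recorded above
    for p, series in rewards.items():
        plt.plot(range(1, len(series) + 1), series, marker='o', label='p = {}'.format(p))
    plt.xlabel('Iteration')
    plt.ylabel('Reward')
    plt.title('Policy iteration convergence for different fire probabilities')
    plt.legend()
    plt.show()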
Example #2
    def process_environment(self, config, tune_param=0):
        """ Process instances for problem environment """
        print(f"Processing inputs for {self.type} in {self.environment}")

        # Generate environment class
        env_class = Environment(self.name, self.environment, tune_param)

        # Populate with instance
        if self.environment == "forest":
            # Generate fire management problem
            env_class.env = forest(S=config['states'],
                                   r1=config['reward1'],
                                   r2=config['reward2'],
                                   p=config['prob'],
                                   is_sparse=config['is_spare'])
            env_class.transition = env_class.env[0]
            env_class.reward = env_class.env[1]
            self.states = config['states']

        else:
            # Create FrozenLake and process matrices
            lake = frozen_lake.generate_random_map(size=config['size'],
                                                   p=config['prob'])
            env_class.env = frozen_lake.FrozenLakeEnv(lake)
            env_class.process_matrices()
            self.states = config['size']**2

        return env_class
Example #3
def gamma_iter_value():
    gamma = np.arange(0.1, 1.0, 0.1)
    v1_iter = []
    v2_iter = []
    v1_v_mean = []
    v2_v_mean = []
    for g in gamma:
        P, R = forest(num_states, r1, r2, p_fire)
        P2, R2 = forest(num_states, r1, r2, 0.9)
        vi = ValueIteration(P, R, g, 1e-20)
        vi.run()

        vi2 = ValueIteration(P2, R2, g, 1e-20)
        vi2.run()
        v1_iter.append(len(vi.run_stats))
        v2_iter.append(len(vi2.run_stats))
        v1_v_mean.append(vi.run_stats[-1]["Mean V"])
        v2_v_mean.append(vi2.run_stats[-1]["Mean V"])

    # plt.plot(gamma, v1_iter, linestyle='--', marker='o', color='b', label="fire probability = 0.1")
    # plt.plot(gamma, v2_iter, linestyle='--', marker='o', color='r', label="fire probability = 0.9")
    # plt.xlabel("Gamma")
    # plt.ylabel("Converged iteration #")
    # plt.title("Convergence iteration vs Gamma")
    # plt.legend(('fire probability = 0.1', 'fire probability = 0.9'), loc="upper left")
    # plt.show()

    plt.plot(gamma,
             v1_v_mean,
             linestyle='--',
             marker='o',
             color='b',
             label="fire probability = 0.1")
    plt.plot(gamma,
             v2_v_mean,
             linestyle='--',
             marker='o',
             color='r',
             label="fire probability = 0.9")
    plt.xlabel("Gamma")
    plt.ylabel("Converged Mean Value")
    plt.title("Converged Mean Value vs Gamma")
    plt.legend(('fire probability = 0.1', 'fire probability = 0.9'),
               loc="upper left")
    plt.show()
Example #4
def value_iteration():
    deltas = {}
    rewards = {}
    for size in PROBLEM_SIZES:
        P, R = forest(S=size, r1=1, r2=5, p=.1)
        vi = ValueIteration(P, R, 0.9, max_iter=10)
        vi.run()
        delta = [vi.run_stats[i]['Error'] for i in range(len(vi.run_stats))]
        reward = [vi.run_stats[i]['Reward'] for i in range(len(vi.run_stats))]
        deltas[size] = delta
        rewards[size] = reward
        print(vi.policy)
        print(vi.S)
Example #5
def qlearning():
    deltas = {}
    rewards = {}
    for size in [10, 20, 40, 80]:
        P, R = forest(S=size, r1=1, r2=5, p=.1)
        ql = QLearning(P, R, 0.90, epsilon_decay=.998)
        ql.run()
        delta = [ql.run_stats[i]['Error'] for i in range(len(ql.run_stats))]
        reward = [ql.run_stats[i]['Reward'] for i in range(len(ql.run_stats))]
        epsilon = [
            ql.run_stats[i]['Epsilon'] for i in range(len(ql.run_stats))
        ]
        deltas[size] = delta
        rewards[size] = reward
        print(ql.policy)

    forest_plot.plot_ql_forest_convergence_size(deltas)
Example #6
def run(verbose=False):
    # MDP Forest Problem
    # transitions, reward = example.forest()
    nS = 1000
    # transitions, reward = example.forest(S=nS, r1=250, r2=120, p=0.01, is_sparse=False)
    transitions, reward = example.forest(S=nS,
                                         r1=1045,
                                         r2=1025,
                                         p=0.01,
                                         is_sparse=False)

    # print(transitions)
    # print (reward)
    # return
    print('~~~~~~~~~~ Forest - Policy Iteration ~~~~~~~~~~')
    pi = mdp.PolicyIteration(transitions, reward, 0.75, max_iter=10000)
    if verbose:
        pi.setVerbose()
    pi.run()
    util.print_debugs(pi)
    # print(pi.run_stats)
    # return

    print('~~~~~~~~~~ Forest - Value Iteration ~~~~~~~~~~')
    vi = mdp.ValueIteration(transitions, reward, 0.75, max_iter=100000)
    if verbose:
        vi.setVerbose()
    vi.run()
    util.print_debugs(vi)

    if (vi.policy == pi.policy):
        print('Forest - Value and Policy Iteration policies are the same! ')
    else:
        print('Forest - Value and Policy Iteration policies are NOT the same.')

    print('~~~~~~~~~~ Forest - Q-Learning ~~~~~~~~~~')
    # transitions, reward, gamma,
    #  alpha=0.1, alpha_decay=0.99, alpha_min=0.001,
    #  epsilon=1.0, epsilon_min=0.1, epsilon_decay=0.99,
    #  n_iter=10000, skip_check=False, iter_callback=None,
    #  run_stat_frequency=None):

    ql = mdp.QLearning(transitions,
                       reward,
                       0.75,
                       alpha=0.3,
                       epsilon_min=0.005,
                       n_iter=500000)
    if verbose:
        ql.setVerbose()
    start_t = time.process_time()
    ql.run()
    end_t = time.process_time()

    # Output
    print('~~~~~~~~~~ Forest - Policy Iteration ~~~~~~~~~~')
    util.print_debugs(pi)
    print('~~~~~~~~~~ Forest - Value Iteration ~~~~~~~~~~')
    util.print_debugs(vi)
    print('~~~~~~~~~~ Forest - Q-Learning ~~~~~~~~~~')
    print(ql.policy)
    print('Q-Learning # of Iterations: %i' % ql.run_stats[-1]['Iteration'])
    print('Clock time')
    print(end_t - start_t)

    if (vi.policy == pi.policy):
        print('Forest - Value and Policy Iteration policies are the same! ')
    else:
        print('Forest - Value and Policy Iteration policies are NOT the same.')

    if (vi.policy == ql.policy):
        print('Forest - QL and VI Policies are the same!')
    else:
        print('Forest - QL and VI Policies are NOT the same.')
    if (pi.policy == ql.policy):
        print('Forest - PI and QL Policies are the same!')
    else:
        print('Forest - PI and QL Policies are NOT the same.')

    # A Q-Learning Algorithm
    #
    # Source:
    #   https://www.oreilly.com/radar/introduction-to-reinforcement-learning-and-openai-gym/
    """
Example #7
V_all = {}
policy_all = {}
vi_res = {
    "Iteration to converge": [],
    "Time to converge": [],
    "Max V": [],
    "Mean V": [],
}
pi_res = {
    "Iteration to converge": [],
    "Time to converge": [],
    "Max V": [],
    "Mean V": [],
}
for s in N_STATES:
    print(f"Running nS {s}...")
    P, R = forest(S=s, p=0.001, r1=100, r2=10)
    vi = ValueIteration(P, R, gamma=0.99, epsilon=0.001)
    vi.run()
    vi_res["Iteration to converge"].append(vi.iter)
    vi_res["Time to converge"].append(vi.time)
    vi_res["Max V"].append(vi.run_stats[-1]["Max V"])
    vi_res["Mean V"].append(vi.run_stats[-1]["Mean V"])
    V_all[("vi", s)] = vi.V
    policy_all[("vi", s)] = vi.policy
    pi = PolicyIteration(P, R, gamma=0.99, eval_type=1, max_iter=1000)
    pi.run()
    pi_res["Iteration to converge"].append(pi.iter)
    pi_res["Time to converge"].append(pi.time)
    pi_res["Max V"].append(pi.run_stats[-1]["Max V"])
    pi_res["Mean V"].append(pi.run_stats[-1]["Mean V"])
    V_all[("pi", s)] = pi.V
Example #8
    def init_forest(self):
        return mdpExm.forest(self.states, self.reward_wait, self.reward_cut, self.prob_fire)
Example #9
             label="fire possibility = 0.9")
    plt.xlabel("Gamma")
    plt.ylabel("Converged Mean Value")
    plt.title("converged Mean Value vs gamma")
    plt.legend(('fire possibility = 0.1', 'fire possibility = 0.9'),
               loc="upper left")
    plt.show()


if __name__ == '__main__':
    gamma = 0.9
    num_states = 20
    r1 = 4
    r2 = 2
    p_fire = 0.1
    P, R = forest(num_states, r1, r2, p_fire)
    vi = ValueIteration(P, R, 0.96, 1e-20)
    vi.run()

    P2, R2 = forest(num_states, r1, r2, 0.8)
    vi2 = ValueIteration(P2, R2, 0.96, 1e-20)
    vi2.run()

    # # calculate and plot the v_mean
    # iter_score(vi, vi2)

    # gamma_iter_value()
    # #
    #

    pi = PolicyIteration(P, R, 0.96)
Example #10
if __name__ == '__main__':
    if not os.path.isdir('figures/'):
        os.mkdir('figures')

    y = input('''Choose environment:
                    S=250, r1=10, r2=5: 1,
                    S=1000, r1=15, r2=5: 2: ''')

    x = input('''Choose algorithm:
                    VI: v,
                    PI: p,
                    Q: q: ''')

    if y == '1':
        P, R = forest(S=250, r1=10, r2=5)
        if x == 'v':
            value_iteration(P, R, id='250')
        elif x == 'p':
            policy_iteration(P, R, id='250')
        elif x == 'q':
            Q_learner(P, R, id='250')
    elif y == '2':
        P, R = forest(S=1000, r1=15, r2=5)
        if x == 'v':
            value_iteration(P, R, id='1000')
        elif x == 'p':
            policy_iteration(P, R, id='1000')
        elif x == 'q':
            Q_learner(P, R, id='1000')
def run_forest():
    np.random.seed(0)
    P, R = example.forest(S=5, r1=3, r2=15, p=0.2)
    print("Transition Array: ")
    print(P.shape)
    print(P)  # Transition array A x S x S
    print("Reward Array: ")
    print(R.shape)
    print(R)  # Reward array S x A

    # TODO
    gamma_range = np.array([0.1, 0.9, 0.99])
    alpha_range = np.array([0.01, 0.5, 0.99])
    epsilon_range = np.array([0.1, 0.5, 0.95])
    e_decay_range = np.array([0.1, 0.5, 0.999])

    # gamma_range = np.append(np.linspace(0.1, 0.9, 9), np.linspace(0.91, 0.99, 9))
    # alpha_range = np.append(np.linspace(0.01, 0.1, 9), np.linspace(0.2, 0.99, 4))
    # epsilon_range = np.linspace(0.1, 1.0, 10)
    # e_decay_range = np.append(np.linspace(0.1, 0.9, 4), np.linspace(0.91, 0.99, 9))

    difference_list = np.zeros(gamma_range.shape)
    value_iteration_list = np.zeros(gamma_range.shape)
    value_time_list = np.zeros(gamma_range.shape)
    value_reward_list = np.zeros(gamma_range.shape)
    value_error_list = np.zeros(gamma_range.shape)

    policy_iteration_list = np.zeros(gamma_range.shape)
    policy_time_list = np.zeros(gamma_range.shape)
    policy_reward_list = np.zeros(gamma_range.shape)
    policy_error_list = np.zeros(gamma_range.shape)

    for i, gamma in enumerate(gamma_range):
        print('Gamma %0.2f' % gamma)

        vi = mdp.ValueIteration(transitions=P, reward=R, gamma=gamma, epsilon=0.0001, max_iter=10000)
        vi.setVerbose()
        vi.run()
        vi_stats = vi.run_stats
        value_iteration_list[i] = vi_stats[-1]['Iteration']
        value_time_list[i] = vi_stats[-1]['Time']
        value_reward_list[i] = vi_stats[-1]['Reward']
        value_error_list[i] = vi_stats[-1]['Error']
        plot_stats(vi_stats, ('vi_forest_%0.2f' % gamma))

        pi = mdp.PolicyIteration(transitions=P, reward=R, gamma=gamma, max_iter=10000, eval_type=1)
        pi.setVerbose()
        pi.run()
        stats = pi.run_stats
        policy_iteration_list[i] = stats[-1]['Iteration']
        policy_time_list[i] = stats[-1]['Time']
        policy_reward_list[i] = stats[-1]['Reward']
        policy_error_list[i] = stats[-1]['Error']
        plot_stats(stats, ('pi_forest_%0.2f' % gamma))
        print('Policies Found')
        print('Value Iteration: ' + str(vi.policy))
        print('Policy Iteration: ' + str(pi.policy))

        difference1 = sum([abs(x - y) for x, y in zip(pi.policy, vi.policy)])
        difference_list[i] = difference1
        print("Discrepancy in Policy and Value Iteration: ", difference1)
        print()

    # Plotting
    # Error v Iteration
    plt.clf()
    plt.title('Value Iteration: Error v Iterations')
    plt.xlabel('Iterations')
    plt.ylabel('Error')
    plt.plot(list(value_iteration_list), list(value_error_list))
    plt.tight_layout()
    plt.savefig('plots/forest_experiment/vi_error_v_iteration.png')

    # Reward v Gamma
    plt.clf()
    plt.title('Value Iteration: Reward v Gamma')
    plt.xlabel('Gamma')
    plt.ylabel('Reward')
    plt.plot(list(gamma_range), list(value_reward_list))
    plt.tight_layout()
    plt.savefig('plots/forest_experiment/vi_reward_v_gamma.png')

    # Gamma v Iterations
    plt.clf()
    plt.title('Value Iteration: Gamma v Iterations')
    plt.xlabel('Iterations')
    plt.ylabel('Gamma')
    plt.plot(list(value_iteration_list), list(gamma_range))
    plt.tight_layout()
    plt.savefig('plots/forest_experiment/vi_gamma_v_iterations.png')

    # Gamma v Time
    plt.clf()
    plt.title('Value Iteration: Gamma v Time')
    plt.xlabel('Time')
    plt.ylabel('Gamma')
    plt.plot(list(value_time_list), list(gamma_range))
    plt.tight_layout()
    plt.savefig('plots/forest_experiment/vi_gamma_v_time.png')

    # Reward vs Iterations
    plt.clf()
    plt.title('Value Iteration: Reward v Iterations')
    plt.xlabel('Iterations')
    plt.ylabel('Reward')
    plt.plot(list(value_iteration_list), list(value_reward_list))
    plt.tight_layout()
    plt.savefig('plots/forest_experiment/vi_reward_v_iterations.png')

    # Policy
    # Error v Iteration
    plt.clf()
    plt.title('Policy Iteration: Error v Iterations')
    plt.xlabel('Iterations')
    plt.ylabel('Error')
    plt.plot(list(policy_iteration_list), list(policy_error_list))
    plt.tight_layout()
    plt.savefig('plots/forest_experiment/pi_error_v_iteration.png')

    # Gamma v Reward
    plt.clf()
    plt.title('Policy Iteration: Reward v Gamma')
    plt.xlabel('Gamma')
    plt.ylabel('Reward')
    plt.plot(list(gamma_range), list(policy_reward_list))
    plt.tight_layout()
    plt.savefig('plots/forest_experiment/pi_reward_v_gamma.png')

    # Gamma v Iterations
    plt.clf()
    plt.title('Policy Iteration: Gamma v Iterations')
    plt.xlabel('Iterations')
    plt.ylabel('Gamma')
    plt.plot(list(policy_iteration_list), list(gamma_range))
    plt.tight_layout()
    plt.savefig('plots/forest_experiment/pi_gamma_v_iterations.png')

    # Gamma v Time
    plt.clf()
    plt.title('Policy Iteration: Gamma v Time')
    plt.xlabel('Time')
    plt.ylabel('Gamma')
    plt.plot(list(policy_time_list), list(gamma_range))
    plt.tight_layout()
    plt.savefig('plots/forest_experiment/pi_gamma_v_time.png')

    # Reward vs Iterations
    plt.clf()
    plt.title('Policy Iteration: Reward v Iterations')
    plt.xlabel('Iterations')
    plt.ylabel('Reward')
    plt.plot(list(policy_iteration_list), list(policy_reward_list))
    plt.tight_layout()
    plt.savefig('plots/forest_experiment/pi_reward_v_iterations.png')

    # Gamma vs Policy Differences
    plt.clf()
    plt.title('Gamma v Policy Differences')
    plt.xlabel('Gamma')
    plt.ylabel('Policy Differences')
    plt.plot(list(gamma_range), list(difference_list))
    plt.tight_layout()
    plt.savefig('plots/forest_experiment/gamma_v_differences.png')
    plt.close('all')

    prev_Q = None
    thresh = 1e-4
    print('== Q Learning ==')
    for i, gamma in enumerate(gamma_range):
        for j, alpha in enumerate(alpha_range):
            for k, ep in enumerate(epsilon_range):
                for l, ed in enumerate(e_decay_range):
                    # print('ql: gamma - {}, alpha - {}, epsilon - {}, e_decay - {}'.format(gamma, alpha, ep, ed))
                    ql = mdp.QLearning(transitions=P, reward=R, gamma=gamma, alpha=alpha, alpha_decay=1.0, alpha_min=0.001,
                                       epsilon=ep, epsilon_min=0.1, epsilon_decay=ed, n_iter=100000)
                    stats = ql.run()
                    plot_stats(stats, ('ql_forest_%0.2f_%0.2f_%0.2f_%0.2f' % (gamma, alpha, ep, ed)))

                    # print('Policy: ')
                    # print(ql.policy)
                    # print(ql.run_stats)

                    df = pd.DataFrame.from_records(ql.run_stats)
                    iteration_list = df['Iteration'][-100:]
                    windowed_reward = df['Reward'][-100:].mean()
                    error_list = df['Error'][-100:].mean()

                    if prev_Q is None:
                        prev_Q = ql.Q
                    else:
                        variation = np.absolute(np.subtract(np.asarray(ql.Q), np.asarray(prev_Q))).max()
                        res = np.abs(np.subtract(np.asarray(prev_Q), np.asarray(ql.Q)))
                        print('Result: ')
                        print(res)
                        print('Variation: ')
                        print(variation)
                        print('Mean Reward for Last 100 Iterations:')
                        print(windowed_reward)
                        if np.all(res < thresh) or variation < thresh or windowed_reward > 1.0:
                            print('Breaking! Below Thresh')
                            print('Found at: gamma - {}, alpha - {}, epsilon - {}, e_decay - {}'.format(
                                gamma, alpha, ep, ed))
                            print('Optimal Policy: ')
                            print(ql.policy)
                            break
Example #12
from hiive.mdptoolbox import example, mdp

P, R = example.forest()
vi = mdp.ValueIteration(P, R, 0.96)
print(vi.verbose)

vi.run()
expected = (5.93215488, 9.38815488, 13.38815488)
print(all(expected[k] - vi.V[k] < 1e-12 for k in range(len(expected))))

print(vi.policy)

print(vi.iter)

from hiive import mdptoolbox
import numpy as np

P = np.array([[[0.5, 0.5], [0.8, 0.2]], [[0, 1], [0.1, 0.9]]])
R = np.array([[5, 10], [-1, 2]])
vi = mdptoolbox.mdp.ValueIteration(P, R, 0.9)
vi.run()
expected = (40.048625392716815, 33.65371175967546)
print(all(expected[k] - vi.V[k] < 1e-12 for k in range(len(expected))))

print(vi.policy)

print(vi.iter)

from hiive import mdptoolbox
import numpy as np
from scipy.sparse import csr_matrix as sparse
Example #13
# In[1]:


from hiive.mdptoolbox.mdp import ValueIteration, PolicyIteration, QLearning
from hiive.mdptoolbox.example import forest
import numpy as np
from numpy.random import choice
import pandas as pd
from matplotlib import pyplot as plt


# In[2]:


np.random.seed(100)
P, R = forest(S=500, r1=100, r2= 25, p=0.01)


# In[3]:


def test_policy(P, R, policy, test_count=100, gamma=0.9):
    num_state = P.shape[-1]
    total_episode = num_state * test_count
    # start in each state
    total_reward = 0
    for state in range(num_state):
        state_reward = 0
        for state_episode in range(test_count):
            episode_reward = 0
            disc_rate = 1
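
# test_policy continues beyond this excerpt. For reference, a self-contained sketch of the same idea
# (Monte-Carlo evaluation of a stationary policy on the forest MDP) is given below; the function
# name and default parameters are assumptions, not the original author's code.
def simulate_policy(P, R, policy, episodes_per_state=100, gamma=0.9, max_steps=1000):
    num_states = P.shape[-1]
    total_reward = 0.0
    for start_state in range(num_states):
        for _ in range(episodes_per_state):
            state, discount, episode_reward = start_state, 1.0, 0.0
            for _ in range(max_steps):
                action = policy[state]
                episode_reward += discount * R[state][action]
                state = choice(num_states, p=P[action][state])
                discount *= gamma
                if discount < 1e-4:  # remaining discounted reward is negligible
                    break
            total_reward += episode_reward
    return total_reward / (num_states * episodes_per_state)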
Example #14
import numpy as np
import pandas as pd
import gym
from hiive.mdptoolbox.example import forest
import matplotlib.pyplot as plt


PROBS = {}

PROBS["forest"] = forest(S=1000, p=0.001, r1=100, r2=10)

env = gym.make("FrozenLake8x8-v0")

nA, nS = env.nA, env.nS
P = np.zeros([nA, nS, nS])
R = np.zeros([nA, nS, nS])
DONE = np.zeros(nS)
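# Convert the gym environment's per-state transition lists (env.P[s][a]) into the dense
# A x S x S transition / reward arrays used by mdptoolbox, then renormalize each row of P.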
for s in range(nS):
    for a in range(nA):
        transitions = env.P[s][a]
        for p_trans, next_s, reward, done in transitions:
            P[a, s, next_s] += p_trans
            R[a, s, next_s] += reward
            DONE[next_s] = done
        P[a, s, :] /= np.sum(P[a, s, :])
PROBS["frozen_lake"] = (P, R)

nA, nS = env.nA, env.nS
P = np.zeros([nA, nS, nS])
R = np.zeros([nA, nS, nS])
DONE = np.zeros(nS)
Example #15
import hiive.mdptoolbox.example as example
import hiive.mdptoolbox.mdp as mdp
states = 50

P, R = example.forest(S=states)

#pi = mdp.QLearning(P, R, 0.99, n_iter=500000, alpha=0.3, alpha_min=0.1, epsilon_min=0.1, epsilon_decay=0.9999)
pi = mdp.ValueIteration(P, R, 0.99)

pi.run()
#print("deltas_" + str(gamma)[2:] + " = " + str(pi.deltas))

for x in pi.run_stats:
    print(x)

print(pi.policy)

print('Time: ', pi.run_stats[-1]['Time'], 'Reward: ', pi.run_stats[-1]['Reward'])
Example #16
                          alpha_decay=0.99,
                          epsilon_decay=0.99,
                          n_iter=max_iter).run()
        q_stats.append(q)

    return q_stats, gammas


def vi_pi_q_comp(P, R):
    vi = mdp.ValueIteration(P, R, 0.60, epsilon=0.001).run()
    pi = mdp.PolicyIteration(P, R, 0.60, eval_type=1).run()
    q = mdp.QLearning(P, R, 0.6, alpha=0.2).run()
    return vi, pi, q


P1, R1 = example.forest(10000, p=0.5)
# P1, R1 = example.forest(10000)

env = gym.make('Taxi-v3')
states = env.observation_space.n
actions = env.action_space.n
P2, R2 = build_matrix(env, states, actions)

# VI Gamma
vi_gamma_forest, gamma_forest = vi_experiment_gamma(P1, R1)
vi_gamma_taxi, gamma_taxi = vi_experiment_gamma(P2, R2)
print(vi_gamma_taxi[-2][-1], vi_gamma_forest[-2][-1])
plot_vi_gamma(vi_gamma_taxi, vi_gamma_forest, gamma_taxi, gamma_forest)

# VI Epsilon
vi_epsilon_forest, epsilon_forest = vi_experiment_epsilon(P1, R1)
Example #17
#
# A forest is managed with two actions, WAIT and CUT. One action is chosen each year with two objectives in mind:
# maintain the forest for wildlife (WAIT) or cut the forest for wood (CUT). The agent receives a reward of 1 for
# cutting the forest, which sends it back to the initial state. Alternatively, the agent can wait and hope for a
# larger reward: with probability 1-p the forest grows one state per year toward the final (oldest) state, where
# the largest rewards are available. However, with probability p a fire burns the forest down, the state resets to
# the initial state, and the accumulated growth (and its reward) is lost.

# In[1]:

from hiive.mdptoolbox.mdp import ValueIteration, PolicyIteration, QLearning
from hiive.mdptoolbox.example import forest
import numpy as np
from numpy.random import choice
import pandas as pd
from matplotlib import pyplot as plt

# In[2]:

np.random.seed(100)
P, R = forest(S=25, r1=20, r2=5, p=0.1)
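
# Illustrative aside (not in the original notebook): with this convention P has shape
# (A, S, S) = (2, 25, 25) and R has shape (S, A) = (25, 2); action 0 is WAIT, action 1 is CUT,
# and p=0.1 is the probability that a fire resets the forest to state 0.
print(P.shape, R.shape)  # (2, 25, 25) (25, 2)
print(P[0, 0, :2])       # from state 0 under WAIT: probability p of fire (back to 0), 1-p of growing to state 1
print(R[-1])             # oldest state: WAIT pays r1=20, CUT pays r2=5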

# In[3]:


def test_policy(P, R, policy, test_count=1000, gamma=0.9):
    num_state = P.shape[-1]
    total_episode = num_state * test_count
    # start in each state
    total_reward = 0
    for state in range(num_state):
        state_reward = 0
        for state_episode in range(test_count):
            episode_reward = 0
            disc_rate = 1
            while True:
Example #18
File: main.py  Project: Cphrampus/CS7641
        for i in range(lake_size):
            for j in range(lake_size):
                text = ax.text(j,
                               i,
                               pol_matrix[i, j],
                               ha="center",
                               va="center",
                               color="w")

        ax.set_xticks([])
        ax.set_yticks([])
        ax.set_title("QL policy")
        plt.savefig("charts/lake_ql_viz")

    # non grid world, forest, large, 5000 states
    transitions, rewards = example.forest(S=5000)

    # tune PI/VI gamma values
    tune_gamma = False
    if tune_gamma:
        gamma_range = np.linspace(0.01, 0.99, 99)
        vi_iter = []
        pi_iter = []
        vi_time = []
        pi_time = []
        vi_max_v = []
        pi_max_v = []

        for gamma in gamma_range:
            vi = mdp.ValueIteration(transitions,
                                    rewards,
Example #19
from hiive.mdptoolbox.mdp import ValueIteration, QLearning, PolicyIteration
from hiive.mdptoolbox.example import forest
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

P, R = forest(2000)

compare_VI_QI_policy = []  # True or False
compare_VI_PI_policy = []

Gamma = 1
Epsilon = 0.0000000000000000000000000000000000000000000000000000000000000000000000000001
Max_Iterations = 200000

VI = ValueIteration(P, R, Gamma, Epsilon, Max_Iterations)

# run VI
VI.setVerbose()
VI.run()
print('VI')
print(VI.iter)
print(VI.time)
print(VI.run_stats[-1:])

iterations = np.zeros(len(VI.run_stats))
reward = np.zeros(len(VI.run_stats))
i = 0
for stat in VI.run_stats:
    iterations[i] = stat['Iteration']
    reward[i] = stat['Reward']