Example #1
def test_instantiation():
    """
    Testing common QLearner initial arguments and support functions.
    """
    # Set-up:
    STATES = 10
    ACTIONS = 5
    rmatrix_sq = np.random.rand(STATES, STATES)
    rmatrix_rec = np.random.rand(STATES, ACTIONS)
    tmatrix = np.random.randint(0, STATES, size=(STATES, ACTIONS))
    # making sure tmatrix points to goal states:
    tmatrix[:, ACTIONS - 1] = np.random.randint(0, 1, size=STATES)
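    # note: np.random.randint(0, 1) always returns 0, so this column maps every state to state 0,
    # which is one of the goal states declared in goal_l below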
    goal_l = (0, 1)
    goal_f = lambda x: x <= 1
    np.savetxt('test.dat', rmatrix_sq)
    global QLEARNER

    # Test 1: list goal
    temp = QLearner(rmatrix_sq, goal_l)
    assert np.array_equal(temp.rmatrix, rmatrix_sq), "R matrix not equal to arg."
    assert temp.goal(0) and temp.goal(1) and not temp.goal(2) and not temp.goal(3), \
            'List goal not working.'
    QLEARNER = temp

    # Test 2: function goal
    temp = QLearner(rmatrix_sq, goal_f)
    assert temp.goal(0) and temp.goal(1) and not temp.goal(2), \
        'Function goal not working.'
    QLEARNER = temp

    # Test 3: File I/O
    temp = QLearner('test.dat', goal_l)
    assert temp.qmatrix.shape == rmatrix_sq.shape, "Q & R matrix dimension mismatch."
    assert np.array_equal(temp.rmatrix, rmatrix_sq), "R matrix not equal to arg."
    QLEARNER = temp

    # Test 4: rectangular r matrix, no tmatrix
    try:
        QLearner(rmatrix_rec, goal_l)
        raise AssertionError('Rectangular R matrix without tmatrix should raise ValueError.')
    except ValueError:
        pass

    # Test 5: rectangular r matrix, t matrix of same dimension
    temp = QLearner(rmatrix_rec, goal_f, tmatrix)
    assert temp.next_state(1, 2) == tmatrix[1, 2], 'Next state prediction incorrect.'
    QLEARNER = temp

    # Test 6: episodes
    l = set(temp.episodes(coverage=1.0, mode='bfs'))
    assert l == set(range(temp.num_states)), 'Full episode coverage failed.'

    # Finalize
    os.remove('test.dat')
Example #2
    def set_up_learner(self, learner, **kwargs):
        """
        Attaches the appropriate learner to instance for testing.
        """
        if learner == FLearner:
            sflags = FlagGenerator(self.size, self.size)
            aflags = FlagGenerator(2, 2)
            self.learner = FLearner(rmatrix=self.rmatrix,
                                    goal=self.goals,
                                    stateconverter=sflags,
                                    actionconverter=aflags,
                                    tmatrix=self.tmatrix,
                                    seed=self.seed,
                                    **kwargs)
        elif learner == QLearner:
            self.learner = QLearner(rmatrix=self.rmatrix,
                                    goal=self.goals,
                                    tmatrix=self.tmatrix,
                                    seed=self.seed,
                                    **kwargs)

        elif learner == SLearner:
            sflags = FlagGenerator(self.size, self.size)
            aflags = FlagGenerator(2, 2)
            sim = create_sim_env(self.size, self.random)

            def reward(svec, avec, nstate):
                action = aflags.encode(avec)
                state = sflags.encode((round(svec[0]), round(svec[1])))
                return self.rmatrix[state, action]

            def goal(svec):
                return self.coord2state(
                    (round(svec[0]), round(svec[1]))) in self.goals

            self.learner = SLearner(reward=reward,
                                    simulator=sim,
                                    goal=goal,
                                    stateconverter=sflags,
                                    actionconverter=aflags,
                                    seed=self.seed,
                                    **kwargs)
        elif learner is None:
            self.learner = None
        else:
            raise TypeError('Class: ' + learner.__name__ + ' is not supported. '
                            'Assign to .learner manually.')
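A hypothetical call from a test's set-up method (no extra kwargs are passed, since **kwargs is optional; the fixture attributes read by set_up_learner are assumed to be initialised elsewhere):

# Sketch only: attach a tabular QLearner, then a simulation-based SLearner, to the fixture.
self.set_up_learner(QLearner)
self.set_up_learner(SLearner)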
Example #3
def branin(discount, learning_rate, buckets_w, buckets_h, buckets_v):
    def run_game():
        # Make a new monkey object.
        swing = SwingyMonkey(
            visual=False,  # no video
            sound=False,  # no audio        
            action_callback=learner_class.action_callback,
            reward_callback=learner_class.reward_callback)

        # Loop until you hit something.
        while swing.game_loop():
            pass

        return swing

    # make a new learner with the given parameters
    learner_class = QLearner(learn_fn=lambda i: learning_rate,
                             discount_fn=lambda i: discount,
                             bucket_height=buckets_h,
                             bucket_width=buckets_w,
                             velocity_bucket=buckets_v)

    # train the learner
    for t in range(TRAIN_ITERS):
        run_game()

    # keep learning, take average over the iterations
    scores = []
    for t in range(TEST_ITERS):
        # Make a new monkey object.
        swing = run_game()

        scores.append(swing.score)

    avg_score = float(sum(scores)) / float(TEST_ITERS)
    median_score = np.median(scores)

    # which do we return?
    print "The median is %d and the mean is %f." % (median_score, avg_score)

    # out objective is to minimize the negative of the average score
    return -1 * avg_score
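Since branin() returns the negative mean score, it can be handed to any minimizer; a minimal sketch with hypothetical hyperparameter values (not from the original snippet):

# Sketch only: one evaluation of the objective with made-up hyperparameter values.
neg_score = branin(discount=0.9, learning_rate=0.1,
                   buckets_w=5, buckets_h=5, buckets_v=3)
print("objective value:", neg_score)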
Example #4
    OUT proximity refers to outside of the quartile of the player
    """
    NUM_STATES = 32 * (54**args.numTeammates)

    # Shoot, Dribble, or Pass to one of N teammates
    NUM_ACTIONS = 2 + args.numTeammates
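    # e.g. with args.numTeammates = 1: NUM_STATES = 32 * 54 = 1728 and NUM_ACTIONS = 2 + 1 = 3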

    hfo = HFOEnvironment()
    hfo.connectToServer(feature_set=HIGH_LEVEL_FEATURE_SET,
                        server_port=args.port)

    if args.inQTableDir:
        q_learner = QLearner(
            NUM_STATES,
            NUM_ACTIONS,
            epsilon=args.epsilon,
            learning_rate=args.learningRate,
            q_table_in=args.inQTableDir + str(args.playerIndex) + '.npy',
            q_table_out=args.outQTableDir + str(args.playerIndex) + '.npy')
    else:
        q_learner = QLearner(
            NUM_STATES,
            NUM_ACTIONS,
            epsilon=args.epsilon,
            learning_rate=args.learningRate,
            q_table_in=args.outQTableDir + str(args.playerIndex) + '.npy',
            q_table_out=args.outQTableDir + str(args.playerIndex) + '.npy')

    for episode in range(0, args.numEpisodes):
        status = IN_GAME
        action = None
Example #5
# main
from MapBuilder import MapBuilder
from qlearner import QLearner
from universe import Universe
from Criterions import get_cost_based_on_fuel, get_cost_based_on_time, get_cost_based_on_mixture

if __name__ == "__main__":

    universe = Universe(MapBuilder())
    qlearners = [
        QLearner(universe.get_initial_state(),
                 get_cost_based_on_fuel, universe.move_request,
                 universe.get_terminal_state(), 1, 0.9, universe.next_state),
        QLearner(universe.get_initial_state(),
                 get_cost_based_on_time, universe.move_request,
                 universe.get_terminal_state(), 1, 0.9, universe.next_state),
        QLearner(universe.get_initial_state(),
                 get_cost_based_on_mixture, universe.move_request,
                 universe.get_terminal_state(), 1, 0.9, universe.next_state)
    ]

    num_of_epochs = 1000

    for epoch_num in range(num_of_epochs):
        for qlearner in qlearners:
            while qlearner._state != universe.get_terminal_state():
                qlearner.move()
            qlearner.reset(universe.get_initial_state())

    print("Energy:", qlearners[0]._Q, end='\n\n')
    print("Time:", qlearners[1]._Q, end='\n\n')
Example #6
# L.Braun 2018
# Main program to solve a gridworld maze problem

# Uses qlearner.py, environ.py

from qlearner import QLearner
import pylab as plt

my_learner = QLearner()
my_learner.load_maze('/u/braun/tlab/QLearner/data/reward_4x4.npy',
                     '/u/braun/tlab/QLearner/data/meta_4x4.txt')

#print ("testing data load\n\n")

#my_learner.display_Q()
#my_learner.display_R()

print("begin training...")

reward = my_learner.train(0.7)

my_learner.display_Q()
my_learner.display_R()

steps = my_learner.test(7)  # 7 foods in 4x4 maze
print("steps")
print(steps)
print("")

plt.hist(reward, 50, density=True, facecolor='g', alpha=0.75)  # 'density' replaces the removed 'normed' kwarg
plt.xlabel('Episodes required to reach 200')
Example #7
### SETUP
num_learning_trials = 10000
num_simulation_trials = 1000
num_learning_epochs = 15

### PART III: MDP 1 epsilon experiments
epsilon_list = [0.1, 0.25, 0.5, 0.75]
learning_rate = 0.01
epoch_list = []
avg_reward_list = []

for e, epsilon in enumerate(epsilon_list):
    print "Epsilon: {0}".format(epsilon)

    qlearner = QLearner(mdp1,
                        initial_state1,
                        epsilon=epsilon,
                        alpha=learning_rate)

    epoch_list.append(range(num_learning_epochs))
    avg_reward_list.append([])
    for epoch in epoch_list[e]:
        for trial in range(num_learning_trials):
            qlearner.run_learning_trial()

        avg_reward = 0
        for trial in range(num_simulation_trials):
            total_reward, state_seq, action_seq = qlearner.run_simulation_trial()
            avg_reward += total_reward
        avg_reward = 1. * avg_reward / num_simulation_trials
        avg_reward_list[e].append(avg_reward)
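The loop above only fills avg_reward_list; a minimal plotting sketch for the epsilon sweep (assumes matplotlib; not part of the original script):

# Sketch only: average simulation reward per learning epoch, one curve per epsilon.
import matplotlib.pyplot as plt

for e, epsilon in enumerate(epsilon_list):
    plt.plot(epoch_list[e], avg_reward_list[e], label="epsilon = {0}".format(epsilon))
plt.xlabel("Learning epoch")
plt.ylabel("Average simulation reward")
plt.legend()
plt.show()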
Example #8
    def __init__(self, config_or_model, load_model=False):
        self.config = None
        self.model_loaded = False
        #load a saved model
        if load_model:
            print("Loading model from: {}".format(config_or_model))
            load_path = Path(config_or_model)
            if (not load_path.exists()) or (not load_path.is_dir()):
                print("Error: directory doesn't exist")

            config_filename = load_path.joinpath("config.json")
            self.config = self.load_config(str(config_filename))
        else:
            self.config = self.load_config(config_or_model)

        #select game
        self.game_name = self.config["game"]
        self.game = None
        if self.game_name == "snake":
            self.game = game.Snake
        elif self.game_name == "box":
            self.game = game.Box
        else:
            print("Error: unknown game {}".format(self.game_name))

        self.nn_config = self.config["nn"]
        #parameters of experience memory
        self.memory_size = self.config["memory_size"]
        self.memory_alpha = self.config["memory_alpha"]
        self.memory_beta_start = self.config["memory_beta_start"]
        self.memory_beta_end = self.config["memory_beta_end"]
        self.memory_beta_num_steps = self.config["memory_beta_num_steps"]
        self.memory_beta_step = (self.memory_beta_end - self.memory_beta_start
                                 ) / self.memory_beta_num_steps
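        # memory_beta is annealed linearly from beta_start to beta_end over memory_beta_num_steps
        # steps; alpha/beta are presumably the prioritization and importance-sampling exponents of
        # a prioritized experience replay buffer (assumption based on the parameter names)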
        self.exp_memory_start_size = self.config["memory_start_size"]
        #game parameters: image size, board size, num_goals, ...
        self.width = self.config["width"]
        self.height = self.config["height"]
        self.image_scale_factor = self.config["image_scale_factor"]
        self.num_goals = self.config["num_goals"]
        self.img_width = self.width * self.image_scale_factor
        self.img_height = self.height * self.image_scale_factor
        self.num_img_channels = self.game.num_channels
        self.num_actions = self.game.num_actions

        #random policy parameters
        self.epsilon_start = self.config["epsilon_start"]
        self.epsilon_min = self.config["epsilon_min"]
        self.num_epsilon_steps = self.config["num_epsilon_steps"]
        self.epsilon_step = (self.epsilon_start -
                             self.epsilon_min) / self.num_epsilon_steps
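        # epsilon is presumably decreased by epsilon_step after each training step until it
        # reaches epsilon_min (the decay itself happens in the training loop, not shown here)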

        #scale rewards, training might be more stable if q-values converge to range [-1,1]
        self.scale_reward_max = None
        if "scale_reward_max" in self.config:
            self.scale_reward_max = self.config["scale_reward_max"]
            self.game.max_reward *= self.scale_reward_max
            self.game.min_reward *= self.scale_reward_max
            self.game.empty_reward *= self.scale_reward_max
            print("Scaling rewards by {}".format(self.scale_reward_max))

        #frequency parameters for updating the target network, output, saving, tensorboard, evaluation
        self.max_steps = self.config["max_steps"]
        self.output_freq = self.config["output_freq"]
        self.update_freq = self.config["update_freq"]
        self.target_network_update_mode = self.config["target_network_update_mode"]
        self.target_network_update_tau = None
        self.target_network_update_freq = None
        if self.target_network_update_mode == "hard":
            self.target_network_update_freq = self.config["target_network_update_freq"]
        else:
            self.target_network_update_tau = self.config["target_network_update_tau"]
        self.eval_freq = self.config["eval_freq"]
        self.eval_steps = self.config["eval_steps"]
        self.tensorboard_log_freq = self.config["tensorboard_log_freq"]
        self.tensorboard_log_path = self.config["tensorboard_log_path"]
        self.save_freq = self.config["save_freq"]
        self.save_path = self.config["save_path"]

        self.batch_size = self.config["batch_size"]

        #parameters that are actually changed while training, these need to be saved and loaded
        self.curr_step = 0
        self.epsilon = self.epsilon_start
        self.memory_beta = self.memory_beta_start
        self.best_average_score = 0

        #create experience memory
        self.exp_memory = ExperienceMemory(self.memory_size, self.img_width,
                                           self.img_height,
                                           self.num_img_channels,
                                           self.memory_alpha)
        #create QLearner object, load saved neural network model if necessary
        self.qlearner = None
        if load_model:
            load_path = str(
                Path(config_or_model).joinpath("nn").joinpath("model"))
            self.qlearner = QLearner(
                self.nn_config,
                self.num_actions,
                self.img_width,
                self.img_height,
                self.num_img_channels,
                self.memory_size,
                load_model=load_path,
                target_network_update_tau=self.target_network_update_tau)
            self.curr_step = self.config["curr_step"]
            self.epsilon = self.config["epsilon"]
            self.memory_beta = self.config["memory_beta"]
            self.best_average_score = self.config["best_average_score"]
            print("Model loaded successfully")
            self.model_loaded = True
        else:
            self.qlearner = QLearner(
                self.nn_config,
                self.num_actions,
                self.img_width,
                self.img_height,
                self.num_img_channels,
                self.memory_size,
                target_network_update_tau=self.target_network_update_tau)

        if self.tensorboard_log_freq > 0:
            self.qlearner.add_tensorboard_ops(self.tensorboard_log_path)
Example #9
        Pass opening angle, SMALL or LARGE or INVALID       -- 3
        Goal scoring angle, SMALL or LARGE or INVALID       -- 3


    OUT proximity refers to outside of the quartile of the player
    """
    NUM_STATES = 32 * (54 ** args.numTeammates)

    # Shoot, Pass to one of N teammates or Dribble
    NUM_ACTIONS = 2 + args.numTeammates

    hfo = HFOEnvironment()
    hfo.connectToServer(feature_set=HIGH_LEVEL_FEATURE_SET, server_port=args.port)

    q_learner = QLearner(NUM_STATES, NUM_ACTIONS,
                         epsilon=0.0,
                         q_table_in=args.qTableDir + str(args.playerIndex) + '.npy',
                         q_table_out=args.qTableDir + str(args.playerIndex) + '.npy')

    for episode in range(0, args.numEpisodes):
        status = IN_GAME
        action = None
        state = None
        history = []
        timestep = 0
        while status == IN_GAME:
            timestep += 1
            features = hfo.getState()
            # Print off features in a readable manner
            # feature_printer(features, args.numTeammates, args.numOpponents)

            if int(features[5] != 1):
Example #10
# Initialise result data structures
rewards_per_run = dict()
runtime_per_run = []

# For each run, train agent until environment is solved, or episode budget
# runs out:
for run in range(num_runs):
    # Initialise result helpers
    end_episode = num_episodes  # episode in which the environment was solved (defaults to the budget)
    start = timer()
    rewards = [0.0] * num_episodes  # reward per episode

    # Initialise environment and agent
    wrapper = CartPoleWrapperDiscrete()
    agent = QLearner(wrapper=wrapper, seed=run)

    style.use('fivethirtyeight')

    fig = plt.figure()
    plt.axis([0, args.episodes, 0, 300])
    plt.xlabel('Episodes')
    plt.ylabel('AVG Reward last 50 episodes')

    # For each episode, train the agent on the environment and record the
    # reward of each episode
    for episode in range(num_episodes):
        rewards[episode] = agent.train()
        if (episode % 50) == 0 and episode != 0:
            avg_last = float(sum(rewards[episode - 50:episode])) / 50
            plt.scatter(episode, avg_last)
Example #11
def frozen_ql_experiment(env_name, new_lake):
    np.random.seed(0)
    min_r = -100.0
    max_r = 100.0
    problem = MyWrapper.TransformReward(
        gym.make(env_name, desc=new_lake),
        lambda r: np.clip(r * 100.0, min_r, max_r))
    problem.seed(0)
    problem.reset()
    folder = "q_learning/"
    env = MyWrapper.Monitor(problem, folder, force=True)
    # env.observation_space.n is number of states

    # q_table = np.zeros((env.observation_space.n, env.action_space.n)) # param -> q_table
    num_of_states = env.observation_space.n
    num_of_action = env.action_space.n
    rewards_list = []  # this will record reward for that run
    iterations_list = []  # this will record all number of iteration
    alpha = [0.5, 0.9]  # param -> alpha (learning rates to compare)
    gamma = 0.99  # param -> gamma
    episodes = 10000
    rar = [0.1, 0.9]  # param -> epsilon (random action rates to compare)
    radr = 0.99  # randomness decay
    time_list = []
    # the timer is started at the beginning of each episode below

    # initialize the qlearner here
    qlearner = QLearner(
        num_actions=num_of_action,
        num_states=num_of_states,
        alpha=alpha[0],
        gamma=gamma,
        rar=rar[0],
        radr=radr,
    )
    # print(qlearner.q_table)
    """This is for plot #1 """
    # cumulative time spent across episodes
    init_time_diff = 0

    for episode in range(episodes):  # total number of iterations
        start_time = time.time()
        qlearner.s = env.reset()  # current state

        done = False
        total_reward = 0  # cumulative reward for this episode
        max_steps = 10000000

        # print(state)
        for i in range(max_steps):
            if done:
                break
            # update qlearner.s by state
            """Key step, refer to the qlearner implementation """

            # update s before use as an input
            # action here is either a random action or the best action of the given state
            action = qlearner.choose_best_action(
                qlearner.num_actions, qlearner.rar, qlearner.s,
                qlearner.q_table)  # use current q_table
            # qlearner.s = qlearner.s
            # get state reward  done, info from the environment
            next_state, reward, done, info = env.step(
                action)  # this will update done
            # qlearner.s = qlearner.s  already updated
            qlearner.a = action
            # update my reward
            total_reward += reward
            """  right now the problem is that q table is not being updated"""
            # reward is current reward, total_reward is cumulative reward
            # update q-table on q[qlearner.s, action] using state(future_state) and reward,
            temp_action = qlearner.query(
                next_state, reward,
                False)  # this step will not update self.s and self.a
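            # Standard tabular Q-update assumed to happen inside query():
            #   q_table[s, a] <- (1 - alpha) * q_table[s, a]
            #                    + alpha * (reward + gamma * max(q_table[next_state, :]))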
            # update state to next state, action is already updated, we good
            qlearner.s = next_state

        end_time = time.time()
        time_spend_one_episode = (end_time - start_time) * 1000
        init_time_diff += time_spend_one_episode  # cumulative time by the end of this episode

        time_list.append(init_time_diff)

        rewards_list.append(total_reward)  # total rewards for this episode
        iterations_list.append(i)  # record the step at which this episode ended

    # close the environment,  find the time difference
    env.close()

    def chunk_list(l, n):
        for i in range(0, len(l), n):
            yield l[i:i + n]

    """rewards vs # of iterations plot"""
    episode_size = int(episodes / 50)
    segments = list(chunk_list(rewards_list, episode_size))
    average_reward = [sum(segment) / len(segment) for segment in segments]

    plt.title(
        "Average Rewards vs Iterations (learning rate: 0.5, Epsilon: 0.1)")
    plt.plot(range(0, len(rewards_list), episode_size), average_reward)
    plt.xlabel("Iterations")
    plt.ylabel("Average Reward")
    plt.savefig(
        "./plots/frozen_lake_experiment/frozen_qlearner_reward_vs_iterations.png"
    )
    plt.close()
    plt.figure()
    """plot 1 done """
    """Plot 2 computation time vs episodes """
    plt.title(
        "Computation time vs episodes (learning rate: 0.5, Epsilon: 0.1)")
    plt.plot(range(0, episodes, 1), time_list)
    plt.xlabel("episodes")
    plt.ylabel("computation time (mili seconds)")
    plt.savefig(
        "./plots/frozen_lake_experiment/computation_time_vs_episodes.png")
    plt.close()
    plt.figure()
    """This is for plot #3 change alpha:0.9, rar 0.1 """
    # plot 2 alpha = 0.65 vs reward
    single_alpha = alpha[1]  # alpha = 0.9
    rewards_list = []  # this will record reward for that run
    iterations_list = []  # this will record all number of iteration
    time_list = []
    init_time_diff = 0

    qlearner = QLearner(
        num_actions=num_of_action,
        num_states=num_of_states,
        alpha=single_alpha,
        gamma=gamma,
        rar=rar[0],
        radr=radr,
    )
    for episode in range(episodes):  # total number of iterations
        start_time = time.time()
        qlearner.s = env.reset()  # current state

        done = False
        total_reward = 0  # cumulative reward for this episode
        max_steps = 10000000

        # print(state)
        for i in range(max_steps):
            if done:
                break
            # update qlearner.s by state
            """Key step, refer to the qlearner implementation """
            # start the timer

            # update s before use as an input
            # action here is either a random action or the best action of the given state
            action = qlearner.choose_best_action(
                qlearner.num_actions, qlearner.rar, qlearner.s,
                qlearner.q_table)  # use current q_table
            # qlearner.s = qlearner.s
            # get state reward  done, info from the environment
            next_state, reward, done, info = env.step(
                action)  # this will update done
            # qlearner.s = qlearner.s  already updated
            qlearner.a = action
            # update my reward
            total_reward += reward
            """  right now the problem is that q table is not being updated"""
            # reward is current reward, total_reward is cumulative reward
            # update q-table on q[qlearner.s, action] using state(future_state) and reward,
            temp_action = qlearner.query(
                next_state, reward,
                False)  # this step will not update self.s and self.a
            # update state to next state, action is already updated, we good
            qlearner.s = next_state

        end_time = time.time()
        time_spend_one_episode = (end_time - start_time) * 1000
        init_time_diff += time_spend_one_episode  # cumulative time by the end of this episode
        time_list.append(init_time_diff)

        rewards_list.append(total_reward)  # total rewards for this episode
        iterations_list.append(i)  # record the step at which this episode ended

    # close the environment,  find the time difference
    """plot 3"""
    episode_size = int(episodes / 50)
    segments = list(chunk_list(rewards_list, episode_size))
    average_reward = [sum(segment) / len(segment) for segment in segments]

    plt.title("Reward vs Iteration (Learning Rate: 0.9, Epsilon:0.1)")
    # print(single_alpha)
    plt.plot(range(0, len(rewards_list), episode_size), average_reward)
    plt.xlabel("Iterations")
    plt.ylabel("Average Rewards")
    plt.savefig(
        "./plots/frozen_lake_experiment/frozen_qlearner_rewards_vs_iter_alpha0.9.png"
    )
    plt.close()
    plt.figure()
    """plot 4 time vs iters"""
    plt.title(
        "Computation time vs episodes (learning rate: 0.9, Epsilon: 0.1)")
    plt.plot(range(0, episodes, 1), time_list)
    plt.xlabel("episodes")
    plt.ylabel("computation time (mili seconds)")
    plt.savefig(
        "./plots/frozen_lake_experiment/computation_time_vs_episodes_alpha0.9.png"
    )
    plt.close()
    plt.figure()
    """This is for plot #4  alpha: 0.5, rar(epsilon) 0.9"""
    single_alpha = alpha[0]  # alpha = 0.9
    single_rar = rar[1]
    rewards_list = []  # this will record reward for that run
    iterations_list = []  # this will record all number of iteration
    time_list = []
    init_time_diff = 0

    qlearner = QLearner(
        num_actions=num_of_action,
        num_states=num_of_states,
        alpha=single_alpha,
        gamma=gamma,
        rar=single_rar,
        radr=radr,
    )
    for episode in range(episodes):  # total number of iterations
        start_time = time.time()
        qlearner.s = env.reset()  # current state

        done = False
        total_reward = 0  # cumulative reward for this episode
        max_steps = 10000

        # print(state)
        for i in range(max_steps):
            if done:
                break
            # update qlearner.s by state
            """Key step, refer to the qlearner implementation """
            # start the timer

            # update s before use as an input
            # action here is either a random action or the best action of the given state
            action = qlearner.choose_best_action(
                qlearner.num_actions, qlearner.rar, qlearner.s,
                qlearner.q_table)  # use current q_table
            # qlearner.s = qlearner.s
            # get state reward  done, info from the environment
            next_state, reward, done, info = env.step(
                action)  # this will update done
            # qlearner.s = qlearner.s  already updated
            qlearner.a = action
            # update my reward
            total_reward += reward
            """  right now the problem is that q table is not being updated"""
            # reward is current reward, total_reward is cumulative reward
            # update q-table on q[qlearner.s, action] using state(future_state) and reward,
            temp_action = qlearner.query(
                next_state, reward,
                False)  # this step will not update self.s and self.a
            # update state to next state, action is already updated, we good
            qlearner.s = next_state

        end_time = time.time()
        time_spend_one_episode = (end_time - start_time) * 1000
        init_time_diff += time_spend_one_episode  # cumulative time by the end of this episode
        time_list.append(init_time_diff)

        rewards_list.append(total_reward)  # total rewards for this episode
        iterations_list.append(i)  # record the step at which this episode ended
    """plot 5 reward vs iteration"""
    episode_size = int(episodes / 50)
    segments = list(chunk_list(rewards_list, episode_size))
    average_reward = [sum(segment) / len(segment) for segment in segments]

    plt.title("Reward vs Iteration (Learning Rate: 0.5, Epsilon:0.9)")
    # print(single_alpha)
    plt.plot(range(0, len(rewards_list), episode_size), average_reward)
    plt.xlabel("Iterations")
    plt.ylabel("Average Rewards")
    plt.savefig(
        "./plots/frozen_lake_experiment/frozen_qlearner_rewards_vs_iter_epsilon0.9.png"
    )
    plt.close()
    plt.figure()
    """plot 6 time vs iters"""
    plt.title(
        "Computation time vs episodes (learning rate: 0.5, Epsilon: 0.9)")
    plt.plot(range(0, episodes, 1), time_list)
    plt.xlabel("episodes")
    plt.ylabel("computation time (mili seconds)")
    plt.savefig(
        "./plots/frozen_lake_experiment/computation_time_vs_episodes_epsilon0.9.png"
    )
    plt.close()
    plt.figure()