Example #1
def update(agent: RL, env: Maze):
    """Overall learning flow

    Parameters
    ----------
    agent : RL
        agent that interacts with the environment
    env : Maze
        environment that returns the next state, reward, and done flag for each action
    """
    episodes = 100
    for episode in range(episodes):
        state = env.reset()
        steps = 0
        while True:
            env.render()
            action = agent.choose_action(str(state))
            n_state, reward, done = env.step(action)
            agent.learn(str(state), action, reward, str(n_state))
            state = n_state
            steps += 1
            if done:
                break
        print(f"Episode [{episode+1:03d}/{episodes:03d}]: {steps}")
    print('game over')
    env.destroy()
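
A minimal driver for update() is sketched below. It is an assumption, not part of the original example: the maze_env module name, the choice of Model.Qlearning as the agent, and the env.after()/env.mainloop() scheduling are all guesses based on the surrounding snippets (the Maze in the tests below is a Tkinter window).

# Hypothetical driver for update(); module names and constructor arguments
# are assumptions inferred from the other examples on this page.
import Model
from maze_env import Maze  # assumed module providing the Maze class

if __name__ == '__main__':
    env = Maze(height=10, width=10)
    agent = Model.Qlearning(greedy_rate=0.9, learning_rate=0.01, reward_decay=0.9)
    # Tkinter-based mazes typically need the learning loop scheduled on the GUI thread.
    env.after(100, lambda: update(agent, env))
    env.mainloop()
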
def m_test(judge_number):
    global MonteCarlo_brain_
    global env
    env = Maze(height=10, width=10)
    MonteCarlo_brain_ = Model.Monte(greedy_rate=0.5,
                                    learning_rate=0.9,
                                    reward_decay=0.9)
    T1 = time.perf_counter()
    update(judge_number=judge_number,
           judge_method='repeated steps',
           delay_time=0.00)
    T2 = time.perf_counter()
    running_time = (T2 - T1) * 1000
    episode = max(plot_episode)
    env.destroy()
    env.mainloop()
    plot_episode.clear()  # clear accumulated episode data so repeated calls start fresh (see the note in s_test)
    return running_time, episode, plot_sum_reward
def s_test(judge_number):
    # This function is designed to be imported by another file to run tests over multiple Sarsa learning cases,
    # e.g. running Sarsa learning 20 times to get the average episode count and running time.
    global Sarsa_brain_
    global env
    env = Maze(height=10, width=10)
    Sarsa_brain_ = Model.SARSA(greedy_rate=0.9, learning_rate=0.01, reward_decay=0.9)
    T1 = time.perf_counter()
    update(judge_number=judge_number, judge_method='repeated steps', delay_time=0.00)
    T2 = time.perf_counter()
    running_time = (T2 - T1) * 1000
    episode = max(plot_episode)
    env.destroy()
    env.mainloop()
    plot_episode.clear()
    # Without this step there is a subtle bug: when this function is imported and called in a for loop,
    # the data in plot_episode accumulates across iterations,
    # so it must be cleared at the end.
    return running_time, episode, plot_sum_reward
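
The pattern described in the comments above (importing s_test and running it repeatedly to average episode counts and running time) might look like the sketch below; the module name run_this and the judge_number value are placeholders, not names from the original project.

# Hypothetical batch driver: average Sarsa performance over 20 runs.
# "run_this" and judge_number=10 are placeholder assumptions.
from run_this import s_test

times, episodes = [], []
for trial in range(20):
    running_time, episode, rewards = s_test(judge_number=10)
    times.append(running_time)
    episodes.append(episode)

print(f"mean running time: {sum(times) / len(times):.1f} ms")
print(f"mean episodes:     {sum(episodes) / len(episodes):.1f}")
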
Example #4
def q_test(judge_number):
    global Q_brain_
    global env

    env = Maze(height=10, width=10)
    Q_brain_ = Model.Qlearning(greedy_rate=0.9,
                               learning_rate=0.01,
                               reward_decay=0.9)
    T1 = time.perf_counter()
    update(judge_number=judge_number,
           judge_method='repeated steps',
           delay_time=0.00)
    T2 = time.perf_counter()
    running_time = (T2 - T1) * 1000
    episode = max(plot_episode)
    env.destroy()
    env.mainloop()
    plot_episode.clear()
    # Without this step there is a subtle bug: when this function is imported and called in a for loop,
    # the data in plot_episode accumulates across iterations,
    # so it must be cleared at the end.
    return running_time, episode, plot_sum_reward
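
Because m_test, s_test, and q_test share the same signature and return values, the three agents can be compared side by side; again, the importing module name and judge_number value are assumptions.

# Hypothetical comparison across the three agents; "run_this" is assumed.
from run_this import m_test, s_test, q_test

for name, test in [('MonteCarlo', m_test), ('Sarsa', s_test), ('Q-learning', q_test)]:
    running_time, episode, rewards = test(judge_number=10)
    print(f"{name:>10}: {running_time:.1f} ms, {episode} episodes")
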
Example #5
            # after an initial 20-step warm-up, learn every 10 steps and
            # refresh the target network every 20 steps
            if (step > 20) and (i_step % 10 == 0):
                q_learning.update()
                print("learning")
            if (step > 20) and (i_step % 20 == 0):
                q_learning.update_target_net()

            # record this step's reward
            R.append(reward)
            if done:
                break
        q_learning.add_record(sum(R), i_step)

    print("Done !!!")
    env.destroy()

    
    steps = [q_learning.reward_his[x][1] for x in range(episode) ]
    
    plt.figure()
    plt1 = plt.subplot(121)
    plt1.plot(range(len(steps)), np.array(steps))
    plt2 = plt.subplot(122)
    plt2.plot(range(len(q_learning.loss_his)), q_learning.loss_his)
    plt.show() 
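
The snippet above starts mid-loop; a hedged reconstruction of the episode loop it appears to belong to is sketched below. Everything not present in the fragment itself, including the already-constructed env and q_learning objects, the env.reset/step and choose_action/store_transition interfaces, and the per-episode step cap, is an assumption, not the original code.

# Hedged reconstruction of the surrounding training loop; interfaces that do
# not appear in the fragment above are assumptions.
episode = 100
step = 0                                   # global step counter across episodes
for i_episode in range(episode):
    observation = env.reset()
    R = []                                 # per-episode rewards
    for i_step in range(1000):             # assumed per-episode step cap
        env.render()
        action = q_learning.choose_action(observation)
        observation_, reward, done = env.step(action)
        q_learning.store_transition(observation, action, reward, observation_)
        step += 1

        if (step > 20) and (i_step % 10 == 0):
            q_learning.update()            # train the online network
        if (step > 20) and (i_step % 20 == 0):
            q_learning.update_target_net() # sync the target network

        R.append(reward)
        observation = observation_
        if done:
            break
    q_learning.add_record(sum(R), i_step)
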




Example #6
class CustomGym(Env):
    """The main OpenAI Gym class. It encapsulates an environment with
    arbitrary behind-the-scenes dynamics. An environment can be
    partially or fully observed.
    The main API methods that users of this class need to know are:
        step
        reset
        render
        close
        seed
    And set the following attributes:
        action_space: The Space object corresponding to valid actions
        observation_space: The Space object corresponding to valid observations
        reward_range: A tuple corresponding to the min and max possible rewards
    Note: a default reward range set to [-inf,+inf] already exists. Set it if you want a narrower range.
    The methods are accessed publicly as "step", "reset", etc...
    """
    def __init__(
        self,
        agentXY,
        goalXY,
        walls=[],
        pits=[],
        title='Maze',
    ):
        super(CustomGym, self).__init__()
        self.env = Maze(agentXY, goalXY, walls, pits, title)
        self.title = title
        self.action_space = spaces.Discrete(self.env.n_actions)
        # the bounds of the raw maze observation are not known precisely,
        # so leave the Box unbounded
        self.observation_space = spaces.Box(low=-np.inf,
                                            high=np.inf,
                                            shape=(4, ),
                                            dtype=np.float32)

        self.rewards = [[]]
        self.variance = []
        self.median = []

    def step(self, action):
        """Run one timestep of the environment's dynamics. When end of
        episode is reached, you are responsible for calling `reset()`
        to reset this environment's state.
        Accepts an action and returns a tuple (observation, reward, done, info).
        Args:
            action (object): an action provided by the agent
        Returns:
            observation (object): agent's observation of the current environment
            reward (float) : amount of reward returned after previous action
            done (bool): whether the episode has ended, in which case further step() calls will return undefined results
            info (dict): contains auxiliary diagnostic information (helpful for debugging, and sometimes learning)
        """

        s_, reward, done = self.env.step(action)

        self.rewards[-1].append(reward)
        if done:
            self.variance.append(np.var(self.rewards[-1]))
            self.median.append(np.median(self.rewards[-1]))
            self.rewards.append([])

        return s_, reward, done, {}

    def render(self, mode='human'):
        self.env.render()

    def reset(self, value=1, resetAgent=True):
        return self.env.reset()

    def seed(self, seed=None):
        """Sets the seed for this env's random number generator(s).
        Note:
            Some environments use multiple pseudorandom number generators.
            We want to capture all such seeds used in order to ensure that
            there aren't accidental correlations between multiple generators.
        Returns:
            list<bigint>: Returns the list of seeds used in this env's random
              number generators. The first value in the list should be the
              "main" seed, or the value which a reproducer should pass to
              'seed'. Often, the main seed equals the provided 'seed', but
              this won't be true if seed=None, for example.
        """
        if seed is None:
            seed = 10
        np.random.seed(seed)
        random.seed(seed)
        return [seed]

    def save_csv(self):
        with open(f"./data/{self.title}_rewards_{time.time()}",
                  "w+") as my_csv:
            csvWriter = csv.writer(my_csv, delimiter=',')
            csvWriter.writerows(self.rewards)
        with open(f"./data/{self.title}_variance_{time.time()}",
                  "w+") as my_csv:
            csvWriter = csv.writer(my_csv, delimiter=',')
            for var in self.variance:
                csvWriter.writerow([var])
        with open(f"./data/{self.title}_median_{time.time()}", "w+") as my_csv:
            csvWriter = csv.writer(my_csv, delimiter=',')
            for med in self.median:
                csvWriter.writerow([med])

    def destroy(self):
        self.env.destroy()
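
A minimal random-rollout sketch showing the CustomGym wrapper in use; the agent/goal coordinates, walls, and pits below are placeholder values, not data from the original project.

# Hypothetical usage of CustomGym with a random policy; coordinates and
# obstacle lists are placeholder assumptions.
agentXY = [0, 0]
goalXY = [4, 4]
walls = [[2, 2], [3, 2]]
pits = [[1, 3]]

env = CustomGym(agentXY, goalXY, walls=walls, pits=pits, title='Maze')
env.seed()

for episode in range(5):
    obs = env.reset()
    done = False
    while not done:
        env.render()
        action = env.action_space.sample()   # random policy, just to exercise the API
        obs, reward, done, info = env.step(action)

env.save_csv()
env.destroy()
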