Example #1
def train():
    global count, max_rewards
    for i in range(total_episode):
        state = environment.reset()
        state, stacked_frames = stack_frames(None, state, True)
        total_rewards = 0

        for _ in range(max_steps):
            action, action_index = act(state)
            next_state, reward, done, _ = environment.step(action)
            environment.render()
            next_state, stacked_frames = stack_frames(stacked_frames,
                                                      next_state, False)
            total_rewards += reward

            if done:
                print(f"episode: {i}, total_rewards: {total_rewards}")
                next_state = environment.reset()
                next_state, stacked_frames = stack_frames(
                    None, next_state, True)
                memory.add((state, action_index, next_state, reward, done))

                if total_rewards > max_rewards:
                    max_rewards = total_rewards
                    torch.save(dqn.state_dict(), 'model/model.best.ckpt')

                break
            else:
                memory.add((state, action_index, next_state, reward, done))
                # advance to the new state before choosing the next action
                state = next_state

            # learn()
            count += 1

            if count % save_iter == 0:
                torch.save(dqn.state_dict(), 'model/model.latest.ckpt')
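This snippet (like the other examples on this page) relies on a stack_frames helper that is not shown here. A minimal sketch of what it might look like, assuming grayscale preprocessing, a resize to (110, 84) and a 4-frame deque; the frame size and the preprocess step are assumptions, and one of the later examples passes the flag as the keyword is_done rather than is_new_episode:

import numpy as np
from collections import deque
from skimage.color import rgb2gray
from skimage.transform import resize

stack_size = 4  # assumed number of frames stacked into one state

def preprocess_frame(frame):
    # Grayscale and resize; the (110, 84) target size is an assumption.
    return resize(rgb2gray(frame), (110, 84))

def stack_frames(stacked_frames, frame, is_new_episode):
    frame = preprocess_frame(frame)
    if is_new_episode:
        # New episode: fill the deque with copies of the first frame.
        stacked_frames = deque([frame] * stack_size, maxlen=stack_size)
    else:
        # Ongoing episode: push the newest frame, dropping the oldest.
        stacked_frames.append(frame)
    # Stack along the last axis to form the network input.
    stacked_state = np.stack(stacked_frames, axis=2)
    return stacked_state, stacked_frames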
Example #2
    def play(self):
        with tf.Session() as sess:
            total_test_rewards = []

            # Load the model
            self.saver.restore(sess, "models/{0}.cpkt".format(self.level_name))

            for episode in range(1):
                total_rewards = 0

                state = self.env.reset()
                state, self.stacked_frames = stack_frames(
                    self.stacked_frames, state, True)

                print("****************************************************")
                print("EPISODE ", episode)

                while True:
                    # Reshape the state
                    state = state.reshape((1, *state_size))
                    # Get action from Q-network
                    # Estimate the Qs values state
                    Qs = sess.run(self.DQNetwork.output,
                                  feed_dict={self.DQNetwork.inputs_: state})

                    # Take the biggest Q value (= the best action)
                    choice = np.argmax(Qs)

                    #Perform the action and get the next_state, reward, and done information
                    next_state, reward, done, _ = self.env.step(choice)
                    self.env.render()

                    total_rewards += reward

                    if done:
                        print("Score", total_rewards)
                        total_test_rewards.append(total_rewards)
                        break

                    next_state, self.stacked_frames = stack_frames(
                        self.stacked_frames, next_state, False)
                    state = next_state
            self.env.close()
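The snippets on this page also reference a number of module-level settings (state_size, total_episodes, max_steps, explore_start, gamma, batch_size, memory_size, and so on) that the examples do not include. A purely illustrative configuration, with placeholder values that are assumptions rather than the originals, might look like this:

# Assumed module-level hyperparameters; all values are placeholders.
state_size = [110, 84, 4]   # stacked-frame input shape (height, width, stack)
action_size = 7             # e.g. len(SIMPLE_MOVEMENT) for gym_super_mario_bros
learning_rate = 0.00025

total_episodes = 50         # training episodes
max_steps = 5000            # steps per episode
batch_size = 64

explore_start = 1.0         # initial exploration probability
explore_stop = 0.01         # minimum exploration probability
decay_rate = 0.00001        # exponential decay rate for epsilon

gamma = 0.9                 # discount factor for the Q targets
pretrain_length = batch_size
memory_size = 1000000       # replay-buffer capacity
stack_size = 4              # frames stacked per state
episode_render = False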
Example #3
def pretrain_memory():
    state = environment.reset()
    state, stacked_frames = stack_frames(None, state, True)
    for i in range(pretrain_length):
        action_index = random.randint(0, environment.action_space.n - 1)
        action = actions[action_index]
        next_state, reward, done, _ = environment.step(action)
        next_state, stacked_frames = stack_frames(stacked_frames,
                                                  next_state,
                                                  is_done=done)

        if done:
            next_state = np.zeros(environment.observation_space.shape)
            memory.add((state, action_index, next_state, reward, done))
            state = environment.reset()
            state, stacked_frames = stack_frames(None, state, True)
        else:
            memory.add((state, action_index, next_state, reward, done))
            state = next_state
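pretrain_memory fills a replay buffer called memory whose class is not shown in these examples. A minimal sketch, assuming a simple deque-backed buffer with uniform sampling; the add/sample interface matches how the buffer is used here:

from collections import deque
import numpy as np

class Memory:
    def __init__(self, max_size):
        # Oldest experiences are discarded automatically once max_size is reached.
        self.buffer = deque(maxlen=max_size)

    def add(self, experience):
        # experience is the (state, action, ..., done) tuple built by the caller.
        self.buffer.append(experience)

    def sample(self, batch_size):
        # Uniform random mini-batch without replacement.
        indices = np.random.choice(len(self.buffer), size=batch_size, replace=False)
        return [self.buffer[i] for i in indices]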
    def __init__(self, level_name):  
        self.level_name = level_name  
        # setup environment
        self.env = gym_super_mario_bros.make(level_name)
        self.env = JoypadSpace(self.env, SIMPLE_MOVEMENT)
        # one hot encoded version of our actions
        self.possible_actions = np.array(np.identity(self.env.action_space.n, dtype=int).tolist())

        # reset the default graph
        tf.reset_default_graph()
        
        # instantiate the DQNetwork
        self.DQNetwork = DQNetwork(state_size, action_size, learning_rate)
        
        # instantiate memory
        self.memory = Memory(max_size=memory_size)
        
        # initialize deque with zero images
        self.stacked_frames = deque([np.zeros((100, 128), dtype=int) for i in range(stack_size)], maxlen=4)

        for i in range(pretrain_length):    
            # If it's the first step
            if i == 0:
                state = self.env.reset()        
                state, self.stacked_frames = stack_frames(self.stacked_frames, state, True)

            # Get next state, the rewards, done by taking a random action
            choice = random.randint(0, len(self.possible_actions) - 1)
            action = self.possible_actions[choice]
            next_state, reward, done, _ = self.env.step(choice)

            # stack the frames
            next_state, self.stacked_frames = stack_frames(self.stacked_frames, next_state, False)

            # if the episode is finished (we're dead)
            if done:
                # we finished the episode
                next_state = np.zeros(state.shape)

                # add experience to memory
                self.memory.add((state, action, reward, next_state, done))

                # start a new episode
                state = self.env.reset()
                state, self.stacked_frames = stack_frames(self.stacked_frames, state, True)
            else:
                # add experience to memory
                self.memory.add((state, action, reward, next_state, done))

                # our new state is now the next_state
                state = next_state
       
        # saver will help us save our model
        self.saver = tf.train.Saver()

        # setup tensorboard writer
        self.writer = tf.summary.FileWriter("logs/")

        # losses
        tf.summary.scalar("Loss", self.DQNetwork.loss)
        
        self.write_op = tf.summary.merge_all()
    def train(self):        
        with tf.Session() as sess:
            # initialize the variables
            sess.run(tf.global_variables_initializer())

            # initialize decay rate (that will be used to reduce epsilon)
            decay_step = 0

            for episode in range(total_episodes):
                # set step to 0
                step = 0

                # initialize rewards of episode
                episode_rewards = []

                # make a new episode and observe the first state
                state = self.env.reset()

                # remember that stack frame function
                state, self.stacked_frames = stack_frames(self.stacked_frames, state, True)

                print("Episode:", episode)

                while step < max_steps:
                    step += 1
                    #print("step:", step)

                    # increase decay_step
                    decay_step += 1

                    # predict an action
                    action, choice, explore_probability = self.predict_action(sess,
                                                         explore_start, 
                                                         explore_stop, 
                                                         decay_rate, 
                                                         decay_step, 
                                                         state, 
                                                         self.possible_actions)

                    # perform the action and get the next_state, reward, and done information
                    next_state, reward, done, _ = self.env.step(choice)

                    if episode_render:
                        self.env.render()

                    # add the reward to total reward
                    episode_rewards.append(reward)

                    # the game is finished
                    if done:
                        print("done")
                        # the episode ends so no next state
                        next_state = np.zeros((110, 84), dtype=int)

                        next_state, self.stacked_frames = stack_frames(self.stacked_frames, next_state, False)

                        # set step = max_steps to end episode
                        step = max_steps

                        # get total reward of the episode
                        total_reward = np.sum(episode_rewards)

                        print("Episode:", episode, 
                              "Total reward:", total_reward, 
                              "Explore P:", explore_probability, 
                              "Training Loss:", loss)

                        #rewards_list.append((episode, total_reward))

                        # store transition <s_i, a, r_{i+1}, s_{i+1}> in memory
                        self.memory.add((state, action, reward, next_state, done))
                    else:
                        # stack frame of the next state
                        next_state, self.stacked_frames = stack_frames(self.stacked_frames, next_state, False)

                        # store transition <s_i, a, r_{i+1}, s_{i+1}> in memory
                        self.memory.add((state, action, reward, next_state, done))

                        # s_{i} := s_{i+1}
                        state = next_state

                    ### Learning part
                    # obtain random mini-batch from memory
                    batch = self.memory.sample(batch_size)
                    states_mb = np.array([each[0] for each in batch], ndmin=3)
                    actions_mb = np.array([each[1] for each in batch])
                    rewards_mb = np.array([each[2] for each in batch])
                    next_states_mb = np.array([each[3] for each in batch], ndmin=3)
                    dones_mb = np.array([each[4] for each in batch])

                    target_Qs_batch = []

                    # get Q values for next_state
                    Qs_next_state = sess.run(self.DQNetwork.output, feed_dict={self.DQNetwork.inputs_: next_states_mb})

                    # set Q_target = r if the episode ends at s_{i+1}
                    for i in range(len(batch)):
                        terminal = dones_mb[i]

                        # in a terminal state the target is just the reward
                        if terminal:
                            target_Qs_batch.append(rewards_mb[i])
                        else:
                            target = rewards_mb[i] + gamma * np.max(Qs_next_state[i])
                            target_Qs_batch.append(target)

                    targets_mb = np.array([each for each in target_Qs_batch])

                    loss, _ = sess.run([self.DQNetwork.loss, self.DQNetwork.optimizer],
                                      feed_dict={self.DQNetwork.inputs_: states_mb, 
                                                 self.DQNetwork.target_Q: targets_mb, 
                                                 self.DQNetwork.actions_: actions_mb})

                    # write tf summaries
                    summary = sess.run(self.write_op, feed_dict={self.DQNetwork.inputs_: states_mb, 
                                                 self.DQNetwork.target_Q: targets_mb, 
                                                 self.DQNetwork.actions_: actions_mb})
                    self.writer.add_summary(summary, episode)
                    self.writer.flush()

                # save model every 5 episodes
                if episode % 5 == 0:
                    self.saver.save(sess, "models/{0}.cpkt".format(self.level_name))
                    print("Model Saved")
    def play_notebook(self):
        import matplotlib.pyplot as plt
        # imports to render env to gif
        from JSAnimation.IPython_display import display_animation
        from matplotlib import animation
        from IPython.display import display

        # http://mckinziebrandon.me/TensorflowNotebooks/2016/12/21/openai.html
        def display_frames_as_gif(frames):
            """
            Displays a list of frames as a gif, with controls
            """
            #plt.figure(figsize=(frames[0].shape[1] / 72.0, frames[0].shape[0] / 72.0), dpi = 72)
            patch = plt.imshow(frames[0])
            plt.axis('off')

            def animate(i):
                patch.set_data(frames[i])

            anim = animation.FuncAnimation(plt.gcf(), animate, frames = len(frames), interval=50)
            display(display_animation(anim, default_mode='loop'))

        frames = []
        with tf.Session() as sess:
            total_test_rewards = []

            # Load the model
            self.saver.restore(sess, "models/{0}.cpkt".format(self.level_name))

            for episode in range(1):
                total_rewards = 0

                state = self.env.reset()
                state, self.stacked_frames = stack_frames(self.stacked_frames, state, True)

                print("****************************************************")
                print("EPISODE ", episode)

                while True:
                    # Reshape the state
                    state = state.reshape((1, *state_size))
                    # Get action from Q-network 
                    # Estimate the Qs values state
                    Qs = sess.run(self.DQNetwork.output, feed_dict = {self.DQNetwork.inputs_: state})

                    # Take the biggest Q value (= the best action)
                    choice = np.argmax(Qs)

                    #Perform the action and get the next_state, reward, and done information
                    next_state, reward, done, _ = self.env.step(choice)
                    frames.append(self.env.render(mode = 'rgb_array'))

                    total_rewards += reward

                    if done:
                        print ("Score", total_rewards)
                        total_test_rewards.append(total_rewards)
                        break


                    next_state, self.stacked_frames = stack_frames(self.stacked_frames, next_state, False)
                    state = next_state

            self.env.close()

        display_frames_as_gif(frames)
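The train() method above also depends on a predict_action helper that is not part of this listing. A minimal sketch, assuming a standard epsilon-greedy policy with exponentially decayed exploration; the decay formula and the reshape are assumptions chosen to match how decay_step and state are handled above, and the method would sit on the same class as train():

    def predict_action(self, sess, explore_start, explore_stop, decay_rate,
                       decay_step, state, possible_actions):
        # Decay epsilon exponentially from explore_start towards explore_stop.
        explore_probability = explore_stop + \
            (explore_start - explore_stop) * np.exp(-decay_rate * decay_step)

        if np.random.rand() < explore_probability:
            # Explore: pick a random action index.
            choice = random.randint(0, len(possible_actions) - 1)
        else:
            # Exploit: pick the action with the highest predicted Q value.
            Qs = sess.run(self.DQNetwork.output,
                          feed_dict={self.DQNetwork.inputs_: state.reshape((1, *state_size))})
            choice = int(np.argmax(Qs))

        action = possible_actions[choice]
        return action, choice, explore_probability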