Example #1
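The listing omits its imports; a minimal header along the following lines is assumed (the project-local module names environment, dqn_model, replay_memory and utils are placeholders, not confirmed by the source):

import os
import pickle
from datetime import datetime
from random import random, randrange

import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable  # legacy (pre-0.4) PyTorch API used throughout

# Project-local helpers -- module paths are assumptions
from environment import Environment, game_name, dimensions
from dqn_model import TwinDQN
from replay_memory import ReplayMemory
from utils import gray2pytorch

# Tensor aliases used below (a common pre-0.4 PyTorch pattern)
use_cuda = torch.cuda.is_available()
FloatTensor = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if use_cuda else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if use_cuda else torch.ByteTensor
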
class DoubleAgent(object):
    def __init__(self,
                 game1,
                 game2,
                 mem_size = 1000000,
                 state_buffer_size = 4,
                 batch_size = 64,
                 learning_rate = 1e-5,
                 pretrained_model = None,
                 pretrained_subnet1 = False,
                 pretrained_subnet2 = False,
                 frameskip = 4,
                 frozen = False
                 ):
        """
        Inputs:
        - game1: string to select game 1
        - game2: string to select game 2
        - mem_size: int length of the replay memory
        - state_buffer_size: int number of recent frames used as input for neural network
        - batch_size: int
        - learning_rate: float
        - pretrained_model: str path to the model
        - pretrained_subnet1: str path to the model of subnet 1 (or False)
        - pretrained_subnet2: str path to the model of subnet 2 (or False)
        - frameskip: int number of frames to skip per action
        - frozen: boolean freeze the pretrained subnets
        """

        # Namestring
        self.game1 = game1
        self.game2 = game2

        # Environment
        self.env1 = Environment(game_name[game1], dimensions[game1], frameskip=frameskip)
        self.env2 = Environment(game_name[game2], dimensions[game2], frameskip=frameskip)


        # Neural net
        self.pretrained_subnet1 = pretrained_subnet1
        self.pretrained_subnet2 = pretrained_subnet2
        self.net = TwinDQN(channels_in = state_buffer_size,
                             num_actions = self.env2.get_number_of_actions(),
                             pretrained_subnet1 = pretrained_subnet1,
                             pretrained_subnet2 = pretrained_subnet2,
                             frozen = frozen)
        self.target_net = TwinDQN(channels_in = state_buffer_size,
                                    num_actions = self.env2.get_number_of_actions(),
                                    pretrained_subnet1 = pretrained_subnet1,
                                    pretrained_subnet2 = pretrained_subnet2,
                                    frozen = frozen)
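        # target_net is a periodically synced copy of net: optimize() bootstraps its
        # targets from it and copies net's weights over every
        # update_target_net_each_k_steps optimizer updates (standard DQN target network)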

        # Cuda
        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.net.cuda()
            self.target_net.cuda()

        # Pretrained
        if pretrained_model:
            self.net.load(pretrained_model)
            self.target_net.load(pretrained_model)
            self.pretrained_model = True
        else:
            self.pretrained_model = False

        # Optimizer
        self.learning_rate = learning_rate
        self.optimizer = optim.Adam(filter(lambda p: p.requires_grad, self.net.parameters()),
                                    lr=learning_rate)
        #self.optimizer = optim.RMSprop(filter(lambda p: p.requires_grad, self.net.parameters()),
        #                               lr=learning_rate,alpha=0.95, eps=0.01)

        self.batch_size = batch_size
        self.optimize_each_k = 1
        self.update_target_net_each_k_steps = 10000
        self.noops_count = 0

        # Replay Memory (Long term memory)
        self.replay = ReplayMemory(capacity=mem_size, num_history_frames=state_buffer_size)
        self.mem_size = mem_size

        # Fill replay memory before training
        if not self.pretrained_model:
            self.start_train_after = 50000
        else:
            self.start_train_after = mem_size//2

        # Buffer for the most recent states (Short term memory)
        self.num_stored_frames = state_buffer_size

        # Steps
        self.steps = 0

        # Save net
        self.save_net_each_k_episodes = 500


    def select_action(self, observation, mode='train'):
        """
        Select a random action from the action space or the action
        proposed by the neural network, depending on epsilon

        Inputs:
        - observation: FloatTensor with the stacked input frames

        Returns:
        - action: int
        """
        # Hyperparameters
        EPSILON_START = 1
        EPSILON_END = 0.1
        EPSILON_DECAY = 1000000
        EPSILON_PLAY = 0.01
        MAXNOOPS = 30

        # Decrease of epsilon value
        if not self.pretrained_model:
            #epsilon = EPSILON_END + (EPSILON_START - EPSILON_END) * \
            #                        np.exp(-1. * (self.steps-self.batch_size) / EPSILON_DECAY)
            epsilon = EPSILON_START - self.steps * (EPSILON_START - EPSILON_END) / EPSILON_DECAY
        elif mode=='play':
            epsilon = EPSILON_PLAY
        else:
            epsilon = EPSILON_END

        if epsilon < random():
            # Action according to neural net
            # Wrap tensor into variable
            state_variable = Variable(observation, volatile=True)

            # Evaluate network and return action with maximum of activation
            action = self.net(state_variable).data.max(1)[1].view(1,1)

            # Prevent stalling: force FIRE (action 1) after MAXNOOPS consecutive non-FIRE actions
            if action[0,0]!=1:
                self.noops_count += 1
                if self.noops_count == MAXNOOPS:
                    action[0,0] = 1
                    self.noops_count = 0
            else:
                self.noops_count = 0
        else:
            # Random action
            action = self.env2.sample_action()
            action = LongTensor([[action]])

        return action


    def map_action(self, action):
        """
        Maps an action from game 2 (the larger action set)
        to the action space of game 1 (the smaller action set)

        Inputs:
        - action: int
        Returns:
        - action: int
        """
        # Map SpaceInvaders on Breakout
        if self.game1=='Breakout' and self.game2=='SpaceInvaders':
            if action>3: # shoot+right/left --> right/left
                return action-2

        # Map Assault on SpaceInvaders
        if self.game1=='SpaceInvaders' and self.game2=='Assault':
            if action!=0: # all actions except 2nd idle
                return action-1

        # Map Phoenix on SpaceInvaders
        if self.game1=='SpaceInvaders' and self.game2=='Phoenix':
            if action==4: # shield --> idle
                return 0
            if action==7: # shield+shot --> shot
                return 1
            if action>4: # shoot+right/left --> shoot+right/left
                return action-1

        # Map Phoenix on Assault
        if self.game1=='Assault' and self.game2=='Phoenix':
            if action==4: # shield --> idle
                return 0
            if action==7: # shield+shot --> shot
                return 2
            if 1 <= action <= 3: # shot/right/left --> shot/right/left
                return action+1

        # No mapping necessary
        return action


    def optimize(self, net_updates):
        """
        Optimizer function

        Inputs:
        - net_updates: int

        Returns:
        - loss: float, or None while the replay memory is still being filled
        """
        # Hyperparameter
        GAMMA = 0.99

        #   not enough memory yet
        if len(self.replay) < self.start_train_after:
            return

        # Sample a transition
        batch = self.replay.sampleTransition(self.batch_size)

        # Mask to indicate which states are not final (=done=game over)
        non_final_mask = ByteTensor(list(map(lambda ns: ns is not None, batch.next_state)))

        # Wrap tensors in variables
        state_batch = Variable(torch.cat(batch.state))
        action_batch = Variable(torch.cat(batch.action))
        reward_batch = Variable(torch.cat(batch.reward))
        non_final_next_states = Variable(torch.cat([ns for ns in batch.next_state if ns is not None]),
                                              volatile=True) # volatile==true prevents calculation of the derivative

        next_state_values = Variable(torch.zeros(self.batch_size).type(FloatTensor), volatile=False)

        if self.use_cuda:
            state_batch = state_batch.cuda()
            action_batch = action_batch.cuda()
            reward_batch = reward_batch.cuda()
            non_final_mask = non_final_mask.cuda()
            non_final_next_states = non_final_next_states.cuda()
            next_state_values = next_state_values.cuda()

        # Compute Q(s_t, a) - the self.net computes Q(s_t), then we select the
        # columns of actions taken
        state_action_values = self.net(state_batch).gather(1, action_batch)

        # Compute V(s_{t+1}) for all next states.
        next_max_values = self.target_net(non_final_next_states).detach().max(1)[0]
        next_state_values[non_final_mask]= next_max_values

        # Compute the expected Q values
        expected_state_action_values = (next_state_values * GAMMA) + reward_batch
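        # For terminal transitions the mask leaves next_state_values at zero,
        # so the target reduces to the immediate reward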

        # Compute Huber loss
        loss = F.smooth_l1_loss(state_action_values, expected_state_action_values)

        self.optimizer.zero_grad()


        loss.backward()
        for param in self.net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

        if net_updates%self.update_target_net_each_k_steps==0:
            self.target_net.load_state_dict(self.net.state_dict())
            print('target_net update!')

        return loss.data.cpu().numpy()[0]


    def play(self, n):
        """
        Play a game with the current net and render it

        Inputs:
        - n: games to play
        """
        for i in range(n):
            done = False # games end indicator variable

            # Score counter
            total_reward_game1 = 0
            total_reward_game2 = 0
            total_reward = 0

            # Reset game
            screen1 = self.env1.reset()
            screen2 = self.env2.reset()

            # list of k last frames
            last_k_frames = []
            for j in range(self.num_stored_frames):
                last_k_frames.append(None)
                last_k_frames[j] = torch.cat((gray2pytorch(screen1),gray2pytorch(screen2)),dim=1)

            # frame is saved as ByteTensor -> convert to gray value between 0 and 1
            state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0

            while not done:
                action = self.select_action(state, mode='play')[0,0]
                action1 = self.map_action(action)
                action2 = action

                # perform selected action on game
                screen1, reward1, _, done1, _ = self.env1.step(action1, mode='play')
                screen2, reward2, _, done2, _ = self.env2.step(action2, mode='play')

                # Logging
                total_reward_game1 += int(reward1)
                total_reward_game2 += int(reward2)
                total_reward += int(reward1) + int(reward2)

                # save latest frame, discard oldest
                for j in range(self.num_stored_frames-1):
                    last_k_frames[j] = last_k_frames[j+1]
                last_k_frames[self.num_stored_frames-1] = torch.cat((gray2pytorch(screen1),gray2pytorch(screen2)),dim=1)

                # convert frames to range 0 to 1 again
                state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0

                # Merged game over indicator
                done = done1 or done2
            print('Final scores Game ({}/{}): {}: {}    '.format(i+1, n, self.game1, total_reward_game1) +
                  '{}: {}    '.format(self.game2, total_reward_game2) +
                  'total: {}'.format(total_reward))
        self.env1.game.close()
        self.env2.game.close()


    def play_stats(self, n_games, mode='random'):
        """
        Play N games randomly or evaluate a net and log results for statistics

        Input:
        - n_games: int Number of games to play
        - mode: str 'random' or 'evaluation'
        """
        # Subdirectory for logging
        sub_dir = mode + '_' + self.game1 + '+' + self.game2 + '/'
        if not os.path.exists(sub_dir):
            os.makedirs(sub_dir)

        # Store history total
        reward_history = []
        reward_clamped_history = []
        # Store history game 1
        reward_history_game1 = []
        reward_clamped_history_game1 = []
        # Store history game 2
        reward_history_game2 = []
        reward_clamped_history_game2 = []

        # Number of actions to sample from
        n_actions = self.env2.get_number_of_actions()

        for i_episode in range(1, n_games+1):
            # Reset game
            screen1 = self.env1.reset()
            screen2 = self.env2.reset()

            # Store screen
            if mode=='evaluation':
                # list of k last frames
                last_k_frames = []
                for j in range(self.num_stored_frames):
                    last_k_frames.append(None)
                    last_k_frames[j] = torch.cat((gray2pytorch(screen1),gray2pytorch(screen2)),dim=1)
                # frame is saved as ByteTensor -> convert to gray value between 0 and 1
                state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0

            # games end indicator variable
            done = False

            # reset score to the initial number of lives, because every lost life adds -1
            total_reward_game1 = 0
            total_reward_clamped_game1 = self.env1.get_lives()
            total_reward_game2 = 0
            total_reward_clamped_game2 = self.env2.get_lives()
            # total scores for both games
            total_reward = total_reward_game1 + total_reward_game2
            total_reward_clamped = total_reward_clamped_game1 + total_reward_clamped_game2

            while not done:
                if mode=='random':
                    action = randrange(n_actions)
                elif mode=='evaluation':
                    action = self.select_action(state, mode='play')[0,0]
                action1 = self.map_action(action)
                action2 = action

                screen1, reward1, reward1_clamped, done1, _ = self.env1.step(action1)
                screen2, reward2, reward2_clamped, done2, _ = self.env2.step(action2)

                # Logging
                total_reward_game1 += int(reward1)
                total_reward_game2 += int(reward2)
                total_reward += int(reward1) + int(reward2)
                total_reward_clamped_game1 += reward1_clamped
                total_reward_clamped_game2 += reward2_clamped
                total_reward_clamped += reward1_clamped + reward2_clamped

                if mode=='evaluation':
                    # save latest frame, discard oldest
                    for j in range(self.num_stored_frames-1):
                        last_k_frames[j] = last_k_frames[j+1]
                    last_k_frames[self.num_stored_frames-1] = torch.cat((gray2pytorch(screen1),gray2pytorch(screen2)),dim=1)

                    # convert frames to range 0 to 1 again
                    state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0

                # Merged game over indicator
                done = done1 or done2

            # Print current result
            print('Episode: {:6}/{:6} |   '.format(i_episode, n_games) +
                  'score total: ({:6.1f}/{:7.1f}) |   '.format(total_reward_clamped,total_reward) +
                  'score game1: ({:6.1f}/{:7.1f}) |   '.format(total_reward_clamped_game1,total_reward_game1) +
                  'score game2: ({:6.1f}/{:7.1f})'.format(total_reward_clamped_game2,total_reward_game2))

            # Save rewards
            reward_history_game1.append(total_reward_game1)
            reward_history_game2.append(total_reward_game2)
            reward_history.append(total_reward)
            reward_clamped_history_game1.append(total_reward_clamped_game1)
            reward_clamped_history_game2.append(total_reward_clamped_game2)
            reward_clamped_history.append(total_reward_clamped)

        avg_reward_total = np.sum(reward_history) / len(reward_history)
        avg_reward_total_clamped = np.sum(reward_clamped_history) / len(reward_clamped_history)
        avg_reward_game1 = np.sum(reward_history_game1) / len(reward_history_game1)
        avg_reward_game1_clamped = np.sum(reward_clamped_history_game1) / len(reward_clamped_history_game1)
        avg_reward_game2 = np.sum(reward_history_game2) / len(reward_history_game2)
        avg_reward_game2_clamped = np.sum(reward_clamped_history_game2) / len(reward_clamped_history_game2)

        # Print final result
        print('\n\n===========================================\n' +
              'avg score after {:6} episodes:\n'.format(n_games) +
              'avg total: ({:6.1f}/{:7.1f})\n'.format(avg_reward_total_clamped,avg_reward_total) +
              'avg game1: ({:6.1f}/{:7.1f})\n'.format(avg_reward_game1_clamped,avg_reward_game1) +
              'avg game2: ({:6.1f}/{:7.1f})\n'.format(avg_reward_game2_clamped,avg_reward_game2))

        # Log results to files
        with open(sub_dir + mode + '.txt', 'w') as fp:
            fp.write('avg score after {:6} episodes:\n'.format(n_games) +
                     'avg total: ({:6.1f}/{:7.1f})\n'.format(avg_reward_total_clamped,avg_reward_total) +
                     'avg game1: ({:6.1f}/{:7.1f})\n'.format(avg_reward_game1_clamped,avg_reward_game1) +
                     'avg game2: ({:6.1f}/{:7.1f})\n'.format(avg_reward_game2_clamped,avg_reward_game2))

        # Dump reward
        with open(sub_dir + mode + '_reward_game1.pickle', 'wb') as fp:
            pickle.dump(reward_history_game1, fp)
        with open(sub_dir + mode + '_reward_game2.pickle', 'wb') as fp:
            pickle.dump(reward_history_game2, fp)
        with open(sub_dir + mode + '_reward_total.pickle', 'wb') as fp:
            pickle.dump(reward_history, fp)

        with open(sub_dir + mode + '_reward_clamped_game1', 'wb') as fp:
            pickle.dump(reward_clamped_history_game1, fp)
        with open(sub_dir + mode + '_reward_clamped_game2', 'wb') as fp:
            pickle.dump(reward_clamped_history_game2, fp)
        with open(sub_dir + mode + '_reward_clamped_total', 'wb') as fp:
            pickle.dump(reward_clamped_history, fp)


    def train(self):
        """
        Train the agent
        """
        num_episodes = 100000
        net_updates = 0

        # Logging
        sub_dir = self.game1 + '+' + self.game2 + '_' + datetime.now().strftime('%Y%m%d_%H%M%S') + '/'
        os.makedirs(sub_dir)
        logfile = sub_dir + 'train.txt'
        reward_file = sub_dir + 'reward.pickle'
        reward_file_game1 = sub_dir + 'reward_game1.pickle'
        reward_file_game2 = sub_dir + 'reward_game2.pickle'
        reward_clamped_file = sub_dir + 'reward_clamped.pickle'
        reward_clamped_file_game1 = sub_dir + 'reward_clamped_game1.pickle'
        reward_clamped_file_game2 = sub_dir + 'reward_clamped_game2.pickle'
        log_avg_episodes = 50

        # Total scores
        best_score = 0
        best_score_clamped = 0
        avg_score = 0
        avg_score_clamped = 0
        reward_history = []
        reward_clamped_history = []
        # Scores game 1
        avg_score_game1 = 0
        avg_score_clamped_game1 = 0
        reward_history_game1 = []
        reward_clamped_history_game1 = []
        # Scores game 2
        avg_score_game2 = 0
        avg_score_clamped_game2 = 0
        reward_history_game2 = []
        reward_clamped_history_game2 = []

        # Initialize logfile with header
        with open(logfile, 'w') as fp:
            fp.write(datetime.now().strftime('%Y-%m-%d %H:%M:%S') + '\n' +
                     'Trained game (first):               {}\n'.format(self.game1) +
                     'Trained game (second):              {}\n'.format(self.game2) +
                     'Learning rate:                      {:.2E}\n'.format(self.learning_rate) +
                     'Batch size:                         {:d}\n'.format(self.batch_size) +
                     'Memory size(replay):                {:d}\n'.format(self.mem_size) +
                     'Pretrained:                         {}\n'.format(self.pretrained_model) +
                     'Pretrained subnet 1:                {}\n'.format(self.pretrained_subnet1) +
                     'Pretrained subnet 2:                {}\n'.format(self.pretrained_subnet2) +
                     'Started training after k frames:    {:d}\n'.format(self.start_train_after) +
                     'Optimized after k frames:           {:d}\n'.format(self.optimize_each_k) +
                     'Target net update after k frames:   {:d}\n\n'.format(self.update_target_net_each_k_steps) +
                     '--------+-----------+----------------------+------------' +
                     '----------+----------------------+--------------------\n' +
                     'Episode | Steps     | ' +
                     '{:3} games avg total  | '.format(log_avg_episodes) +
                     '{:3} games avg game1  | '.format(log_avg_episodes) +
                     '{:3} games avg game2  | '.format(log_avg_episodes) +
                     'best score total \n' +
                     '--------+-----------+----------------------+------------' +
                     '----------+----------------------+--------------------\n')

        print('Started training...\nLogging to {}\n'.format(sub_dir) +
              'Episode | Steps     |   score total        |   score game 1       |   ' +
              'score game 2       | best score total')

        for i_episode in range(1,num_episodes):
            # reset game at the start of each episode
            screen1 = self.env1.reset()
            screen2 = self.env2.reset()

            # list of k last frames
            last_k_frames = []
            for j in range(self.num_stored_frames):
                last_k_frames.append(None)
                last_k_frames[j] = torch.cat((gray2pytorch(screen1),
                                              gray2pytorch(screen2)), dim=1)

            if i_episode == 1:
                self.replay.pushFrame(last_k_frames[0].cpu())

            # frame is saved as ByteTensor -> convert to gray value between 0 and 1
            state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0

            # games end indicator variable
            done1 = False
            done2 = False

            # reset score to the initial number of lives, because every lost life adds -1
            total_reward_game1 = 0
            total_reward_clamped_game1 = self.env1.get_lives()
            total_reward_game2 = 0
            total_reward_clamped_game2 = self.env2.get_lives()
            # total scores for both games
            total_reward = total_reward_game1 + total_reward_game2
            total_reward_clamped = total_reward_clamped_game1 + total_reward_clamped_game2

            # Loop over one game
            while not done1 and not done2:
                self.steps +=1

                action = self.select_action(state)
                action1 = self.map_action(action[0,0])
                action2 = action[0,0]

                # perform selected action on game
                screen1, reward1, reward1_clamped, done1, _ = self.env1.step(action1)
                screen2, reward2, reward2_clamped, done2, _ = self.env2.step(action2)

                # Logging
                total_reward_game1 += int(reward1)
                total_reward_game2 += int(reward2)
                total_reward += int(reward1) + int(reward2)
                total_reward_clamped_game1 += reward1_clamped
                total_reward_clamped_game2 += reward2_clamped
                total_reward_clamped += reward1_clamped + reward2_clamped

                # Bake reward into tensor
                reward = torch.FloatTensor([reward1_clamped+reward2_clamped])

                #   save latest frame, discard oldest
                for j in range(self.num_stored_frames-1):
                    last_k_frames[j] = last_k_frames[j+1]
                last_k_frames[self.num_stored_frames-1] = torch.cat((gray2pytorch(screen1),
                                                                     gray2pytorch(screen2)), dim=1)

                # convert frames to range 0 to 1 again
                if not done1 and not done2:
                    next_state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0
                else:
                    next_state = None

                # Store transition
                self.replay.pushFrame(last_k_frames[self.num_stored_frames - 1].cpu())
                self.replay.pushTransition((self.replay.getCurrentIndex()-1)%self.replay.capacity,
                                            action, reward, done1 or done2)

                # only optimize every k-th step
                if self.steps%self.optimize_each_k == 0:
                    self.optimize(net_updates)
                    net_updates += 1

                # set current state to next state to select next action
                if next_state is not None:
                    state = next_state

                if self.use_cuda:
                    state = state.cuda()

                # plays episode until there are no more lives left ( == done)
                if done1 or done2:
                    break

            # Save rewards
            reward_history_game1.append(total_reward_game1)
            reward_history_game2.append(total_reward_game2)
            reward_history.append(total_reward)
            reward_clamped_history_game1.append(total_reward_clamped_game1)
            reward_clamped_history_game2.append(total_reward_clamped_game2)
            reward_clamped_history.append(total_reward_clamped)

            # Sum up for averages
            avg_score_clamped_game1 += total_reward_clamped_game1
            avg_score_clamped_game2 += total_reward_clamped_game2
            avg_score_clamped += total_reward_clamped
            avg_score_game1 += total_reward_game1
            avg_score_game2 += total_reward_game2
            avg_score += total_reward

            if total_reward_clamped > best_score_clamped:
                best_score_clamped = total_reward_clamped
            if total_reward > best_score:
                best_score = total_reward

            print('{:7} | '.format(i_episode) +
                  '{:9} |     '.format(self.steps) +
                  '({:6.1f}/{:7.1f}) |     '.format(total_reward_clamped,total_reward) +
                  '({:6.1f}/{:7.1f}) |     '.format(total_reward_clamped_game1,total_reward_game1) +
                  '({:6.1f}/{:7.1f}) |  '.format(total_reward_clamped_game2,total_reward_game2) +
                  '({:6.1f}/{:8.1f})'.format(best_score_clamped, best_score))

            if i_episode % log_avg_episodes == 0 and i_episode!=0:
                avg_score_clamped_game1 /= log_avg_episodes
                avg_score_clamped_game2 /= log_avg_episodes
                avg_score_clamped /= log_avg_episodes
                avg_score_game1 /= log_avg_episodes
                avg_score_game2 /= log_avg_episodes
                avg_score /= log_avg_episodes

                print('--------+-----------+----------------------+------------' +
                     '----------+----------------------+--------------------\n' +
                      'Episode | Steps     | ' +
                      '{:3} games avg total  | '.format(log_avg_episodes) +
                      '{:3} games avg game1  | '.format(log_avg_episodes) +
                      '{:3} games avg game2  | '.format(log_avg_episodes) +
                      'best score total \n' +
                      '{:7} | '.format(i_episode) +
                      '{:9} |     '.format(self.steps) +
                      '({:6.1f}/{:7.1f}) |     '.format(avg_score_clamped,avg_score) +
                      '({:6.1f}/{:7.1f}) |     '.format(avg_score_clamped_game1,avg_score_game1) +
                      '({:6.1f}/{:7.1f}) |  '.format(avg_score_clamped_game2,avg_score_game2) +
                      '({:6.1f}/{:8.1f})\n'.format(best_score_clamped, best_score) +
                      '\nLogging to file...\n\n'
                      '--------+-----------+----------------------+------------' +
                      '----------+----------------------+--------------------\n' +
                      'Episode | Steps     |   score total        |   score game 1       |   ' +
                      'score game 2       | best score total')
                # Logfile
                with open(logfile, 'a') as fp:
                    fp.write('{:7} | '.format(i_episode) +
                             '{:9} |     '.format(self.steps) +
                             '({:6.1f}/{:7.1f}) |     '.format(avg_score_clamped,avg_score) +
                             '({:6.1f}/{:7.1f}) |     '.format(avg_score_clamped_game1,avg_score_game1) +
                             '({:6.1f}/{:7.1f}) |  '.format(avg_score_clamped_game2,avg_score_game2) +
                             '({:6.1f}/{:8.1f})\n'.format(best_score_clamped, best_score))
                # Dump reward
                with open(reward_file_game1, 'wb') as fp:
                    pickle.dump(reward_history_game1, fp)
                with open(reward_file_game2, 'wb') as fp:
                    pickle.dump(reward_history_game2, fp)
                with open(reward_file, 'wb') as fp:
                    pickle.dump(reward_history, fp)

                with open(reward_clamped_file_game1, 'wb') as fp:
                    pickle.dump(reward_clamped_history_game1, fp)
                with open(reward_clamped_file_game2, 'wb') as fp:
                    pickle.dump(reward_clamped_history_game2, fp)
                with open(reward_clamped_file, 'wb') as fp:
                    pickle.dump(reward_clamped_history, fp)

                avg_score_clamped_game1 = 0
                avg_score_clamped_game2 = 0
                avg_score_clamped = 0
                avg_score_game1 = 0
                avg_score_game2 = 0
                avg_score = 0

            if i_episode % self.save_net_each_k_episodes == 0:
                with open(logfile, 'a') as fp:
                    fp.write('Saved model at episode ' + str(i_episode) + '...\n')
                self.target_net.save(sub_dir + str(i_episode) + '_episodes.model')

        print('Training done!')
        self.target_net.save(sub_dir + 'final.model')
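
A minimal usage sketch for the class above (the subnet checkpoint paths are hypothetical, and game_name/dimensions are assumed to contain 'Breakout' and 'SpaceInvaders' entries, as map_action expects):

if __name__ == '__main__':
    # Game 2 drives the network's output size; game 1 receives the mapped actions
    agent = DoubleAgent(game1='Breakout', game2='SpaceInvaders',
                        pretrained_subnet1='subnet_breakout.model',        # hypothetical path
                        pretrained_subnet2='subnet_spaceinvaders.model',   # hypothetical path
                        frozen=True)
    agent.train()                               # logs to '<game1>+<game2>_<timestamp>/'
    # agent.play_stats(100, mode='evaluation')  # evaluate the trained net
    # agent.play(5)                             # render a few games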
Example #2
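This listing reuses the import header and tensor aliases sketched for Example #1, with the single-game model imported in place of the twin network (the module path is again an assumption):

from dqn_model import DQN
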
class SingleAgent(object):
    def __init__(self,
                 game,
                 mem_size = 1000000,
                 state_buffer_size = 4,
                 batch_size = 64,
                 learning_rate = 1e-5,
                 pretrained_model = None,
                 frameskip = 4
                 ):
        """
        Inputs:
        - game: string to select the game
        - mem_size: int length of the replay memory
        - state_buffer_size: int number of recent frames used as input for neural network
        - batch_size: int
        - learning_rate: float
        - pretrained_model: str path to the model
        - frameskip: int number of frames to skip per action
        """

        # Namestring
        self.game = game

        # Environment
        self.env = Environment(game_name[game], dimensions[game], frameskip=frameskip)

        # Cuda
        self.use_cuda = torch.cuda.is_available()

        # Neural network
        self.net = DQN(channels_in = state_buffer_size,
                       num_actions = self.env.get_number_of_actions())

        self.target_net = DQN(channels_in = state_buffer_size,
                       num_actions = self.env.get_number_of_actions())
        if self.use_cuda:
            self.net.cuda()
            self.target_net.cuda()

        if pretrained_model:
            self.net.load(pretrained_model)
            self.target_net.load(pretrained_model)
            self.pretrained_model = True
        else:
            self.pretrained_model = False

        # Optimizer
        self.learning_rate = learning_rate
        self.optimizer = optim.Adam(self.net.parameters(), lr=learning_rate)
        #self.optimizer = optim.RMSprop(self.net.parameters(), lr=learning_rate,alpha=0.95, eps=0.01)

        self.batch_size = batch_size
        self.optimize_each_k = 1
        self.update_target_net_each_k_steps = 10000
        self.noops_count = 0

        # Replay Memory (Long term memory)
        self.replay = ReplayMemory(capacity=mem_size, num_history_frames=state_buffer_size)
        self.mem_size = mem_size

        # Fill replay memory before training
        if not self.pretrained_model:
            self.start_train_after = 50000
        else:
            self.start_train_after = mem_size//2

        # Buffer for the most recent states (Short term memory)
        self.num_stored_frames = state_buffer_size

        # Steps
        self.steps = 0

        # Save net
        self.save_net_each_k_episodes = 500


    def select_action(self, observation, mode='train'):
        """
        Select a random action from the action space or the action
        proposed by the neural network, depending on epsilon

        Inputs:
        - observation: FloatTensor with the stacked input frames

        Returns:
        action: int
        """
        # Hyperparameters
        EPSILON_START = 1
        EPSILON_END = 0.1
        EPSILON_DECAY = 1000000
        EPSILON_PLAY = 0.01
        MAXNOOPS = 30

        # Decrease of epsilon value
        if not self.pretrained_model:
            #epsilon = EPSILON_END + (EPSILON_START - EPSILON_END) * \
            #                        np.exp(-1. * (self.steps-self.batch_size) / EPSILON_DECAY)
            epsilon = EPSILON_START - self.steps * (EPSILON_START - EPSILON_END) / EPSILON_DECAY
        elif mode=='play':
            epsilon = EPSILON_PLAY
        else:
            epsilon = EPSILON_END

        if epsilon < random():
            # Action according to neural net
            # Wrap tensor into variable
            state_variable = Variable(observation, volatile=True)

            # Evaluate network and return action with maximum of activation
            action = self.net(state_variable).data.max(1)[1].view(1,1)

            # Prevent stalling: force FIRE (action 1) after MAXNOOPS consecutive non-FIRE actions
            if action[0,0]!=1:
                self.noops_count += 1
                if self.noops_count == MAXNOOPS:
                    action[0,0] = 1
                    self.noops_count = 0
            else:
                self.noops_count = 0
        else:
            # Random action
            action = self.env.sample_action()
            action = LongTensor([[action]])

        return action


    def optimize(self, net_updates):
        """
        Optimizer function

        Inputs:
        - net_updates: int

        Returns:
        - loss: float, or None while the replay memory is still being filled
        """
        # Hyperparameter
        GAMMA = 0.99

        #   not enough memory yet
        if len(self.replay) < self.start_train_after:
            return

        # Sample a transition
        batch = self.replay.sampleTransition(self.batch_size)

        # Mask to indicate which states are not final (=done=game over)
        non_final_mask = ByteTensor(list(map(lambda ns: ns is not None, batch.next_state)))

        # Wrap tensors in variables
        state_batch = Variable(torch.cat(batch.state))
        action_batch = Variable(torch.cat(batch.action))
        reward_batch = Variable(torch.cat(batch.reward))
        non_final_next_states = Variable(torch.cat([ns for ns in batch.next_state if ns is not None]),
                                              volatile=True) # volatile==true prevents calculation of the derivative

        next_state_values = Variable(torch.zeros(self.batch_size).type(FloatTensor), volatile=False)

        if self.use_cuda:
            state_batch = state_batch.cuda()
            action_batch = action_batch.cuda()
            reward_batch = reward_batch.cuda()
            non_final_mask = non_final_mask.cuda()
            non_final_next_states = non_final_next_states.cuda()
            next_state_values = next_state_values.cuda()

        # Compute Q(s_t, a) - the self.net computes Q(s_t), then we select the
        # columns of actions taken
        state_action_values = self.net(state_batch).gather(1, action_batch)

        # Compute V(s_{t+1}) for all next states.
        next_max_values = self.target_net(non_final_next_states).detach().max(1)[0]
        next_state_values[non_final_mask]= next_max_values

        # Compute the expected Q values
        expected_state_action_values = (next_state_values * GAMMA) + reward_batch

        # Compute Huber loss
        loss = F.smooth_l1_loss(state_action_values, expected_state_action_values)

        self.optimizer.zero_grad()

        loss.backward()
        for param in self.net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

        if net_updates%self.update_target_net_each_k_steps==0:
            self.target_net.load_state_dict(self.net.state_dict())
            print('target_net update!')

        return loss.data.cpu().numpy()[0]


    def play(self, n):
        """
        Play a game with the current net and render it

        Inputs:
        - n: games to play
        """
        for i in range(n):
            done = False # games end indicator variable
            score = 0
            # Reset game
            screen = self.env.reset()

            # list of k last frames
            last_k_frames = []
            for j in range(self.num_stored_frames):
                last_k_frames.append(None)
                last_k_frames[j] = gray2pytorch(screen)

            # frame is saved as ByteTensor -> convert to gray value between 0 and 1
            state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0

            while not done:
                action = self.select_action(state, mode='play')[0,0]

                screen, reward, _, done, _ = self.env.step(action, mode='play')
                score += reward

                #   save latest frame, discard oldest
                for j in range(self.num_stored_frames-1):
                    last_k_frames[j] = last_k_frames[j+1]
                last_k_frames[self.num_stored_frames-1] = gray2pytorch(screen)

                # convert frames to range 0 to 1 again
                state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0
                self.state = state
            print('Game ({}/{}) - Final score {}: {}'.format(i+1, n, self.game, score))
        self.env.game.close()


    def play_stats(self, n_games, mode='random'):
        """
        Play N games randomly or evaluate a net and log results for statistics

        Input:
        - n_games: int Number of games to play
        - mode: str 'random' or 'evaluation'
        """
        # Subdirectory for logging
        sub_dir = mode + '_' + self.game + '/'
        if not os.path.exists(sub_dir):
            os.makedirs(sub_dir)

        # Store history
        reward_history = []
        reward_clamped_history = []

        # Number of actions to sample from
        n_actions = self.env.get_number_of_actions()

        for i_episode in range(1, n_games+1):
            # Reset game
            screen = self.env.reset()

            # Store screen
            if mode=='evaluation':
                # list of k last frames
                last_k_frames = []
                for j in range(self.num_stored_frames):
                    last_k_frames.append(None)
                    last_k_frames[j] = gray2pytorch(screen)
                # frame is saved as ByteTensor -> convert to gray value between 0 and 1
                state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0

            # games end indicator variable
            done = False

            # reset score to the initial number of lives, because every lost life adds -1
            total_reward = 0
            total_reward_clamped = self.env.get_lives()

            while not done:
                if mode=='random':
                    action = randrange(n_actions)
                elif mode=='evaluation':
                    action = self.select_action(state, mode='play')[0,0]

                screen, reward, reward_clamped, done, _ = self.env.step(action)
                total_reward += int(reward)
                total_reward_clamped += int(reward_clamped)

                if mode=='evaluation':
                    #   save latest frame, discard oldest
                    for j in range(self.num_stored_frames-1):
                        last_k_frames[j] = last_k_frames[j+1]
                    last_k_frames[self.num_stored_frames-1] = gray2pytorch(screen)

                    # convert frames to range 0 to 1 again
                    state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0



            # Print current result
            print('Episode: {:6}/{:6} |  '.format(i_episode, n_games),
                  'score: ({:4}/{:4})'.format(total_reward_clamped,total_reward))

            # Save rewards
            reward_history.append(total_reward)
            reward_clamped_history.append(total_reward_clamped)

        avg_reward = np.sum(reward_history)/len(reward_history)
        avg_reward_clamped = np.sum(reward_clamped_history)/len(reward_clamped_history)

        # Print final result
        print('\n\n=============================================\n' +
              'avg score after {:6} episodes: ({:.2f}/{:.2f})\n'.format(n_games, avg_reward_clamped, avg_reward))

        # Log results to files
        with open(sub_dir + mode + '.txt', 'w') as fp:
            fp.write('avg score after {:6} episodes: ({:.2f}/{:.2f})\n'.format(n_games, avg_reward_clamped, avg_reward))
        with open(sub_dir + mode + '_reward.pickle', 'wb') as fp:
            pickle.dump(reward_history, fp)
        with open(sub_dir + mode + '_reward_clamped.pickle', 'wb') as fp:
            pickle.dump(reward_clamped_history, fp)


    def train(self):
        """
        Train the agent
        """
        num_episodes = 100000
        net_updates = 0

        # Logging
        sub_dir = self.game + '_' + datetime.now().strftime('%Y%m%d_%H%M%S') + '/'
        os.makedirs(sub_dir)
        logfile = sub_dir + self.game + '_train.txt'
        loss_file = sub_dir + 'loss.pickle'
        reward_file = sub_dir + 'reward.pickle'
        reward_clamped_file = sub_dir + 'reward_clamped.pickle'
        log_avg_episodes = 50

        best_score = 0
        best_score_clamped = 0
        avg_score = 0
        avg_score_clamped = 0
        loss_history = []
        reward_history = []
        reward_clamped_history = []

        # Initialize logfile with header
        with open(logfile, 'w') as fp:
            fp.write(datetime.now().strftime('%Y-%m-%d %H:%M:%S') + '\n' +
                     'Trained game:                       ' + str(self.game) + '\n' +
                     'Learning rate:                      ' + str(self.learning_rate) + '\n' +
                     'Batch size:                         ' + str(self.batch_size) + '\n' +
                     'Memory size(replay):                ' + str(self.mem_size) + '\n' +
                     'Pretrained:                         ' + str(self.pretrained_model) + '\n' +
                     'Started training after k frames:    ' + str(self.start_train_after) + '\n' +
                     'Optimized after k frames:           ' + str(self.optimize_each_k) + '\n' +
                     'Target net update after k frames:   ' + str(self.update_target_net_each_k_steps) + '\n\n' +
                     '------------------------------------------------------' +
                     '--------------------------------------------------\n')

        print('Started training...\nLogging to', sub_dir)

        for i_episode in range(1,num_episodes):
            # reset game at the start of each episode
            screen = self.env.reset()

            # list of k last frames
            last_k_frames = []
            for j in range(self.num_stored_frames):
                last_k_frames.append(None)
                last_k_frames[j] = gray2pytorch(screen)

            if i_episode == 1:
                self.replay.pushFrame(last_k_frames[0].cpu())

            # frame is saved as ByteTensor -> convert to gray value between 0 and 1
            state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0

            done = False # games end indicator variable
            # reset score to the initial number of lives, because every lost life adds -1
            total_reward = 0
            total_reward_clamped = self.env.get_lives()

            # Loop over one game
            while not done:
                self.steps +=1

                action = self.select_action(state)

                # perform selected action on game
                screen, reward, reward_clamped, done, _ = self.env.step(action[0,0])
                total_reward += int(reward)
                total_reward_clamped += int(reward_clamped)

                # Wrap into tensor
                reward = torch.Tensor([reward_clamped])

                #   save latest frame, discard oldest
                for j in range(self.num_stored_frames-1):
                    last_k_frames[j] = last_k_frames[j+1]
                last_k_frames[self.num_stored_frames-1] = gray2pytorch(screen)

                # convert frames to range 0 to 1 again
                if not done:
                    next_state = torch.cat(last_k_frames,1).type(FloatTensor)/255.0
                else:
                    next_state = None

                # Store transition
                self.replay.pushFrame(last_k_frames[self.num_stored_frames - 1].cpu())
                self.replay.pushTransition((self.replay.getCurrentIndex()-1)%self.replay.capacity, action, reward, done)

                # only optimize every k-th step
                if self.steps%self.optimize_each_k == 0:
                    loss = self.optimize(net_updates)

                    # Logging
                    loss_history.append(loss)
                    #q_history.append(q_value)
                    #exp_q_history.append(exp_q_value)

                    net_updates += 1

                # set current state to next state to select next action
                if next_state is not None:
                    state = next_state

                if self.use_cuda:
                    state = state.cuda()

                # plays episode until there are no more lives left ( == done)
                if done:
                    break

            # Save rewards
            reward_history.append(total_reward)
            reward_clamped_history.append(total_reward_clamped)

            print('Episode: {:6} |  '.format(i_episode),
                  'steps {:8} |  '.format(self.steps),
                  'loss: {:.2E} |  '.format(loss if loss else 0),
                  'score: ({:4}/{:4}) |  '.format(total_reward_clamped,total_reward),
                  'best score: ({:4}/{:4}) |  '.format(best_score_clamped,best_score),
                  'replay size: {:7}'.format(len(self.replay)))

            avg_score_clamped += total_reward_clamped
            avg_score += total_reward
            if total_reward_clamped > best_score_clamped:
                best_score_clamped = total_reward_clamped
            if total_reward > best_score:
                best_score = total_reward

            if i_episode % log_avg_episodes == 0 and i_episode!=0:
                avg_score_clamped /= log_avg_episodes
                avg_score /= log_avg_episodes

                print('----------------------------------------------------------------'
                      '-----------------------------------------------------------------',
                      '\nLogging to file: \nEpisode: {:6}   '.format(i_episode),
                      'steps: {:8}   '.format(self.steps),
                      'avg on last {:4} games ({:6.1f}/{:6.1f})   '.format(log_avg_episodes, avg_score_clamped,avg_score),
                      'best score: ({:4}/{:4})'.format(best_score_clamped, best_score),
                      '\n---------------------------------------------------------------'
                      '------------------------------------------------------------------')
                # Logfile
                with open(logfile, 'a') as fp:
                    fp.write('Episode: {:6} |  '.format(i_episode) +
                             'steps: {:8} |  '.format(self.steps) +
                             'avg on last {:4} games ({:6.1f}/{:6.1f}) |  '.format(log_avg_episodes, avg_score_clamped,avg_score) +
                             'best score: ({:4}/{:4})\n'.format(best_score_clamped, best_score))
                # Dump loss & reward
                with open(loss_file, 'wb') as fp:
                    pickle.dump(loss_history, fp)
                with open(reward_file, 'wb') as fp:
                    pickle.dump(reward_history, fp)
                with open(reward_clamped_file, 'wb') as fp:
                    pickle.dump(reward_clamped_history, fp)

                avg_score_clamped = 0
                avg_score = 0

            if i_episode % self.save_net_each_k_episodes == 0:
                with open(logfile, 'a') as fp:
                    fp.write('Saved model at episode ' + str(i_episode) + '...\n')
                self.target_net.save(sub_dir + self.game + '-' + str(i_episode) + '_episodes.model')

        print('Training done!')
        self.target_net.save(sub_dir + self.game + '.model')
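
As with the twin agent, a minimal usage sketch (assuming game_name and dimensions contain an entry for 'Breakout'):

if __name__ == '__main__':
    agent = SingleAgent(game='Breakout', mem_size=1000000, batch_size=64)
    agent.train()                            # checkpoints go to '<game>_<timestamp>/'
    # agent.play_stats(100, mode='random')   # random-policy baseline
    # agent.play(3)                          # render a few games with the current net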
Example #3
class Agent(object):
    def __init__(
            self,
            game,
            mem_size=512 * 512,  #1024*512,
            state_buffer_size=4,
            batch_size=64,
            learning_rate=1e-5,
            pretrained_model=None,
            frameskip=4,  #1
            record=False):
        """
        Inputs:
        - game: string to select the game
        - mem_size: int length of the replay memory
        - state_buffer_size: int number of recent frames used as input for neural network
        - batch_size: int
        - learning_rate: float
        - pretrained_model: str path to the model
        - frameskip: int number of frames to skip per action
        - record: boolean to enable record option
        """

        # Namestring
        self.game = game

        # dimensions: tuple (h1,h2,w1,w2) with dimensions of the game (to crop borders)
        #if self.game == 'Breakout-v0':
        #    dimensions = (32, 195, 8, 152)
        #elif self.game == 'SpaceInvaders-v0':
        #    dimensions = (21, 195, 20, 141)
        #elif self.game == 'Assault-v0':
        #    dimensions = (50, 240, 5, 155)
        #elif self.game == 'Phoenix-v0':
        #    dimensions = (23, 183, 0, 160)
        #elif self.game == 'Skiing-v0':
        #    dimensions = (55, 202, 8, 152)
        #elif self.game == 'Enduro-v0':
        #    dimensions = (50, 154, 8, 160)
        #elif self.game == 'BeamRider-v0':
        #    dimensions = (32, 180, 9, 159)

        if self.game == 'BreakoutAndSpace':
            dimensions_break = (32, 195, 8, 152)
            dimensions_space = (21, 195, 20, 141)
        else:
            print(
                'Error! This version is for playing BreakOut and SpaceInvaders at the same time.'
            )

        # Environment
        self.env_break = Environment('BreakoutNoFrameskip-v4',
                                     dimensions_break,
                                     frameskip=frameskip)
        self.env_space = Environment('SpaceInvaders-v0',
                                     dimensions_space,
                                     frameskip=frameskip)

        # Cuda
        self.use_cuda = torch.cuda.is_available()

        # Neural network
        self.net = DQN(channels_in=state_buffer_size,
                       num_actions=self.env_space.get_number_of_actions())

        self.target_net = DQN(
            channels_in=state_buffer_size,
            num_actions=self.env_space.get_number_of_actions())

        if self.use_cuda:
            self.net.cuda()
            self.target_net.cuda()

        if pretrained_model:
            self.net.load(pretrained_model)
            self.target_net.load(pretrained_model)
            self.pretrained_model = True
        else:
            self.pretrained_model = False

        # Optimizer
        self.learning_rate = learning_rate
        self.optimizer = optim.Adam(self.net.parameters(), lr=learning_rate)
        #self.optimizer = optim.RMSprop(self.net.parameters(), lr = 0.00025,alpha=0.95, eps=0.01)

        self.batch_size = batch_size
        self.optimize_each_k = 4
        self.update_target_net_each_k_steps = 10000
        self.noops_count = 0

        # Replay Memory (Long term memory)
        self.replay = ReplayMemory(capacity=mem_size,
                                   num_history_frames=state_buffer_size)
        self.mem_size = mem_size

        # Fill replay memory before training
        if not self.pretrained_model:
            self.start_train_after = 25000
        else:
            self.start_train_after = mem_size // 2

        # Buffer for the most recent states (Short term memory)
        self.num_stored_frames = state_buffer_size

        # Steps
        self.steps = 0

        # Save net
        self.save_net_each_k_episodes = 500

    def select_action(self, observation, mode='train'):
        """
        Select a random action from the action space or the action
        proposed by the neural network, depending on epsilon

        Inputs:
        - observation: FloatTensor with the stacked input frames

        Returns:
        action: int
        """
        # Hyperparameters
        EPSILON_START = 1
        EPSILON_END = 0.1
        EPSILON_DECAY = 1000000
        MAXNOOPS = 30

        # Decrease of epsilon value
        if not self.pretrained_model:
            #epsilon = EPSILON_END + (EPSILON_START - EPSILON_END) * \
            #                        np.exp(-1. * (self.steps-self.batch_size) / EPSILON_DECAY)
            epsilon = EPSILON_START - self.steps * (
                EPSILON_START - EPSILON_END) / EPSILON_DECAY
        else:
            epsilon = EPSILON_END

        # Exploit (greedy net action) with probability 1 - epsilon; always act greedily in play mode
        if epsilon < random() or mode == 'play':
            # Action according to neural net
            # Wrap tensor into variable
            state_variable = Variable(observation, volatile=True)

            # Evaluate network and return action with maximum of activation
            action = self.net(state_variable).data.max(1)[1].view(1, 1)

            # Prevent noops
            if action[0, 0] == 0:
                self.noops_count += 1
                if self.noops_count == MAXNOOPS:
                    action[0, 0] = 1
                    self.noops_count = 0
            else:
                self.noops_count = 0
        else:

            # Random action
            action = self.env_space.sample_action()
            action = LongTensor([[action]])

        return action

    def optimize(self, net_updates):
        """
        Optimizer function

        Inputs:
        - net_updates: int

        Returns:
        - loss: float, or None while the replay memory is still being filled
        """
        # Hyperparameter
        GAMMA = 0.99

        #   not enough memory yet
        if len(self.replay) < self.start_train_after:
            return

        # Sample a transition

        batch = self.replay.sampleTransition(self.batch_size)
        # Mask to indicate which states are not final (=done=game over)
        non_final_mask = ByteTensor(
            list(map(lambda ns: ns is not None, batch.next_state)))

        # Wrap tensors in variables
        state_batch = Variable(torch.cat(batch.state))
        action_batch = Variable(torch.cat(batch.action))
        reward_batch = Variable(torch.cat(batch.reward))
        non_final_next_states = Variable(
            torch.cat([ns for ns in batch.next_state if ns is not None]),
            volatile=True
        )  # volatile==true prevents calculation of the derivative

        next_state_values = Variable(torch.zeros(
            self.batch_size).type(FloatTensor),
                                     volatile=False)

        if self.use_cuda:
            state_batch = state_batch.cuda()
            action_batch = action_batch.cuda()
            reward_batch = reward_batch.cuda()
            non_final_mask = non_final_mask.cuda()
            non_final_next_states = non_final_next_states.cuda()
            next_state_values = next_state_values.cuda()

        # Compute Q(s_t, a) - the self.net computes Q(s_t), then we select the
        # columns of actions taken
        state_action_values = self.net(state_batch).gather(1, action_batch)

        # Compute V(s_{t+1}) for all next states.
        next_max_values = self.target_net(non_final_next_states).detach().max(
            1)[0]
        next_state_values[non_final_mask] = next_max_values

        # Compute the expected Q values
        expected_state_action_values = (next_state_values *
                                        GAMMA) + reward_batch

        # Compute Huber loss
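        # (quadratic for small TD errors, linear for large ones, which limits the
        #  influence of outliers compared to a plain MSE loss)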
        loss = F.smooth_l1_loss(state_action_values,
                                expected_state_action_values)

        self.optimizer.zero_grad()

        loss.backward()
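        # Clamp every gradient component to [-1, 1] to stabilise training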
        for param in self.net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

        if net_updates % self.update_target_net_each_k_steps == 0:
            self.target_net.load_state_dict(self.net.state_dict())
            print('target_net update!')

        return loss.data.cpu().numpy()[0]

    def play(self):
        """
        Play a game with the current net and render it
        """
        done = False  # games end indicator variable
        score = 0
        # Reset game
        screen_break = self.env_break.reset()
        screen_space = self.env_space.reset()

        # list of k last frames
        last_k_frames = []
        for j in range(self.num_stored_frames):
            last_k_frames.append(None)
            last_k_frames[j] = torch.cat(
                (gray2pytorch(screen_break), gray2pytorch(screen_space)),
                dim=2)
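        # Each stored frame stacks both games along the height axis (dim=2);
        # assuming gray2pytorch returns a [1, 1, H, W] tensor, every entry is [1, 1, 2H, W]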

        # frame is saved as ByteTensor -> convert to gray value between 0 and 1
        state = torch.cat(last_k_frames, 1).type(FloatTensor) / 255.0

        while not done:
            action = self.select_action(state, mode='play')

            # Render game
            self.env_break.game.render(mode='human')
            self.env_space.game.render(mode='human')

            # Map Space Invaders actions to Breakout actions (shoot-right -> right, shoot-left -> left)
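            # (assuming the usual minimal Atari action sets: Space Invaders
            #  0=NOOP, 1=FIRE, 2=RIGHT, 3=LEFT, 4=RIGHTFIRE, 5=LEFTFIRE;
            #  Breakout 0=NOOP, 1=FIRE, 2=RIGHT, 3=LEFT)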
            if action[0, 0] == 4:
                action_break = 2
            elif action[0, 0] == 5:
                action_break = 3
            else:
                action_break = action[0, 0]

            screen_break, _, reward_break, done_break, info_break = self.env_break.step(
                action_break, mode='play')
            screen_space, _, reward_space, done_space, info_space = self.env_space.step(
                action[0, 0], mode='play')
            score += reward_break
            score += reward_space
            done = done_break or done_space

            #   save latest frame, discard oldest
            for j in range(self.num_stored_frames - 1):
                last_k_frames[j] = last_k_frames[j + 1]
            last_k_frames[self.num_stored_frames - 1] = torch.cat(
                (gray2pytorch(screen_break), gray2pytorch(screen_space)),
                dim=2)

            # convert frames to range 0 to 1 again
            state = torch.cat(last_k_frames, 1).type(FloatTensor) / 255.0

        print('Final score:', score)
        self.env_break.game.close()
        self.env_space.game.close()

    def train(self):
        """
        Train the agent
        """
        num_episodes = 100000
        net_updates = 0
        loss = None  # most recent optimisation loss; stays None until training starts

        # Logging
        sub_dir = self.game + '_' + datetime.now().strftime(
            '%Y%m%d_%H%M%S') + '/'
        os.makedirs(sub_dir)
        logfile = sub_dir + self.game + '_train.log'
        loss_file = sub_dir + 'loss.pickle'
        reward_file = sub_dir + 'reward.pickle'
        reward_clamped_file = sub_dir + 'reward_clamped.pickle'
        log_avg_episodes = 50

        best_score = 0
        best_score_clamped = 0
        avg_score = 0
        avg_score_clamped = 0
        loss_history = []
        reward_history = []
        reward_clamped_history = []

        # Initialize logfile with header
        with open(logfile, 'w') as fp:
            fp.write(
                datetime.now().strftime('%Y%m%d_%H%M%S') + '\n' +
                'Trained game:    ' + str(self.game) + '\n' +
                'Learning rate:    ' + str(self.learning_rate) + '\n' +
                'Batch size:    ' + str(self.batch_size) + '\n' +
                'Pretrained:    ' + str(self.pretrained_model) + '\n' +
                'Started training after k frames:    ' +
                str(self.start_train_after) + '\n' +
                'Optimized after k frames:    ' + str(self.optimize_each_k) +
                '\n' + 'Target net update after k frame:    ' +
                str(self.update_target_net_each_k_steps) + '\n\n' +
                '--------------------------------------------------------------------------------\n'
            )

        print('Started training...\nLogging to', sub_dir)

        for i_episode in range(1, num_episodes):

            # reset game at the start of each episode
            screen_break = self.env_break.reset()
            screen_space = self.env_space.reset()

            # list of k last frames
            last_k_frames_break = []
            last_k_frames_space = []
            for j in range(self.num_stored_frames):
                last_k_frames_break.append(None)
                last_k_frames_space.append(None)
                last_k_frames_break[j] = gray2pytorch(screen_break)
                last_k_frames_space[j] = gray2pytorch(screen_space)

            if i_episode == 1:
                frames_both = torch.cat((last_k_frames_break[0].cpu(),
                                         last_k_frames_space[0].cpu()), 2)
                self.replay.pushFrame(frames_both)

            # frame is saved as ByteTensor -> convert to gray value between 0 and 1
            state_break = torch.cat(last_k_frames_break,
                                    1).type(FloatTensor) / 255.0
            state_space = torch.cat(last_k_frames_space,
                                    1).type(FloatTensor) / 255.0
            state = torch.cat((state_break, state_space), 2)
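            # Same layout as in play(): channels are the k most recent time steps,
            # Breakout stacked on top of Space Invaders along the height axis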

            done = False  # games end indicator variable
            # reset score with initial lives, because every lost live adds -1
            total_reward = self.env_break.get_lives()
            total_reward += self.env_space.get_lives()
            total_reward_clamped = self.env_break.get_lives()
            total_reward_clamped += self.env_space.get_lives()

            # Loop over one game
            while not done:
                self.steps += 1
                action = self.select_action(state)
                # Perform the selected action in both games
                screen_space, _, reward_space, done_space, info_space = self.env_space.step(
                    action[0, 0])

                # Map Space Invaders actions to Breakout actions (shoot-right -> right, shoot-left -> left)
                action_break = action[0, 0]
                if action_break > 3:  # shoot+right/left --> right/left
                    action_break = action_break - 2
                screen_break, _, reward_break, done_break, info_break = self.env_break.step(
                    action_break)

                total_reward += int(reward_break)
                total_reward += int(reward_space)
                done = done_break or done_space

                #   clamp rewards
                reward_break = torch.Tensor([np.clip(reward_break, -1, 1)])
                reward_space = torch.Tensor([np.clip(reward_space, -1, 1)])
                reward = reward_break + reward_space
                total_reward_clamped += int(reward_break[0])
                total_reward_clamped += int(reward_space[0])

                #   save latest frame, discard oldest
                for j in range(self.num_stored_frames - 1):
                    last_k_frames_break[j] = last_k_frames_break[j + 1]
                    last_k_frames_space[j] = last_k_frames_space[j + 1]
                last_k_frames_break[self.num_stored_frames -
                                    1] = gray2pytorch(screen_break)
                last_k_frames_space[self.num_stored_frames -
                                    1] = gray2pytorch(screen_space)

                # convert frames to range 0 to 1 again
                if not done:
                    next_state_break = torch.cat(last_k_frames_break,
                                                 1).type(FloatTensor) / 255.0
                    next_state_space = torch.cat(last_k_frames_space,
                                                 1).type(FloatTensor) / 255.0
                    next_state = torch.cat(
                        (next_state_break, next_state_space), 2)
                else:
                    next_state = None

                # Store transition: the frames of both games are concatenated into a
                # single frame, the transition itself is stored once (experimental)
                frame_break = last_k_frames_break[self.num_stored_frames -
                                                  1].cpu()
                frame_space = last_k_frames_space[self.num_stored_frames -
                                                  1].cpu()
                frame_both = torch.cat((frame_break, frame_space), 2)
                self.replay.pushFrame(frame_both)
                self.replay.pushTransition(
                    (self.replay.getCurrentIndex() - 1) % self.replay.capacity,
                    action, reward, done)
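                # The transition presumably stores only the index of the frame just
                # pushed plus (action, reward, done), so full frames are not duplicated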

                # Only optimize every k-th step
                if self.steps % self.optimize_each_k == 0:
                    loss = self.optimize(net_updates)

                    # Logging (optimize() returns None while the replay memory is still filling up)
                    if loss is not None:
                        loss_history.append(loss)

                    net_updates += 1

                # set current state to next state to select next action
                if next_state is not None:
                    state = next_state

                if self.use_cuda:
                    state = state.cuda()

                # plays episode until there are no more lives left ( == done)
                if done:
                    break

            # Save rewards
            reward_history.append(total_reward)
            reward_clamped_history.append(total_reward_clamped)

            print(
                'Episode: {:6} |  '.format(i_episode),
                'steps {:8} |  '.format(self.steps),
                'loss: {:.2E} |  '.format(loss if loss else 0),
                'score: ({:4}/{:4}) |  '.format(total_reward_clamped,
                                                total_reward),
                'best score: ({:4}/{:4}) |  '.format(best_score_clamped,
                                                     best_score),
                'replay size: {:7}'.format(len(self.replay)))

            avg_score_clamped += total_reward_clamped
            avg_score += total_reward
            if total_reward_clamped > best_score_clamped:
                best_score_clamped = total_reward_clamped
            if total_reward > best_score:
                best_score = total_reward

            if i_episode % log_avg_episodes == 0 and i_episode != 0:
                avg_score_clamped /= log_avg_episodes
                avg_score /= log_avg_episodes

                print(
                    '----------------------------------------------------------------'
                    '-----------------------------------------------------------------',
                    '\nLogging to file: \nEpisode: {:6}   '.format(i_episode),
                    'steps: {:8}   '.format(self.steps),
                    'avg on last {:4} games ({:6.1f}/{:6.1f})   '.format(
                        log_avg_episodes, avg_score_clamped, avg_score),
                    'best score: ({:4}/{:4})'.format(best_score_clamped,
                                                     best_score),
                    '\n---------------------------------------------------------------'
                    '------------------------------------------------------------------'
                )
                # Logfile
                with open(logfile, 'a') as fp:
                    fp.write(
                        'Episode: {:6} |  '.format(i_episode) +
                        'steps: {:8} |  '.format(self.steps) +
                        'avg on last {:4} games ({:6.1f}/{:6.1f}) |  '.format(
                            log_avg_episodes, avg_score_clamped, avg_score) +
                        'best score: ({:4}/{:4})\n'.format(
                            best_score_clamped, best_score))
                # Dump loss & reward
                with open(loss_file, 'wb') as fp:
                    pickle.dump(loss_history, fp)
                with open(reward_file, 'wb') as fp:
                    pickle.dump(reward_history, fp)
                with open(reward_clamped_file, 'wb') as fp:
                    pickle.dump(reward_clamped_history, fp)

                avg_score_clamped = 0
                avg_score = 0

            if i_episode % self.save_net_each_k_episodes == 0:
                with open(logfile, 'a') as fp:
                    fp.write('Saved model at episode ' + str(i_episode) +
                             '...\n')
                self.target_net.save(sub_dir + self.game + '-' +
                                     str(i_episode) + '_episodes.model')

        print('Training done!')
        self.target_net.save(sub_dir + self.game + '.model')