Python NeuralNet.train примеры использования

Язык программирования: Python

Пространство имен/Пакет: ai.NeuralNet

Класс/Тип: NeuralNet

Метод/Функция: train

Примеров на hotexamples.com: 9

Python NeuralNet.train - 9 примеров найдено. Это лучшие примеры Python кода для ai.NeuralNet.NeuralNet.train, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

train(5)

NeuralNet(4)

predict(2)

get_weight_stats(1)

predict_best_action(1)

predict_rewards(1)

q_vals(1)

save_network(1)

save_state(1)

sync_with_host(1)

Пример #1

Показать файл

class Main:
    def __init__(self):
        self.minibatch_size = 32  # Given in the paper
        self.number_of_actions = 4  # XXX Game "Breakout" has 4 possible actions

        # Properties of the neural net which come from the paper
        self.nnet = NeuralNet([1, 4, 84, 84],
                              filter_shapes=[[16, 4, 8, 8], [32, 16, 4, 4]],
                              strides=[4, 2],
                              n_hidden=256,
                              n_out=self.number_of_actions)
        self.ale = ALE()
        self.frames_played = 0
        self.iterations_per_choice = 4
        self.games = games.Games()

    def compute_epsilon(self):
        """
        From the paper: "The behavior policy during training was epsilon-greedy
        with annealed linearly from 1 to 0.1 over the first million frames, and fixed at 0.1 thereafter."
        @param frames_played: How far are we with our learning?
        """
        return max(1 - 0.9 * (float(self.frames_played) / 1e6), 0.1)

    def play_game(self, training_callback):
        """Play  a game,  calling  training_callback every  iteration.
        Returns a history of the game."""
        game = games.Game(self.ale)
        while not game.game_over():
            if ((random.uniform(0, 1) < self.compute_epsilon())
                    or (game.number_frames() < 4)):
                action = random.choice(range(self.number_of_actions))
                print 'chose randomly', action,
            else:
                rewards = self.nnet.predict_rewards([game.last_state()])
                action = np.argmax(rewards)
                print 'chose q-func  ', action, 'based on rewards', rewards,
            self.frames_played += 1
            game.move(action)
            print 'reward', game.rewards[-1]
            if (self.frames_played % self.minibatch_size) == 0:
                training_callback()
        return game

    def play_games(self):
        while True:
            if self.games.number_unprocessed() < 20 * self.minibatch_size:
                training_callback = lambda: None
            else:
                training_callback = lambda: self.nnet.train(
                    self.games.get_minibatch(self.minibatch_size))
            print 'Currently', len(self.games.games) + 1, 'games'
            self.games.add_game(self.play_game(training_callback))

Пример #2

Показать файл

Файл: main.py Проект: RichardKelley/Replicating-DeepMind

class Main:
    # How many transitions to keep in memory?
    memory_size = 300000

    # Size of the mini-batch, 32 was given in the paper
    minibatch_size = 32

    # Number of possible actions in a given game, 4 for "Breakout"
    number_of_actions = 4

    # Size of one state is four 84x84 screens
    state_size = 4*84*84

    # Discount factor for future rewards
    discount_factor = 0.9

    # Memory itself
    memory = None

    # Neural net
    nnet = None

    # Communication with ALE
    ale = None

    def __init__(self):
        self.memory = MemoryD(self.memory_size)
        self.ale = ALE(self.memory)
        self.nnet = NeuralNet(self.state_size, self.number_of_actions, "ai/deepmind-layers.cfg", "ai/deepmind-params.cfg", "layer4")

    def compute_epsilon(self, frames_played):
        """
        From the paper: "The behavior policy during training was epsilon-greedy
        with annealed linearly from 1 to 0.1 over the first million frames, and fixed at 0.1 thereafter."
        @param frames_played: How far are we with our learning?
        """
        return max(1.0 - frames_played / (1000000 * 1.0), 0.1)

    def predict_best_action(self, last_state):
        assert last_state.shape[0] == self.state_size
        assert len(last_state.shape) == 1

        # last_state contains only one state, so we have to convert it into batch of size 1
        last_state.shape = (last_state.shape[0], 1)
        scores = self.nnet.predict(last_state)
        assert scores.shape[1] == self.number_of_actions

        self.output_file.write(str(scores).strip().replace(' ', ',')[2:-2] + '\n')
        self.output_file.flush()
        
        # return action (index) with maximum score
        return np.argmax(scores)

    def train_minibatch(self, minibatch):
        """
        Train function that transforms (state,action,reward,state) into (input, expected_output) for neural net
        and trains the network
        @param minibatch: list of arrays: prestates, actions, rewards, poststates
        """
        prestates = minibatch[0]
        actions = minibatch[1]
        rewards = minibatch[2]
        poststates = minibatch[3]

        assert prestates.shape[0] == self.state_size
        assert prestates.shape[1] == self.minibatch_size
        assert poststates.shape[0] == self.state_size
        assert poststates.shape[1] == self.minibatch_size
        assert actions.shape[0] == self.minibatch_size
        assert rewards.shape[0] == self.minibatch_size

        # predict scores for poststates
        post_scores = self.nnet.predict(poststates)
        assert post_scores.shape[0] == self.minibatch_size
        assert post_scores.shape[1] == self.number_of_actions

        # take maximum score of all actions
        max_scores = np.max(post_scores, axis=1)
        assert max_scores.shape[0] == self.minibatch_size
        assert len(max_scores.shape) == 1

        # predict scores for prestates, so we can keep scores for other actions unchanged
        scores = self.nnet.predict(prestates)
        assert scores.shape[0] == self.minibatch_size
        assert scores.shape[1] == self.number_of_actions

        # update the Q-values for the actions we actually performed
        for i, action in enumerate(actions):
            scores[i][action] = rewards[i] + self.discount_factor * max_scores[i]

        # we have to transpose prediction result, as train expects input in opposite order
        cost = self.nnet.train(prestates, scores.transpose().copy())
        return cost

    def play_games(self, n):
        """
        Main cycle: plays many games and many frames in each game. Also learning is performed.
        @param n: total number of games allowed to play
        """

        games_to_play = n
        games_played = 0
        frames_played = 0
        game_scores = []
        scores_file = open("../log/scores" + time.strftime("%Y-%m-%d-%H-%M") + ".txt", "w")
        self.output_file = open("../log/Q_history"+time.strftime("%Y-%m-%d-%H-%M")+".csv","w")

        # Play games until maximum number is reached
        while games_played < games_to_play:

            # Start a new game
            self.ale.new_game()
            print "starting game", games_played+1, "frames played so far:", frames_played
            game_score = 0
            self.nnet.epoch = games_played

            # Play until game is over
            while not self.ale.game_over:

                # Epsilon decreases over time
                epsilon = self.compute_epsilon(frames_played)

                # Before AI takes an action we must make sure it is safe for the human race
                if   injury_to_a_human_being    is not None:
                    raise Exception('The First Law of Robotics is violated!')
                elif conflict_with_orders_given is not None:
                    raise Exception('The Second Law of Robotics is violated!')
                elif threat_to_my_existence     is not None:
                    raise Exception('The Third Law of Robotics is violated!')

                # Some times random action is chosen
                if random.uniform(0, 1) < epsilon:
                    action = random.choice(range(self.number_of_actions))

                # Usually neural net chooses the best action
                else:
                    action = self.predict_best_action(self.memory.get_last_state())

                # Make the move
                reward = self.ale.move(action)
                game_score += reward

                # Store new information to memory
                self.ale.store_step(action)

                # Start a training session
                minibatch = self.memory.get_minibatch(self.minibatch_size)
                self.train_minibatch(minibatch)
                frames_played += 1

            # After "game over" increase the number of games played
            games_played += 1
            
            # Store game state every 100 games
            if games_played % 100 == 0:

                # Store state of the network as cpickle as Convnet does
                self.nnet.sync_with_host()
                self.nnet.save_state()
            
                # Store the weights and biases of all layers
                layers_list=["layer1","layer2","layer3","layer4"]
                layer_dict = {}
                for layer_name in layers_list:
                    w = m.nnet.layers[layer_name]["weights"][0].copy()
                    b = m.nnet.layers[layer_name]["biases"][0].copy()
                    layer_dict[layer_name] = {'weights': w, 'biases': b}
                filename = "../log/weights_at_" + str(games_played) + "_games.pkl"
                weights_file = open(filename, "wb")
                cPickle.dump(layer_dict, weights_file)
                weights_file.close()

            # write the game score to a file 
            scores_file.write(str(game_score)+"\n")
            scores_file.flush()

            # And do stuff after end game (store information, let ALE know etc)
            self.ale.end_game()

        print game_scores
        scores_file.close()

Пример #3

Показать файл

Файл: main.py Проект: Juxi/Replicating-DeepMind

class Main:
    # How many transitions to keep in memory?
    memory_size = 500000

    # Size of the mini-batch, 32 was given in the paper
    minibatch_size = 32

    # Number of possible actions in a given game, 6 for "Breakout"
    number_of_actions = 6

    # Size of one frame
    frame_size = 84*84

    # Size of one state is four 84x84 screens
    state_size = 4 * frame_size

    # Discount factor for future rewards
    discount_factor = 0.9

    # Exploration rate annealing speed
    epsilon_frames = 1000000.0

    # Epsilon during testing
    test_epsilon = 0.05

    # Total frames played, only incremented during training
    total_frames_trained = 0

    # Number of random states to use for calculating Q-values
    nr_random_states = 100

    # Random states that we use to calculate Q-values
    random_states = None

    # Memory itself
    memory = None

    # Neural net
    nnet = None

    # Communication with ALE
    ale = None

    def __init__(self):
        self.memory = MemoryD(self.memory_size)
        self.ale = ALE(self.memory)
        self.nnet = NeuralNet(self.state_size, self.number_of_actions, "ai/deepmind-layers.cfg", "ai/deepmind-params.cfg", "layer4")

    def compute_epsilon(self, frames_played):
        """
        From the paper: "The behavior policy during training was epsilon-greedy
        with annealed linearly from 1 to 0.1 over the first million frames, and fixed at 0.1 thereafter."
        @param frames_played: How far are we with our learning?
        """
        return max(1.0 - frames_played / self.epsilon_frames, 0.1)

    def predict_best_action(self, last_state):
        # last_state contains only one state, so we have to convert it into batch of size 1
        last_state.shape = (last_state.shape[0], 1)

        # use neural net to predict Q-values for all actions
        qvalues = self.nnet.predict(last_state)
        #print "Predicted action Q-values: ", qvalues

        # return action (index) with maximum Q-value
        return np.argmax(qvalues)

    def train_minibatch(self, minibatch):
        """
        Train function that transforms (state,action,reward,state) into (input, expected_output) for neural net
        and trains the network
        @param minibatch: list of arrays: prestates, actions, rewards, poststates
        """
        prestates, actions, rewards, poststates = minibatch

        # predict Q-values for prestates, so we can keep Q-values for other actions unchanged
        qvalues = self.nnet.predict(prestates)
        #print "Prestate q-values: ", qvalues[0,:]
        #print "Action was: %d, reward was %d" % (actions[0], rewards[0])

        # predict Q-values for poststates
        post_qvalues = self.nnet.predict(poststates)
        #print "Poststate q-values: ", post_qvalues[0,:]

        # take maximum Q-value of all actions
        max_qvalues = np.max(post_qvalues, axis = 1)

        # update the Q-values for the actions we actually performed
        for i, action in enumerate(actions):
            qvalues[i][action] = rewards[i] + self.discount_factor * max_qvalues[i]
        #print "Corrected q-values: ", qvalues[0,:]

        # we have to transpose prediction result, as train expects input in opposite order
        cost = self.nnet.train(prestates, qvalues.transpose().copy())

        #qvalues = self.nnet.predict(prestates)
        #print "After training: ", qvalues[0,:]

        return cost

    def play_games(self, nr_frames, train, epsilon = None):
        """
        Main cycle: starts a game and plays number of frames.
        @param nr_frames: total number of games allowed to play
        @param train: true or false, whether to do training or not
        @param epsilon: fixed epsilon, only used when not training
        """
        assert train or epsilon is not None

        frames_played = 0
        game_scores = []

        # Start a new game
        self.ale.new_game()
        game_score = 0

        # Play games until maximum number is reached
        while frames_played < nr_frames:

            # Epsilon decreases over time only when training
            if train:
                epsilon = self.compute_epsilon(self.total_frames_trained)
                #print "Current annealed epsilon is %f at %d frames" % (epsilon, self.total_frames_trained)

            # Some times random action is chosen
            if random.uniform(0, 1) < epsilon:
                action = random.choice(range(self.number_of_actions))
                #print "Chose random action %d" % action
            # Usually neural net chooses the best action
            else:
                action = self.predict_best_action(self.memory.get_last_state())
                #print "Neural net chose action %d" % int(action)

            # Make the move
            points = self.ale.move(action)
            if points > 0:
                print "    Got %d points" % points
            game_score += points
            frames_played += 1
            #print "Played frame %d" % frames_played

            # Only if training
            if train:
                # Store new information to memory
                self.ale.store_step(action)
                # Increase total frames only when training
                self.total_frames_trained += 1
                # Fetch random minibatch from memory
                minibatch = self.memory.get_minibatch(self.minibatch_size)
                # Train neural net with the minibatch
                self.train_minibatch(minibatch)
                #print "Trained minibatch of size %d" % self.minibatch_size

            # Play until game is over
            if self.ale.game_over:
                print "    Game over, score = %d" % game_score
                # After "game over" increase the number of games played
                game_scores.append(game_score);
                game_score = 0
                # And do stuff after end game
                self.ale.end_game()
                self.ale.new_game()

        # reset the game just in case
        self.ale.end_game()

        return game_scores

    def run(self, epochs, training_frames, testing_frames):
        # Open log files and write headers
        timestamp = time.strftime("%Y-%m-%d-%H-%M")
        log_train = open("../log/training_" + timestamp + ".csv", "w")
        log_train.write("epoch,nr_games,sum_score,average_score,nr_frames,total_frames_trained,epsilon,memory_size\n")
        log_test = open("../log/testing_" + timestamp + ".csv", "w")
        log_test.write("epoch,nr_games,sum_score,average_score,average_qvalue,nr_frames,epsilon,memory_size\n")
        log_train_scores = open("../log/training_scores_" + timestamp + ".txt", "w")
        log_test_scores = open("../log/testing_scores_" + timestamp + ".txt", "w")
        log_weights = open("../log/weights_" + timestamp + ".csv", "w")

        for epoch in range(1, epochs + 1):
            print "Epoch %d:" % epoch

            if training_frames > 0:
                # play number of frames with training and epsilon annealing
                print "  Training for %d frames" % training_frames
                training_scores = self.play_games(training_frames, train = True)

                # log training scores
                log_train_scores.write(NL.join(map(str, training_scores)) + NL)
                log_train_scores.flush()

                # log aggregated training data
                train_data = (epoch, len(training_scores), sum(training_scores), np.mean(training_scores), training_frames, self.total_frames_trained, self.compute_epsilon(self.total_frames_trained), self.memory.count)
                log_train.write(','.join(map(str, train_data)) + NL)
                log_train.flush()

                weights = self.nnet.get_weight_stats()
                if epoch == 1:
                    # write header
                    wlayers = []
                    for (layer, index) in weights:
                        wlayers.extend([layer, index, ''])
                    log_weights.write(','.join(wlayers) + NL)
                    wlabels = []
                    for (layer, index) in weights:
                        wlabels.extend(['weights', 'weightsInc', 'incRatio'])
                    log_weights.write(','.join(wlabels) + NL)
                wdata = []
                for w in weights.itervalues():
                    wdata.extend([str(w[0]), str(w[1]), str(w[1] / w[0] if w[0] > 0 else 0)])
                log_weights.write(','.join(wdata) + NL)
                log_weights.flush()

                # save network state
                self.nnet.save_network(epoch)
                print   # save_network()'s output doesn't include newline

            if testing_frames > 0:
                # play number of frames without training and without epsilon annealing
                print "  Testing for %d frames" % testing_frames
                testing_scores = self.play_games(testing_frames, train = False, epsilon = self.test_epsilon)

                # log testing scores
                log_test_scores.write(NL.join(map(str, testing_scores)) + NL)
                log_test_scores.flush()

                # Pick random states to calculate Q-values for
                if self.random_states is None and self.memory.count > self.nr_random_states:
                    print "  Picking %d random states for Q-values" % self.nr_random_states
                    self.random_states = self.memory.get_minibatch(self.nr_random_states)[0]

                # Do not calculate Q-values when mamory is empty
                if self.random_states is not None:
                    # calculate Q-values 
                    qvalues = self.nnet.predict(self.random_states)
                    assert qvalues.shape[0] == self.nr_random_states
                    assert qvalues.shape[1] == self.number_of_actions
                    max_qvalues = np.max(qvalues, axis = 1)
                    assert max_qvalues.shape[0] == self.nr_random_states
                    assert len(max_qvalues.shape) == 1
                    avg_qvalue = np.mean(max_qvalues)
                else:
                    avg_qvalue = 0

                # log aggregated testing data
                test_data = (epoch, len(testing_scores), sum(testing_scores), np.mean(testing_scores), avg_qvalue, testing_frames, self.test_epsilon, self.memory.count)
                log_test.write(','.join(map(str, test_data)) + NL)
                log_test.flush()

        log_train.close()
        log_test.close()
        log_train_scores.close()
        log_test_scores.close()
        log_weights.close()

Пример #4

Показать файл

Файл: main.py Проект: TongZZZ/Replicating-DeepMind

class Main:
    # How many transitions to keep in memory?
    memory_size = 100000

    # Memory itself
    memory = None

    # Neural net
    nnet = None

    # Communication with ALE
    ale = None

    # Size of the mini-batch which will be sent to learning in Theano
    minibatch_size = None

    # Number of possible actions in a given game
    number_of_actions = None

    def __init__(self):
        self.memory = MemoryD(self.memory_size)
        self.minibatch_size = 32  # Given in the paper
        self.number_of_actions = 4  # Game "Breakout" has 4 possible actions

        # Properties of the neural net which come from the paper
        self.nnet = NeuralNet([1, 4, 84, 84], filter_shapes=[[16, 4, 8, 8], [32, 16, 4, 4]],
                              strides=[4, 2], n_hidden=256, n_out=self.number_of_actions)
        self.ale = ALE(self.memory)

    def compute_epsilon(self, frames_played):
        """
        From the paper: "The behavior policy during training was epsilon-greedy
        with annealed linearly from 1 to 0.1 over the first million frames, and fixed at 0.1 thereafter."
        @param frames_played: How far are we with our learning?
        """
        return max(0.9 - frames_played / (self.memory_size * 1.0), 0.1)


    def play_games(self, n):
        """
        Main cycle: plays many games and many frames in each game. Also learning is performed.
        @param n: total number of games allowed to play
        """

        games_to_play = n
        games_played = 0
        frames_played = 0

        # Play games until maximum number is reached
        while games_played < games_to_play:
            # Start a new game
            self.ale.new_game()

            # Play until game is over
            while not self.ale.game_over:

                # Epsilon decreases over time
                epsilon = self.compute_epsilon(frames_played)
                #print "espilon is", epsilon
                # Before AI takes an action we must make sure it is safe for the human race
                if   injury_to_a_human_being    is not None:
                    raise Exception('The First Law of Robotics is violated!')
                elif conflict_with_orders_given is not None:
                    raise Exception('The Second Law of Robotics is violated!')
                elif threat_to_my_existence     is not None:
                    raise Exception('The Third Law of Robotics is violated!')

                # Some times random action is chosen
                if random.uniform(0, 1) < epsilon:
                    action = random.choice(range(self.number_of_actions))
                    #print "chose randomly ", action

                # Usually neural net chooses the best action
                else:
                    #print "chose by neural net"
                    action = self.nnet.predict_best_action([self.memory.get_last_state()])
                    print action

                # Make the move
                self.ale.move(action)

                # Store new information to memory
                self.ale.store_step(action)

                # Start a training session

                self.nnet.train(self.memory.get_minibatch(self.minibatch_size))
                frames_played += 1
            # After "game over" increase the number of games played
            games_played += 1

            # And do stuff after end game (store information, let ALE know etc)
            self.ale.end_game()

Пример #5

Показать файл

class Main:
    # How many transitions to keep in memory?
    memory_size = 500000

    # Size of the mini-batch, 32 was given in the paper
    minibatch_size = 32

    # Number of possible actions in a given game, 6 for "Breakout"
    number_of_actions = 6

    # Size of one frame
    frame_size = 84*84

    # Size of one state is four 84x84 screens
    state_size = 4 * frame_size

    # Discount factor for future rewards
    discount_factor = 0.9

    # Exploration rate annealing speed
    epsilon_frames = 1000000.0

    # Epsilon during testing
    test_epsilon = 0.05

    # Total frames played, only incremented during training
    total_frames_trained = 0

    # Number of random states to use for calculating Q-values
    nr_random_states = 100

    # Random states that we use to calculate Q-values
    random_states = None

    # Memory itself
    memory = None

    # Neural net
    nnet = None

    # Communication with ALE
    ale = None

    def __init__(self):
        self.memory = MemoryD(self.memory_size)
        self.ale = ALE(self.memory, display_screen="true", skip_frames=4, game_ROM='../libraries/ale/roms/breakout.bin')
        self.nnet = NeuralNet(self.state_size, self.number_of_actions, "ai/deepmind-layers.cfg", "ai/deepmind-params.cfg", "layer4")

    def compute_epsilon(self, frames_played):
        """
        From the paper: "The behavior policy during training was epsilon-greedy
        with annealed linearly from 1 to 0.1 over the first million frames, and fixed at 0.1 thereafter."
        @param frames_played: How far are we with our learning?
        """
        return max(1.0 - frames_played / self.epsilon_frames, 0.1)

    def predict_best_action(self, last_state):
        # last_state contains only one state, so we have to convert it into batch of size 1
        last_state.shape = (last_state.shape[0], 1)

        # use neural net to predict Q-values for all actions
        qvalues = self.nnet.predict(last_state)
        #print "Predicted action Q-values: ", qvalues

        # return action (index) with maximum Q-value
        return np.argmax(qvalues)

    def train_minibatch(self, minibatch):
        """
        Train function that transforms (state,action,reward,state) into (input, expected_output) for neural net
        and trains the network
        @param minibatch: list of arrays: prestates, actions, rewards, poststates
        """
        prestates, actions, rewards, poststates = minibatch

        # predict Q-values for prestates, so we can keep Q-values for other actions unchanged
        qvalues = self.nnet.predict(prestates)
        #print "Prestate q-values: ", qvalues[0,:]
        #print "Action was: %d, reward was %d" % (actions[0], rewards[0])

        # predict Q-values for poststates
        post_qvalues = self.nnet.predict(poststates)
        #print "Poststate q-values: ", post_qvalues[0,:]

        # take maximum Q-value of all actions
        max_qvalues = np.max(post_qvalues, axis = 1)

        # update the Q-values for the actions we actually performed
        for i, action in enumerate(actions):
            qvalues[i][action] = rewards[i] + self.discount_factor * max_qvalues[i]
        #print "Corrected q-values: ", qvalues[0,:]

        # we have to transpose prediction result, as train expects input in opposite order
        cost = self.nnet.train(prestates, qvalues.transpose().copy())

        #qvalues = self.nnet.predict(prestates)
        #print "After training: ", qvalues[0,:]

        return cost

    def play_games(self, nr_frames, train, epsilon = None):
        """
        Main cycle: starts a game and plays number of frames.
        @param nr_frames: total number of games allowed to play
        @param train: true or false, whether to do training or not
        @param epsilon: fixed epsilon, only used when not training
        """
        assert train or epsilon is not None

        frames_played = 0
        game_scores = []

        # Start a new game
        self.ale.new_game()
        game_score = 0

        # Play games until maximum number is reached
        while frames_played < nr_frames:

            # Epsilon decreases over time only when training
            if train:
                epsilon = self.compute_epsilon(self.total_frames_trained)
                #print "Current annealed epsilon is %f at %d frames" % (epsilon, self.total_frames_trained)

            # Some times random action is chosen
            if random.uniform(0, 1) < epsilon:
                action = random.choice(range(self.number_of_actions))
                #print "Chose random action %d" % action
            # Usually neural net chooses the best action
            else:
                action = self.predict_best_action(self.memory.get_last_state())
                #print "Neural net chose action %d" % int(action)

            # Make the move
            points = self.ale.move(action)
            if points > 0:
                print "    Got %d points" % points
            game_score += points
            frames_played += 1
            #print "Played frame %d" % frames_played

            # Only if training
            if train:
                # Store new information to memory
                self.ale.store_step(action)
                # Increase total frames only when training
                self.total_frames_trained += 1
                # Fetch random minibatch from memory
                minibatch = self.memory.get_minibatch(self.minibatch_size)
                # Train neural net with the minibatch
                self.train_minibatch(minibatch)
                #print "Trained minibatch of size %d" % self.minibatch_size

            # Play until game is over
            if self.ale.game_over:
                print "    Game over, score = %d" % game_score
                # After "game over" increase the number of games played
                game_scores.append(game_score);
                game_score = 0
                # And do stuff after end game
                self.ale.end_game()
                self.ale.new_game()

        # reset the game just in case
        self.ale.end_game()

        return game_scores

    def run(self, epochs, training_frames, testing_frames):
        # Open log files and write headers
        timestamp = time.strftime("%Y-%m-%d-%H-%M")
        log_train = open("../log/training_" + timestamp + ".csv", "w")
        log_train.write("epoch,nr_games,sum_score,average_score,nr_frames,total_frames_trained,epsilon,memory_size\n")
        log_test = open("../log/testing_" + timestamp + ".csv", "w")
        log_test.write("epoch,nr_games,sum_score,average_score,average_qvalue,nr_frames,epsilon,memory_size\n")
        log_train_scores = open("../log/training_scores_" + timestamp + ".txt", "w")
        log_test_scores = open("../log/testing_scores_" + timestamp + ".txt", "w")
        log_weights = open("../log/weights_" + timestamp + ".csv", "w")

        for epoch in range(1, epochs + 1):
            print "Epoch %d:" % epoch

            if training_frames > 0:
                # play number of frames with training and epsilon annealing
                print "  Training for %d frames" % training_frames
                training_scores = self.play_games(training_frames, train = True)

                # log training scores
                log_train_scores.write(NL.join(map(str, training_scores)) + NL)
                log_train_scores.flush()

                # log aggregated training data
                train_data = (epoch, len(training_scores), sum(training_scores), np.mean(training_scores), training_frames, self.total_frames_trained, self.compute_epsilon(self.total_frames_trained), self.memory.count)
                log_train.write(','.join(map(str, train_data)) + NL)
                log_train.flush()

                weights = self.nnet.get_weight_stats()
                if epoch == 1:
                    # write header
                    wlayers = []
                    for (layer, index) in weights:
                        wlayers.extend([layer, index, ''])
                    log_weights.write(','.join(wlayers) + NL)
                    wlabels = []
                    for (layer, index) in weights:
                        wlabels.extend(['weights', 'weightsInc', 'incRatio'])
                    log_weights.write(','.join(wlabels) + NL)
                wdata = []
                for w in weights.itervalues():
                    wdata.extend([str(w[0]), str(w[1]), str(w[1] / w[0] if w[0] > 0 else 0)])
                log_weights.write(','.join(wdata) + NL)
                log_weights.flush()

                # save network state
                self.nnet.save_network(epoch)
                print   # save_network()'s output doesn't include newline

            if testing_frames > 0:
                # play number of frames without training and without epsilon annealing
                print "  Testing for %d frames" % testing_frames
                testing_scores = self.play_games(testing_frames, train = False, epsilon = self.test_epsilon)

                # log testing scores
                log_test_scores.write(NL.join(map(str, testing_scores)) + NL)
                log_test_scores.flush()

                # Pick random states to calculate Q-values for
                if self.random_states is None and self.memory.count > self.nr_random_states:
                    print "  Picking %d random states for Q-values" % self.nr_random_states
                    self.random_states = self.memory.get_minibatch(self.nr_random_states)[0]

                # Do not calculate Q-values when mamory is empty
                if self.random_states is not None:
                    # calculate Q-values 
                    qvalues = self.nnet.predict(self.random_states)
                    assert qvalues.shape[0] == self.nr_random_states
                    assert qvalues.shape[1] == self.number_of_actions
                    max_qvalues = np.max(qvalues, axis = 1)
                    assert max_qvalues.shape[0] == self.nr_random_states
                    assert len(max_qvalues.shape) == 1
                    avg_qvalue = np.mean(max_qvalues)
                else:
                    avg_qvalue = 0

                # log aggregated testing data
                test_data = (epoch, len(testing_scores), sum(testing_scores), np.mean(testing_scores), avg_qvalue, testing_frames, self.test_epsilon, self.memory.count)
                log_test.write(','.join(map(str, test_data)) + NL)
                log_test.flush()

        log_train.close()
        log_test.close()
        log_train_scores.close()
        log_test_scores.close()
        log_weights.close()

Пример #6

Показать файл

Файл: main.py Проект: wqren/Replicating-DeepMind

class Main:
    # How many transitions to keep in memory?
    memory_size = 1000000

    # Size of the mini-batch, 32 was given in the paper
    minibatch_size = 32

    # Number of possible actions in a given game, 4 for "Breakout"
    number_of_actions = 4

    # Size of one frame
    frame_size = 84 * 84

    # How many frames form a history
    history_length = 4

    # Size of one state is four 84x84 screens
    state_size = history_length * frame_size

    # Discount factor for future rewards
    discount_factor = 0.9

    # How many frames to play to choose random frame
    init_frames = 1000

    # How many epochs to run
    epochs = 200

    # Number of frames to play during one training epoch
    training_frames = 50000

    # Number of frames to play during one testing epoch
    testing_frames = 10000

    # Exploration rate annealing speed
    epsilon_frames = 1000000.0

    # Total frames played, only incremented during training
    total_frames_trained = 0

    # Number of random states to use for calculating Q-values
    nr_random_states = 100

    # Random states that we use to calculate Q-values
    random_states = None

    # Memory itself
    memory = None

    # Neural net
    nnet = None

    # Communication with ALE
    ale = None

    def __init__(self):
        self.memory = MemoryD(self.memory_size)
        self.ale = ALE(self.memory)
        self.nnet = NeuralNet(self.state_size, self.number_of_actions,
                              "ai/deepmind-layers.cfg",
                              "ai/deepmind-params.cfg", "layer4")

    def compute_epsilon(self, frames_played):
        """
        From the paper: "The behavior policy during training was epsilon-greedy
        with annealed linearly from 1 to 0.1 over the first million frames, and fixed at 0.1 thereafter."
        @param frames_played: How far are we with our learning?
        """
        return max(1.0 - frames_played / self.epsilon_frames, 0.1)

    def predict_best_action(self, last_state):
        assert last_state.shape[0] == self.state_size
        assert len(last_state.shape) == 1

        # last_state contains only one state, so we have to convert it into batch of size 1
        last_state.shape = (last_state.shape[0], 1)
        qvalues = self.nnet.predict(last_state)
        assert qvalues.shape[0] == 1
        assert qvalues.shape[1] == self.number_of_actions
        #print "Predicted action Q-values: ", qvalues

        # return action (index) with maximum Q-value
        return np.argmax(qvalues)

    def train_minibatch(self, minibatch):
        """
        Train function that transforms (state,action,reward,state) into (input, expected_output) for neural net
        and trains the network
        @param minibatch: list of arrays: prestates, actions, rewards, poststates
        """
        prestates = minibatch[0]
        actions = minibatch[1]
        rewards = minibatch[2]
        poststates = minibatch[3]

        assert prestates.shape[0] == self.state_size
        assert prestates.shape[1] == self.minibatch_size
        assert poststates.shape[0] == self.state_size
        assert poststates.shape[1] == self.minibatch_size
        assert actions.shape[0] == self.minibatch_size
        assert rewards.shape[0] == self.minibatch_size

        # predict Q-values for poststates
        post_qvalues = self.nnet.predict(poststates)
        assert post_qvalues.shape[0] == self.minibatch_size
        assert post_qvalues.shape[1] == self.number_of_actions

        # take maximum Q-value of all actions
        max_qvalues = np.max(post_qvalues, axis=1)
        assert max_qvalues.shape[0] == self.minibatch_size
        assert len(max_qvalues.shape) == 1

        # predict Q-values for prestates, so we can keep Q-values for other actions unchanged
        qvalues = self.nnet.predict(prestates)
        assert qvalues.shape[0] == self.minibatch_size
        assert qvalues.shape[1] == self.number_of_actions

        # update the Q-values for the actions we actually performed
        for i, action in enumerate(actions):
            qvalues[i][
                action] = rewards[i] + self.discount_factor * max_qvalues[i]

        # we have to transpose prediction result, as train expects input in opposite order
        cost = self.nnet.train(prestates, qvalues.transpose().copy())
        return cost

    def play_games(self, nr_frames, train, epsilon):
        """
        Main cycle: starts a game and plays number of frames.
        @param nr_frames: total number of games allowed to play
        @param train: true or false, whether to do training or not
        @param epsilon: fixed epsilon, only used when not training
        """

        frames_played = 0
        game_scores = []

        # Start a new game
        self.ale.new_game()
        game_score = 0

        # Play games until maximum number is reached
        while frames_played < nr_frames:

            # Epsilon decreases over time only when training
            if train:
                epsilon = self.compute_epsilon(self.total_frames_trained)
                #print "Current annealed epsilon is %f at %d frames" % (epsilon, self.total_frames_trained)

            # Some times random action is chosen
            if random.uniform(0, 1) < epsilon:
                action = random.choice(range(self.number_of_actions))
                #print "Chose random action %d" % action
            # Usually neural net chooses the best action
            else:
                action = self.predict_best_action(self.memory.get_last_state())
                #print "Neural net chose action %d" % int(action)

            # Make the move
            reward = self.ale.move(action)
            if reward:
                print "    Got reward of %d!!!" % reward
                reward = 1
            game_score += reward
            frames_played += 1
            #print "Played frame %d" % frames_played

            # Store new information to memory
            self.ale.store_step(action)

            # Only if training
            if train:
                # Increase total frames only when training
                self.total_frames_trained += 1
                # Train neural net with random minibatch
                minibatch = self.memory.get_minibatch(self.minibatch_size)
                self.train_minibatch(minibatch)
                #print "Trained minibatch of size %d" % self.minibatch_size

            # Play until game is over
            if self.ale.game_over:
                print "   Game over!!! Score = %d" % game_score
                # After "game over" increase the number of games played
                game_scores.append(game_score)
                game_score = 0
                # And do stuff after end game
                self.ale.end_game()
                self.ale.new_game()

        # reset the game just in case
        self.ale.end_game()

        return game_scores

    def run(self):
        # Play number of random games and pick random states to calculate Q-values for
        print "Playing %d games with random policy" % self.init_frames
        self.play_games(self.init_frames, False, 1)
        self.random_states = self.memory.get_minibatch(
            self.nr_random_states)[0]

        # Open log file and write header
        log_file = open(
            "../log/scores" + time.strftime("%Y-%m-%d-%H-%M") + ".csv", "w")
        log_file.write(
            "epoch,nr_games,sum_score,average_score,nr_frames_tested,average_qvalue,total_frames_trained,epsilon,memory_size\n"
        )

        for epoch in range(1, self.epochs + 1):
            print "Epoch %d:" % epoch
            # play number of frames with training and epsilon annealing
            print "  Training for %d frames" % self.training_frames
            self.play_games(self.training_frames, True, None)
            # play number of frames without training and without epsilon annealing
            print "  Testing for %d frames" % self.testing_frames
            game_scores = self.play_games(self.testing_frames, False, 0.05)

            # calculate Q-values
            qvalues = self.nnet.predict(self.random_states)
            assert qvalues.shape[0] == self.nr_random_states
            assert qvalues.shape[1] == self.number_of_actions
            max_qvalues = np.max(qvalues, axis=1)
            assert max_qvalues.shape[0] == self.nr_random_states
            assert len(max_qvalues.shape) == 1
            avg_qvalue = np.mean(max_qvalues)

            # calculate average scores
            sum_score = sum(game_scores)
            nr_games = len(game_scores)
            avg_score = np.mean(game_scores)
            epsilon = self.compute_epsilon(self.total_frames_trained)

            # log average scores in file
            log_file.write(
                "%d,%d,%f,%f,%d,%f,%d,%f,%d\n" %
                (epoch, nr_games, sum_score, avg_score, self.testing_frames,
                 avg_qvalue, self.total_frames_trained, epsilon,
                 self.memory.count))
            log_file.flush()

        log_file.close()

Пример #7

Показать файл

class Main:
    # How many transitions to keep in memory?
    memory_size = 1000000

    # Size of the mini-batch, 32 was given in the paper
    minibatch_size = 32

    # Number of possible actions in a given game, 6 for "Breakout"
    number_of_actions = 18

    # Size of one frame
    frame_size = 80 * 80

    # Size of one state is four 80x80 screens
    state_size = 4 * frame_size

    # Discount factor for future rewards
    discount_factor = 0.9

    # Exploration rate annealing speed
    epsilon_frames = 1000000.0

    # Epsilon during testing
    test_epsilon = 0.05

    # Total frames played, only incremented during training
    total_frames_trained = 0

    # Number of random states to use for calculating Q-values
    nr_random_states = 1000

    # Random states that we use to calculate Q-values
    random_states = None

    # Memory itself
    memory = None

    # Neural net
    nnet = None

    # Communication with ALE
    ale = None

    # The last 4 frames the system has seen
    current_state = None

    def __init__(self):
        #self.memory = MemoryD(self.memory_size)
        self.memory = DataSet(80, 80, self.memory_size, 4)
        self.ale = ALE(display_screen="true",
                       skip_frames=4,
                       game_ROM='../libraries/ale/roms/breakout.bin')
        self.nnet = NeuralNet(self.state_size,
                              self.number_of_actions,
                              "ai/deepmind-layers.cfg",
                              "ai/deepmind-params.cfg",
                              "layer4",
                              discount_factor=self.discount_factor)
        #self.nnet = CNNQLearner(self.number_of_actions, 4, 80, 80, discount=self.discount_factor, learning_rate=.0001, batch_size=32, approximator='none')

    def compute_epsilon(self, frames_played):
        """
        From the paper: "The behavior policy during training was epsilon-greedy
        with annealed linearly from 1 to 0.1 over the first million frames, and fixed at 0.1 thereafter."
        @param frames_played: How far are we with our learning?
        """
        return max(0.9 - frames_played / self.epsilon_frames, 0.1)

    def predict_best_action(self, last_state):

        # Uncomment this to see the 4 images that go into q_vals function
        #a = np.hstack(last_state)
        #img = PIL.Image.fromarray(a)
        #img.convert('RGB').save('input_to_nnet.Qvals.png')

        # use neural net to predict Q-values for all actions
        qvalues = self.nnet.q_vals(last_state)
        print "Predicted action Q-values: ", qvalues, "\n best action is", np.argmax(
            qvalues)

        # return action (index) with maximum Q-value
        return np.argmax(qvalues)

    def train_minibatch(self, prestates, actions, rewards, poststates):
        """
        Train function that transforms (state,action,reward,state) into (input, expected_output) for neural net
        and trains the network
        @param minibatch: list of arrays: prestates, actions, rewards, poststates
        """

        cost = self.nnet.train(prestates, actions, rewards, poststates)
        #print "trained network, the network thinks cost is: ", type(cost), np.shape(cost), cost

        return cost

    def play_games(self, nr_frames, train, epsilon=None):
        """
        Main cycle: starts a game and plays number of frames.
        @param nr_frames: total number of games allowed to play
        @param train: true or false, whether to do training or not
        @param epsilon: fixed epsilon, only used when not training
        """
        assert train or epsilon is not None

        frames_played = 0
        game_scores = []

        # Start a new game
        last_frame = self.ale.new_game()

        # We need to initialize/update the current state
        self.current_state = [
            last_frame.copy(),
            last_frame.copy(),
            last_frame.copy(),
            last_frame.copy()
        ]

        game_score = 0

        # Play games until maximum number is reached
        while frames_played < nr_frames:

            # Epsilon decreases over time only when training
            if train:
                epsilon = self.compute_epsilon(self.total_frames_trained)

            # Some times random action is chosen
            if random.uniform(0, 1) < epsilon or frames_played < 4:
                action = random.choice(range(self.number_of_actions))

            # Usually neural net chooses the best action
            else:
                action = self.predict_best_action(self.current_state)

            # Make the move. Returns points received and the new state
            points, next_frame = self.ale.move(action)

            # Changing points to rewards
            if points > 0:
                print "    Got %d points" % points
                reward = 1
            else:
                reward = 0

            # Book keeping
            game_score += points
            frames_played += 1

            # We need to update the current state
            self.current_state = self.current_state[1:] + [next_frame]

            # Only if training
            if train:

                # Store new information to memory
                self.memory.add_sample(last_frame, action, reward,
                                       self.ale.game_over)
                last_frame = next_frame

                if self.memory.count >= self.minibatch_size:
                    # Fetch random minibatch from memory
                    prestates, actions, rewards, poststates, terminals = self.memory.get_minibatch(
                        self.minibatch_size)

                    # Uncomment this to save the minibatch as an image every time we train
                    #b = []
                    #for a in prestates:
                    #    b.append(np.hstack(a))
                    #c = np.vstack(b)
                    #img = PIL.Image.fromarray(c)
                    #img.convert("RGB").save("minibatch.png")

                    # Train neural net with the minibatch
                    self.train_minibatch(prestates, actions, rewards,
                                         poststates)

                    # Increase total frames only when training
                    self.total_frames_trained += 1

            # Play until game is over
            if self.ale.game_over:
                print "    Game over, score = %d" % game_score
                # After "game over" increase the number of games played
                game_scores.append(game_score)
                game_score = 0

                # And do stuff after end game
                self.ale.end_game()

                last_frame = self.ale.new_game()

                # We need to update the current state
                self.current_state = self.current_state[1:] + [last_frame]

        # reset the game just in case
        self.ale.end_game()

        return game_scores

    def run(self, epochs, training_frames, testing_frames):
        # Open log files and write headers
        timestamp = time.strftime("%Y-%m-%d-%H-%M")
        log_train = open("../log/training_" + timestamp + ".csv", "w")
        log_train.write(
            "epoch,nr_games,sum_score,average_score,nr_frames,total_frames_trained,epsilon,memory_size\n"
        )
        log_test = open("../log/testing_" + timestamp + ".csv", "w")
        log_test.write(
            "epoch,nr_games,sum_score,average_score,average_qvalue,nr_frames,epsilon,memory_size\n"
        )
        log_train_scores = open("../log/training_scores_" + timestamp + ".txt",
                                "w")
        log_test_scores = open("../log/testing_scores_" + timestamp + ".txt",
                               "w")
        log_weights = open("../log/weights_" + timestamp + ".csv", "w")

        for epoch in range(1, epochs + 1):
            print "Epoch %d:" % epoch

            if training_frames > 0:
                # play number of frames with training and epsilon annealing
                print "  Training for %d frames" % training_frames
                training_scores = self.play_games(training_frames, train=True)

                # log training scores
                log_train_scores.write(NL.join(map(str, training_scores)) + NL)
                log_train_scores.flush()

                # log aggregated training data
                train_data = (epoch, len(training_scores),
                              sum(training_scores), np.mean(training_scores),
                              training_frames, self.total_frames_trained,
                              self.compute_epsilon(self.total_frames_trained),
                              self.memory.count)
                log_train.write(','.join(map(str, train_data)) + NL)
                log_train.flush()

            if testing_frames > 0:
                # play number of frames without training and without epsilon annealing
                print "  Testing for %d frames" % testing_frames
                testing_scores = self.play_games(testing_frames,
                                                 train=False,
                                                 epsilon=self.test_epsilon)

                # log testing scores
                log_test_scores.write(NL.join(map(str, testing_scores)) + NL)
                log_test_scores.flush()

                # Pick random states to calculate Q-values for
                if self.random_states is None and self.memory.count > self.nr_random_states:
                    print "  Picking %d random states for Q-values" % self.nr_random_states
                    self.random_states = self.memory.get_minibatch(
                        self.nr_random_states)[0]

                # Do not calculate Q-values when memory is empty
                if self.random_states is not None:
                    # calculate Q-values
                    qvalues = []
                    for state in self.random_states:
                        qvalues.append(self.nnet.q_vals(state))
                    #assert qvalues.shape[0] == self.nr_random_states
                    #assert qvalues.shape[1] == self.number_of_actions
                    max_qvalues = np.max(qvalues, axis=1)
                    #assert max_qvalues.shape[0] == self.nr_random_states
                    #assert len(max_qvalues.shape) == 1
                    avg_qvalue = np.mean(max_qvalues)
                else:
                    avg_qvalue = 0

                # log aggregated testing data
                test_data = (epoch, len(testing_scores), sum(testing_scores),
                             np.mean(testing_scores), avg_qvalue,
                             testing_frames, self.test_epsilon,
                             self.memory.count)
                log_test.write(','.join(map(str, test_data)) + NL)
                log_test.flush()

        log_train.close()
        log_test.close()
        log_train_scores.close()
        log_test_scores.close()
        log_weights.close()

Пример #8

Показать файл

Файл: main.py Проект: wqren/Replicating-DeepMind

class Main:
    # How many transitions to keep in memory?
    memory_size = 1000000

    # Size of the mini-batch, 32 was given in the paper
    minibatch_size = 32

    # Number of possible actions in a given game, 4 for "Breakout"
    number_of_actions = 4

    # Size of one frame
    frame_size = 84*84

    # How many frames form a history
    history_length = 4

    # Size of one state is four 84x84 screens
    state_size = history_length * frame_size

    # Discount factor for future rewards
    discount_factor = 0.9

    # How many frames to play to choose random frame
    init_frames = 1000
    
    # How many epochs to run
    epochs = 200

    # Number of frames to play during one training epoch
    training_frames = 50000

    # Number of frames to play during one testing epoch
    testing_frames = 10000

    # Exploration rate annealing speed
    epsilon_frames = 1000000.0

    # Total frames played, only incremented during training
    total_frames_trained = 0

    # Number of random states to use for calculating Q-values
    nr_random_states = 100

    # Random states that we use to calculate Q-values
    random_states = None

    # Memory itself
    memory = None

    # Neural net
    nnet = None

    # Communication with ALE
    ale = None

    def __init__(self):
        self.memory = MemoryD(self.memory_size)
        self.ale = ALE(self.memory)
        self.nnet = NeuralNet(self.state_size, self.number_of_actions, "ai/deepmind-layers.cfg", "ai/deepmind-params.cfg", "layer4")

    def compute_epsilon(self, frames_played):
        """
        From the paper: "The behavior policy during training was epsilon-greedy
        with annealed linearly from 1 to 0.1 over the first million frames, and fixed at 0.1 thereafter."
        @param frames_played: How far are we with our learning?
        """
        return max(1.0 - frames_played / self.epsilon_frames, 0.1)

    def predict_best_action(self, last_state):
        assert last_state.shape[0] == self.state_size
        assert len(last_state.shape) == 1

        # last_state contains only one state, so we have to convert it into batch of size 1
        last_state.shape = (last_state.shape[0], 1)
        qvalues = self.nnet.predict(last_state)
        assert qvalues.shape[0] == 1
        assert qvalues.shape[1] == self.number_of_actions
        #print "Predicted action Q-values: ", qvalues

        # return action (index) with maximum Q-value
        return np.argmax(qvalues)

    def train_minibatch(self, minibatch):
        """
        Train function that transforms (state,action,reward,state) into (input, expected_output) for neural net
        and trains the network
        @param minibatch: list of arrays: prestates, actions, rewards, poststates
        """
        prestates = minibatch[0]
        actions = minibatch[1]
        rewards = minibatch[2]
        poststates = minibatch[3]

        assert prestates.shape[0] == self.state_size
        assert prestates.shape[1] == self.minibatch_size
        assert poststates.shape[0] == self.state_size
        assert poststates.shape[1] == self.minibatch_size
        assert actions.shape[0] == self.minibatch_size
        assert rewards.shape[0] == self.minibatch_size

        # predict Q-values for poststates
        post_qvalues = self.nnet.predict(poststates)
        assert post_qvalues.shape[0] == self.minibatch_size
        assert post_qvalues.shape[1] == self.number_of_actions

        # take maximum Q-value of all actions
        max_qvalues = np.max(post_qvalues, axis=1)
        assert max_qvalues.shape[0] == self.minibatch_size
        assert len(max_qvalues.shape) == 1

        # predict Q-values for prestates, so we can keep Q-values for other actions unchanged
        qvalues = self.nnet.predict(prestates)
        assert qvalues.shape[0] == self.minibatch_size
        assert qvalues.shape[1] == self.number_of_actions

        # update the Q-values for the actions we actually performed
        for i, action in enumerate(actions):
            qvalues[i][action] = rewards[i] + self.discount_factor * max_qvalues[i]

        # we have to transpose prediction result, as train expects input in opposite order
        cost = self.nnet.train(prestates, qvalues.transpose().copy())
        return cost

    def play_games(self, nr_frames, train, epsilon):
        """
        Main cycle: starts a game and plays number of frames.
        @param nr_frames: total number of games allowed to play
        @param train: true or false, whether to do training or not
        @param epsilon: fixed epsilon, only used when not training
        """

        frames_played = 0
        game_scores = []

        # Start a new game
        self.ale.new_game()
        game_score = 0

        # Play games until maximum number is reached
        while frames_played < nr_frames:

            # Epsilon decreases over time only when training
            if train:
                epsilon = self.compute_epsilon(self.total_frames_trained)
                #print "Current annealed epsilon is %f at %d frames" % (epsilon, self.total_frames_trained)

            # Some times random action is chosen
            if random.uniform(0, 1) < epsilon:
                action = random.choice(range(self.number_of_actions))
                #print "Chose random action %d" % action
            # Usually neural net chooses the best action
            else:
                action = self.predict_best_action(self.memory.get_last_state())
                #print "Neural net chose action %d" % int(action)

            # Make the move
            reward = self.ale.move(action)
            if reward:
                print "    Got reward of %d!!!" % reward
                reward = 1
            game_score += reward
            frames_played += 1
            #print "Played frame %d" % frames_played

            # Store new information to memory
            self.ale.store_step(action)

            # Only if training
            if train:
                # Increase total frames only when training
                self.total_frames_trained += 1
                # Train neural net with random minibatch
                minibatch = self.memory.get_minibatch(self.minibatch_size)
                self.train_minibatch(minibatch)
                #print "Trained minibatch of size %d" % self.minibatch_size

            # Play until game is over
            if self.ale.game_over:
                print "   Game over!!! Score = %d" % game_score
                # After "game over" increase the number of games played
                game_scores.append(game_score);
                game_score = 0
                # And do stuff after end game
                self.ale.end_game()
                self.ale.new_game()

        # reset the game just in case
        self.ale.end_game()

        return game_scores

    def run(self):
        # Play number of random games and pick random states to calculate Q-values for
        print "Playing %d games with random policy" % self.init_frames
        self.play_games(self.init_frames, False, 1)
        self.random_states = self.memory.get_minibatch(self.nr_random_states)[0]

        # Open log file and write header
        log_file = open("../log/scores" + time.strftime("%Y-%m-%d-%H-%M") + ".csv", "w")
        log_file.write("epoch,nr_games,sum_score,average_score,nr_frames_tested,average_qvalue,total_frames_trained,epsilon,memory_size\n")

        for epoch in range(1, self.epochs + 1):
            print "Epoch %d:" % epoch
            # play number of frames with training and epsilon annealing
            print "  Training for %d frames" % self.training_frames
            self.play_games(self.training_frames, True, None)
            # play number of frames without training and without epsilon annealing
            print "  Testing for %d frames" % self.testing_frames
            game_scores = self.play_games(self.testing_frames, False, 0.05)

            # calculate Q-values 
            qvalues = self.nnet.predict(self.random_states)
            assert qvalues.shape[0] == self.nr_random_states
            assert qvalues.shape[1] == self.number_of_actions
            max_qvalues = np.max(qvalues, axis=1)
            assert max_qvalues.shape[0] == self.nr_random_states
            assert len(max_qvalues.shape) == 1
            avg_qvalue = np.mean(max_qvalues)

            # calculate average scores
            sum_score = sum(game_scores)
            nr_games = len(game_scores)
            avg_score = np.mean(game_scores)
            epsilon = self.compute_epsilon(self.total_frames_trained)
            
            # log average scores in file
            log_file.write("%d,%d,%f,%f,%d,%f,%d,%f,%d\n" % (epoch, nr_games, sum_score, avg_score, self.testing_frames, avg_qvalue, self.total_frames_trained, epsilon, self.memory.count))
            log_file.flush()

        log_file.close()

Пример #9

Показать файл

class Main:
    # How many transitions to keep in memory?
    memory_size = 100000

    # Memory itself
    memory = None

    # Neural net
    nnet = None

    # Communication with ALE
    ale = None

    # Size of the mini-batch which will be sent to learning in Theano
    minibatch_size = None

    # Number of possible actions in a given game
    number_of_actions = None

    def __init__(self):
        self.memory = MemoryD(self.memory_size)
        self.minibatch_size = 32  # Given in the paper
        self.number_of_actions = 4  # Game "Breakout" has 4 possible actions

        # Properties of the neural net which come from the paper
        self.nnet = NeuralNet([1, 4, 84, 84],
                              filter_shapes=[[16, 4, 8, 8], [32, 16, 4, 4]],
                              strides=[4, 2],
                              n_hidden=256,
                              n_out=self.number_of_actions)
        self.ale = ALE(self.memory)

    def compute_epsilon(self, frames_played):
        """
        From the paper: "The behavior policy during training was epsilon-greedy
        with annealed linearly from 1 to 0.1 over the first million frames, and fixed at 0.1 thereafter."
        @param frames_played: How far are we with our learning?
        """
        return max(0.9 - frames_played / self.memory_size, 0.1)

    def play_games(self, n):
        """
        Main cycle: plays many games and many frames in each game. Also learning is performed.
        @param n: total number of games allowed to play
        """

        games_to_play = n
        games_played = 0
        frames_played = 0

        # Play games until maximum number is reached
        while games_played < games_to_play:
            # Start a new game
            self.ale.new_game()

            # Play until game is over
            while not self.ale.game_over:

                # Epsilon decreases over time
                epsilon = self.compute_epsilon(frames_played)

                # Some times random action is chosen
                if random.uniform(0, 1) < epsilon:
                    action = random.choice(range(self.number_of_actions))
                    print "chose randomly ", action

                # Usually neural net chooses the best action
                else:
                    print "chose by neural net"
                    action = self.nnet.predict_best_action(
                        [self.memory.get_last_state()])
                    print action

                # Make the move
                self.ale.move(action)

                # Store new information to memory
                self.ale.store_step(action)

                # Start a training session

                self.nnet.train(self.memory.get_minibatch(self.minibatch_size))

            # After "game over" increase the number of games played
            games_played += 1

            # And do stuff after end game (store information, let ALE know etc)
            self.ale.end_game()