Example #1
    def __init__(self):
        """
		configure
		@param memory: store tuning information
		@param enviroment: implement enviroment----filter
		@param networks: AI
		"""
        # configure the environment
        self.myfilter = Filter()

        # configure the networks
        self.net_models = Nnet()

        # configure the memory
        self.data_memory = MemoryD(self.memory_size, self.myfilter.state_size,
                                   self.state_nbr)
        """
		random demostration for calculate the avg qvaluse
		"""
        random_screws = np.zeros((self.random_nr, 2), dtype=np.float64)
        for i in range(self.random_nr):
            random_screws[i] = [
                random.uniform(self.myfilter.screw_min, self.myfilter.screw_max),
                random.uniform(self.myfilter.screw_min, self.myfilter.screw_max)
            ]

        # Fetch random states
        random_states = self.myfilter.new_tuning(random_screws)
        # dimensionality reduction
        self.dr_random_states = self.myfilter.dimreduction_pca(
            random_states.transpose())
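
A note on the snippet above: `Filter.dimreduction_pca` is not shown in this example. Assuming it wraps an ordinary principal-component analysis, a minimal stand-in using scikit-learn could look like the sketch below; the component count and array shapes are illustrative, not taken from the source.

import numpy as np
from sklearn.decomposition import PCA

def dimreduction_pca(states, n_components=32):
    # states: one row per sample, one column per state dimension
    pca = PCA(n_components=n_components)
    return pca.fit_transform(states)

# mirrors the example above: states arrive one column per sample,
# hence the transpose before the reduction
states = np.random.rand(100, 500)              # 500 samples of a 100-d state
dr_random_states = dimreduction_pca(states.transpose())
print(dr_random_states.shape)                  # (500, 32)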
Example #2
    def __init__(self):
        self.memory = MemoryD(self.memory_size)
        self.ale = ALE(display_screen="true",
                       skip_frames=4,
                       game_ROM='../libraries/ale/roms/breakout.bin')
        self.nnet = NeuralNet(self.state_size, self.number_of_actions,
                              "ai/deepmind-layers.cfg",
                              "ai/deepmind-params.cfg", "layer4")
Example #3
    def __init__(self):
        self.memory = MemoryD(self.memory_size)
        self.minibatch_size = 32  # Given in the paper
        self.number_of_actions = 4  # Game "Breakout" has 4 possible actions

        # Properties of the neural net which come from the paper
        self.nnet = NeuralNet([1, 4, 84, 84],
                              filter_shapes=[[16, 4, 8, 8], [32, 16, 4, 4]],
                              strides=[4, 2],
                              n_hidden=256,
                              n_out=self.number_of_actions)
        self.ale = ALE(self.memory)
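
The constructor arguments above encode the network from the DeepMind paper: two convolutional layers (16 filters of 8x8 with stride 4, then 32 filters of 4x4 with stride 2) followed by a 256-unit hidden layer. A quick sanity check of how a 4x84x84 input shrinks through valid (unpadded) convolutions:

def conv_out(size, filter_size, stride):
    # output width of a valid (unpadded) convolution
    return (size - filter_size) // stride + 1

s1 = conv_out(84, 8, 4)   # first conv layer: 84 -> 20
s2 = conv_out(s1, 4, 2)   # second conv layer: 20 -> 9
flat = 32 * s2 * s2       # 32 feature maps of 9x9 -> 2592 inputs to the hidden layer
print(s1, s2, flat)       # 20 9 2592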
Example #4
    def __init__(self):
        self.memory = MemoryD(self.memory_size)
        self.minibatch_size = 32  # Given in the paper
        self.number_of_actions = 4  # Game "Breakout" has 4 possible actions

        # Properties of the neural net which come from the paper
        self.nnet = NeuralNet([1, 4, 84, 84], filter_shapes=[[16, 4, 8, 8], [32, 16, 4, 4]],
                              strides=[4, 2], n_hidden=256, n_out=self.number_of_actions)
        self.ale = ALE(self.memory)
Example #5
    def __init__(self, game_name, run_id):

        self.number_of_actions = len(action_dict[game_name])
        valid_actions = action_dict[game_name]

        net.layers[-2] = dp.FullyConnected(n_output=self.number_of_actions,
                                           weights=dp.Parameter(
                                               dp.NormalFiller(sigma=0.1),
                                               weight_decay=0.004,
                                               monitor=False))

        self.memory = MemoryD(self.memory_size)

        self.ale = ALE(valid_actions,
                       run_id,
                       display_screen="false",
                       skip_frames=4,
                       game_ROM='ale/roms/' + game_name + '.bin')

        self.nnet = net
        self.q_values = []
        self.test_game_scores = []
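
`action_dict` and `net` are defined outside this snippet; `action_dict` evidently maps a game name to the ALE actions that are valid in that game. A hypothetical entry is sketched below; the specific codes are illustrative guesses, not taken from the source.

# hypothetical shape of action_dict (the codes are illustrative only)
action_dict = {
    'breakout': [0, 1, 3, 4],   # noop, fire, right, left
    'pong':     [0, 1, 3, 4],
}

number_of_actions = len(action_dict['breakout'])   # 4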
Example #6
    def __init__(self, game_name, run_id):

        self.number_of_actions = len(action_dict[game_name])
        valid_actions = action_dict[game_name]

        net.layers[-2] = dp.FullyConnected(n_output=self.number_of_actions,
            weights=dp.Parameter(dp.NormalFiller(sigma=0.1),
                                 weight_decay=0.004, 
                                 monitor=False))

        self.memory = MemoryD(self.memory_size)

        self.ale = ALE(valid_actions, 
            run_id, display_screen="false", 
            skip_frames=4, 
            game_ROM='ale/roms/'+game_name+'.bin')

        self.nnet = net
        self.q_values = []
        self.test_game_scores = []
Example #7
class Main:
    # How many transitions to keep in memory?
    memory_size = 500000

    # Size of the mini-batch, 32 was given in the paper
    minibatch_size = 32

    # Number of possible actions in a given game, 6 for "Breakout"
    number_of_actions = 6

    # Size of one frame
    frame_size = 84*84

    # Size of one state is four 84x84 screens
    state_size = 4 * frame_size

    # Discount factor for future rewards
    discount_factor = 0.9

    # Exploration rate annealing speed
    epsilon_frames = 1000000.0

    # Epsilon during testing
    test_epsilon = 0.05

    # Total frames played, only incremented during training
    total_frames_trained = 0

    # Number of random states to use for calculating Q-values
    nr_random_states = 100

    # Random states that we use to calculate Q-values
    random_states = None

    # Memory itself
    memory = None

    # Neural net
    nnet = None

    # Communication with ALE
    ale = None

    def __init__(self):
        self.memory = MemoryD(self.memory_size)
        self.ale = ALE(self.memory)
        self.nnet = NeuralNet(self.state_size, self.number_of_actions, "ai/deepmind-layers.cfg", "ai/deepmind-params.cfg", "layer4")

    def compute_epsilon(self, frames_played):
        """
        From the paper: "The behavior policy during training was epsilon-greedy
        with epsilon annealed linearly from 1 to 0.1 over the first million frames, and fixed at 0.1 thereafter."
        @param frames_played: How far are we with our learning?
        """
        return max(1.0 - frames_played / self.epsilon_frames, 0.1)

    def predict_best_action(self, last_state):
        # last_state contains only one state, so we have to convert it into batch of size 1
        last_state.shape = (last_state.shape[0], 1)

        # use neural net to predict Q-values for all actions
        qvalues = self.nnet.predict(last_state)
        #print "Predicted action Q-values: ", qvalues

        # return action (index) with maximum Q-value
        return np.argmax(qvalues)

    def train_minibatch(self, minibatch):
        """
        Train function that transforms (state,action,reward,state) into (input, expected_output) for neural net
        and trains the network
        @param minibatch: list of arrays: prestates, actions, rewards, poststates
        """
        prestates, actions, rewards, poststates = minibatch

        # predict Q-values for prestates, so we can keep Q-values for other actions unchanged
        qvalues = self.nnet.predict(prestates)
        #print "Prestate q-values: ", qvalues[0,:]
        #print "Action was: %d, reward was %d" % (actions[0], rewards[0])

        # predict Q-values for poststates
        post_qvalues = self.nnet.predict(poststates)
        #print "Poststate q-values: ", post_qvalues[0,:]

        # take maximum Q-value of all actions
        max_qvalues = np.max(post_qvalues, axis = 1)

        # update the Q-values for the actions we actually performed
        for i, action in enumerate(actions):
            qvalues[i][action] = rewards[i] + self.discount_factor * max_qvalues[i]
        #print "Corrected q-values: ", qvalues[0,:]

        # we have to transpose prediction result, as train expects input in opposite order
        cost = self.nnet.train(prestates, qvalues.transpose().copy())

        #qvalues = self.nnet.predict(prestates)
        #print "After training: ", qvalues[0,:]

        return cost

    def play_games(self, nr_frames, train, epsilon = None):
        """
        Main cycle: starts a game and plays number of frames.
        @param nr_frames: total number of frames allowed to play
        @param train: true or false, whether to do training or not
        @param epsilon: fixed epsilon, only used when not training
        """
        assert train or epsilon is not None

        frames_played = 0
        game_scores = []

        # Start a new game
        self.ale.new_game()
        game_score = 0

        # Play games until maximum number is reached
        while frames_played < nr_frames:

            # Epsilon decreases over time only when training
            if train:
                epsilon = self.compute_epsilon(self.total_frames_trained)
                #print "Current annealed epsilon is %f at %d frames" % (epsilon, self.total_frames_trained)

            # Sometimes a random action is chosen
            if random.uniform(0, 1) < epsilon:
                action = random.choice(range(self.number_of_actions))
                #print "Chose random action %d" % action
            # Usually neural net chooses the best action
            else:
                action = self.predict_best_action(self.memory.get_last_state())
                #print "Neural net chose action %d" % int(action)

            # Make the move
            points = self.ale.move(action)
            if points > 0:
                print "    Got %d points" % points
            game_score += points
            frames_played += 1
            #print "Played frame %d" % frames_played

            # Only if training
            if train:
                # Store new information to memory
                self.ale.store_step(action)
                # Increase total frames only when training
                self.total_frames_trained += 1
                # Fetch random minibatch from memory
                minibatch = self.memory.get_minibatch(self.minibatch_size)
                # Train neural net with the minibatch
                self.train_minibatch(minibatch)
                #print "Trained minibatch of size %d" % self.minibatch_size

            # Play until game is over
            if self.ale.game_over:
                print "    Game over, score = %d" % game_score
                # After "game over" increase the number of games played
                game_scores.append(game_score)
                game_score = 0
                # And do stuff after end game
                self.ale.end_game()
                self.ale.new_game()

        # reset the game just in case
        self.ale.end_game()

        return game_scores

    def run(self, epochs, training_frames, testing_frames):
        # Open log files and write headers
        timestamp = time.strftime("%Y-%m-%d-%H-%M")
        log_train = open("../log/training_" + timestamp + ".csv", "w")
        log_train.write("epoch,nr_games,sum_score,average_score,nr_frames,total_frames_trained,epsilon,memory_size\n")
        log_test = open("../log/testing_" + timestamp + ".csv", "w")
        log_test.write("epoch,nr_games,sum_score,average_score,average_qvalue,nr_frames,epsilon,memory_size\n")
        log_train_scores = open("../log/training_scores_" + timestamp + ".txt", "w")
        log_test_scores = open("../log/testing_scores_" + timestamp + ".txt", "w")
        log_weights = open("../log/weights_" + timestamp + ".csv", "w")

        for epoch in range(1, epochs + 1):
            print "Epoch %d:" % epoch

            if training_frames > 0:
                # play number of frames with training and epsilon annealing
                print "  Training for %d frames" % training_frames
                training_scores = self.play_games(training_frames, train = True)

                # log training scores
                log_train_scores.write(NL.join(map(str, training_scores)) + NL)
                log_train_scores.flush()

                # log aggregated training data
                train_data = (epoch, len(training_scores), sum(training_scores), np.mean(training_scores), training_frames, self.total_frames_trained, self.compute_epsilon(self.total_frames_trained), self.memory.count)
                log_train.write(','.join(map(str, train_data)) + NL)
                log_train.flush()

                weights = self.nnet.get_weight_stats()
                if epoch == 1:
                    # write header
                    wlayers = []
                    for (layer, index) in weights:
                        wlayers.extend([layer, index, ''])
                    log_weights.write(','.join(wlayers) + NL)
                    wlabels = []
                    for (layer, index) in weights:
                        wlabels.extend(['weights', 'weightsInc', 'incRatio'])
                    log_weights.write(','.join(wlabels) + NL)
                wdata = []
                for w in weights.itervalues():
                    wdata.extend([str(w[0]), str(w[1]), str(w[1] / w[0] if w[0] > 0 else 0)])
                log_weights.write(','.join(wdata) + NL)
                log_weights.flush()

                # save network state
                self.nnet.save_network(epoch)
                print   # save_network()'s output doesn't include newline

            if testing_frames > 0:
                # play number of frames without training and without epsilon annealing
                print "  Testing for %d frames" % testing_frames
                testing_scores = self.play_games(testing_frames, train = False, epsilon = self.test_epsilon)

                # log testing scores
                log_test_scores.write(NL.join(map(str, testing_scores)) + NL)
                log_test_scores.flush()

                # Pick random states to calculate Q-values for
                if self.random_states is None and self.memory.count > self.nr_random_states:
                    print "  Picking %d random states for Q-values" % self.nr_random_states
                    self.random_states = self.memory.get_minibatch(self.nr_random_states)[0]

                # Do not calculate Q-values when memory is empty
                if self.random_states is not None:
                    # calculate Q-values 
                    qvalues = self.nnet.predict(self.random_states)
                    assert qvalues.shape[0] == self.nr_random_states
                    assert qvalues.shape[1] == self.number_of_actions
                    max_qvalues = np.max(qvalues, axis = 1)
                    assert max_qvalues.shape[0] == self.nr_random_states
                    assert len(max_qvalues.shape) == 1
                    avg_qvalue = np.mean(max_qvalues)
                else:
                    avg_qvalue = 0

                # log aggregated testing data
                test_data = (epoch, len(testing_scores), sum(testing_scores), np.mean(testing_scores), avg_qvalue, testing_frames, self.test_epsilon, self.memory.count)
                log_test.write(','.join(map(str, test_data)) + NL)
                log_test.flush()

        log_train.close()
        log_test.close()
        log_train_scores.close()
        log_test_scores.close()
        log_weights.close()
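
`train_minibatch` above applies the standard Q-learning target: for each transition, only the Q-value of the action actually taken is moved toward reward + discount_factor * max(Q(poststate)), while the other actions keep their predicted values. A self-contained NumPy sketch of that update on toy data (the shapes and numbers are arbitrary):

import numpy as np

discount_factor = 0.9
qvalues = np.array([[1.0, 2.0, 3.0],        # Q(s, a) for a batch of 2 states
                    [0.5, 0.1, 0.2]])
post_qvalues = np.array([[0.0, 4.0, 1.0],   # Q(s', a) for the successor states
                         [2.0, 0.0, 0.0]])
actions = np.array([0, 2])                  # actions actually taken
rewards = np.array([1.0, 0.0])

max_qvalues = np.max(post_qvalues, axis=1)  # [4.0, 2.0]
for i, action in enumerate(actions):
    qvalues[i][action] = rewards[i] + discount_factor * max_qvalues[i]

print(qvalues)   # row 0 becomes [4.6, 2.0, 3.0], row 1 becomes [0.5, 0.1, 1.8]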
Example #8
class Main:
    # How many transitions to keep in memory?
    memory_size = 100000

    # Memory itself
    memory = None

    # Neural net
    nnet = None

    # Communication with ALE
    ale = None

    # Size of the mini-batch which will be sent to learning in Theano
    minibatch_size = None

    # Number of possible actions in a given game
    number_of_actions = None

    def __init__(self):
        self.memory = MemoryD(self.memory_size)
        self.minibatch_size = 32  # Given in the paper
        self.number_of_actions = 4  # Game "Breakout" has 4 possible actions

        # Properties of the neural net which come from the paper
        self.nnet = NeuralNet([1, 4, 84, 84], filter_shapes=[[16, 4, 8, 8], [32, 16, 4, 4]],
                              strides=[4, 2], n_hidden=256, n_out=self.number_of_actions)
        self.ale = ALE(self.memory)

    def compute_epsilon(self, frames_played):
        """
        From the paper: "The behavior policy during training was epsilon-greedy
        with epsilon annealed linearly from 1 to 0.1 over the first million frames, and fixed at 0.1 thereafter."
        @param frames_played: How far are we with our learning?
        """
        return max(0.9 - frames_played / (self.memory_size * 1.0), 0.1)


    def play_games(self, n):
        """
        Main cycle: plays many games and many frames in each game. Also learning is performed.
        @param n: total number of games allowed to play
        """

        games_to_play = n
        games_played = 0
        frames_played = 0

        # Play games until maximum number is reached
        while games_played < games_to_play:
            # Start a new game
            self.ale.new_game()

            # Play until game is over
            while not self.ale.game_over:

                # Epsilon decreases over time
                epsilon = self.compute_epsilon(frames_played)
                #print "espilon is", epsilon
                # Before AI takes an action we must make sure it is safe for the human race
                if   injury_to_a_human_being    is not None:
                    raise Exception('The First Law of Robotics is violated!')
                elif conflict_with_orders_given is not None:
                    raise Exception('The Second Law of Robotics is violated!')
                elif threat_to_my_existence     is not None:
                    raise Exception('The Third Law of Robotics is violated!')

                # Sometimes a random action is chosen
                if random.uniform(0, 1) < epsilon:
                    action = random.choice(range(self.number_of_actions))
                    #print "chose randomly ", action

                # Usually neural net chooses the best action
                else:
                    #print "chose by neural net"
                    action = self.nnet.predict_best_action([self.memory.get_last_state()])
                    print action

                # Make the move
                self.ale.move(action)

                # Store new information to memory
                self.ale.store_step(action)

                # Start a training session

                self.nnet.train(self.memory.get_minibatch(self.minibatch_size))
                frames_played += 1
            # After "game over" increase the number of games played
            games_played += 1

            # And do stuff after end game (store information, let ALE know etc)
            self.ale.end_game()
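
Note that `compute_epsilon` in this variant starts at 0.9 and anneals over `memory_size` frames, which does not match the one-million-frame schedule its docstring quotes. Evaluating both schedules at a few frame counts makes the difference concrete:

memory_size = 100000
epsilon_frames = 1000000.0

def epsilon_this_example(frames):
    return max(0.9 - frames / (memory_size * 1.0), 0.1)

def epsilon_from_paper(frames):
    return max(1.0 - frames / epsilon_frames, 0.1)

for frames in (0, 50000, 100000, 1000000):
    print(frames, epsilon_this_example(frames), epsilon_from_paper(frames))
# 0       -> 0.9   1.0
# 50000   -> 0.4   0.95
# 100000  -> 0.1   0.9
# 1000000 -> 0.1   0.1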
Example #9
    def __init__(self):
        self.memory = MemoryD(self.memory_size)
        self.ale = ALE(self.memory, display_screen="true", skip_frames=4, game_ROM='../libraries/ale/roms/breakout.bin')
        self.nnet = NeuralNet(self.state_size, self.number_of_actions, "ai/deepmind-layers.cfg", "ai/deepmind-params.cfg", "layer4")
Example #10
class Main:
    # How many transitions to keep in memory?
    memory_size = 500000

    # Size of the mini-batch, 32 was given in the paper
    minibatch_size = 32

    # Number of possible actions in a given game, 6 for "Breakout"
    number_of_actions = 6

    # Size of one frame
    frame_size = 84*84

    # Size of one state is four 84x84 screens
    state_size = 4 * frame_size

    # Discount factor for future rewards
    discount_factor = 0.9

    # Exploration rate annealing speed
    epsilon_frames = 1000000.0

    # Epsilon during testing
    test_epsilon = 0.05

    # Total frames played, only incremented during training
    total_frames_trained = 0

    # Number of random states to use for calculating Q-values
    nr_random_states = 100

    # Random states that we use to calculate Q-values
    random_states = None

    # Memory itself
    memory = None

    # Neural net
    nnet = None

    # Communication with ALE
    ale = None

    def __init__(self):
        self.memory = MemoryD(self.memory_size)
        self.ale = ALE(self.memory, display_screen="true", skip_frames=4, game_ROM='../libraries/ale/roms/breakout.bin')
        self.nnet = NeuralNet(self.state_size, self.number_of_actions, "ai/deepmind-layers.cfg", "ai/deepmind-params.cfg", "layer4")

    def compute_epsilon(self, frames_played):
        """
        From the paper: "The behavior policy during training was epsilon-greedy
        with epsilon annealed linearly from 1 to 0.1 over the first million frames, and fixed at 0.1 thereafter."
        @param frames_played: How far are we with our learning?
        """
        return max(1.0 - frames_played / self.epsilon_frames, 0.1)

    def predict_best_action(self, last_state):
        # last_state contains only one state, so we have to convert it into batch of size 1
        last_state.shape = (last_state.shape[0], 1)

        # use neural net to predict Q-values for all actions
        qvalues = self.nnet.predict(last_state)
        #print "Predicted action Q-values: ", qvalues

        # return action (index) with maximum Q-value
        return np.argmax(qvalues)

    def train_minibatch(self, minibatch):
        """
        Train function that transforms (state,action,reward,state) into (input, expected_output) for neural net
        and trains the network
        @param minibatch: list of arrays: prestates, actions, rewards, poststates
        """
        prestates, actions, rewards, poststates = minibatch

        # predict Q-values for prestates, so we can keep Q-values for other actions unchanged
        qvalues = self.nnet.predict(prestates)
        #print "Prestate q-values: ", qvalues[0,:]
        #print "Action was: %d, reward was %d" % (actions[0], rewards[0])

        # predict Q-values for poststates
        post_qvalues = self.nnet.predict(poststates)
        #print "Poststate q-values: ", post_qvalues[0,:]

        # take maximum Q-value of all actions
        max_qvalues = np.max(post_qvalues, axis = 1)

        # update the Q-values for the actions we actually performed
        for i, action in enumerate(actions):
            qvalues[i][action] = rewards[i] + self.discount_factor * max_qvalues[i]
        #print "Corrected q-values: ", qvalues[0,:]

        # we have to transpose prediction result, as train expects input in opposite order
        cost = self.nnet.train(prestates, qvalues.transpose().copy())

        #qvalues = self.nnet.predict(prestates)
        #print "After training: ", qvalues[0,:]

        return cost

    def play_games(self, nr_frames, train, epsilon = None):
        """
        Main cycle: starts a game and plays number of frames.
        @param nr_frames: total number of frames allowed to play
        @param train: true or false, whether to do training or not
        @param epsilon: fixed epsilon, only used when not training
        """
        assert train or epsilon is not None

        frames_played = 0
        game_scores = []

        # Start a new game
        self.ale.new_game()
        game_score = 0

        # Play games until maximum number is reached
        while frames_played < nr_frames:

            # Epsilon decreases over time only when training
            if train:
                epsilon = self.compute_epsilon(self.total_frames_trained)
                #print "Current annealed epsilon is %f at %d frames" % (epsilon, self.total_frames_trained)

            # Sometimes a random action is chosen
            if random.uniform(0, 1) < epsilon:
                action = random.choice(range(self.number_of_actions))
                #print "Chose random action %d" % action
            # Usually neural net chooses the best action
            else:
                action = self.predict_best_action(self.memory.get_last_state())
                #print "Neural net chose action %d" % int(action)

            # Make the move
            points = self.ale.move(action)
            if points > 0:
                print "    Got %d points" % points
            game_score += points
            frames_played += 1
            #print "Played frame %d" % frames_played

            # Only if training
            if train:
                # Store new information to memory
                self.ale.store_step(action)
                # Increase total frames only when training
                self.total_frames_trained += 1
                # Fetch random minibatch from memory
                minibatch = self.memory.get_minibatch(self.minibatch_size)
                # Train neural net with the minibatch
                self.train_minibatch(minibatch)
                #print "Trained minibatch of size %d" % self.minibatch_size

            # Play until game is over
            if self.ale.game_over:
                print "    Game over, score = %d" % game_score
                # After "game over" increase the number of games played
                game_scores.append(game_score)
                game_score = 0
                # And do stuff after end game
                self.ale.end_game()
                self.ale.new_game()

        # reset the game just in case
        self.ale.end_game()

        return game_scores

    def run(self, epochs, training_frames, testing_frames):
        # Open log files and write headers
        timestamp = time.strftime("%Y-%m-%d-%H-%M")
        log_train = open("../log/training_" + timestamp + ".csv", "w")
        log_train.write("epoch,nr_games,sum_score,average_score,nr_frames,total_frames_trained,epsilon,memory_size\n")
        log_test = open("../log/testing_" + timestamp + ".csv", "w")
        log_test.write("epoch,nr_games,sum_score,average_score,average_qvalue,nr_frames,epsilon,memory_size\n")
        log_train_scores = open("../log/training_scores_" + timestamp + ".txt", "w")
        log_test_scores = open("../log/testing_scores_" + timestamp + ".txt", "w")
        log_weights = open("../log/weights_" + timestamp + ".csv", "w")

        for epoch in range(1, epochs + 1):
            print "Epoch %d:" % epoch

            if training_frames > 0:
                # play number of frames with training and epsilon annealing
                print "  Training for %d frames" % training_frames
                training_scores = self.play_games(training_frames, train = True)

                # log training scores
                log_train_scores.write(NL.join(map(str, training_scores)) + NL)
                log_train_scores.flush()

                # log aggregated training data
                train_data = (epoch, len(training_scores), sum(training_scores), np.mean(training_scores), training_frames, self.total_frames_trained, self.compute_epsilon(self.total_frames_trained), self.memory.count)
                log_train.write(','.join(map(str, train_data)) + NL)
                log_train.flush()

                weights = self.nnet.get_weight_stats()
                if epoch == 1:
                    # write header
                    wlayers = []
                    for (layer, index) in weights:
                        wlayers.extend([layer, index, ''])
                    log_weights.write(','.join(wlayers) + NL)
                    wlabels = []
                    for (layer, index) in weights:
                        wlabels.extend(['weights', 'weightsInc', 'incRatio'])
                    log_weights.write(','.join(wlabels) + NL)
                wdata = []
                for w in weights.itervalues():
                    wdata.extend([str(w[0]), str(w[1]), str(w[1] / w[0] if w[0] > 0 else 0)])
                log_weights.write(','.join(wdata) + NL)
                log_weights.flush()

                # save network state
                self.nnet.save_network(epoch)
                print   # save_network()'s output doesn't include newline

            if testing_frames > 0:
                # play number of frames without training and without epsilon annealing
                print "  Testing for %d frames" % testing_frames
                testing_scores = self.play_games(testing_frames, train = False, epsilon = self.test_epsilon)

                # log testing scores
                log_test_scores.write(NL.join(map(str, testing_scores)) + NL)
                log_test_scores.flush()

                # Pick random states to calculate Q-values for
                if self.random_states is None and self.memory.count > self.nr_random_states:
                    print "  Picking %d random states for Q-values" % self.nr_random_states
                    self.random_states = self.memory.get_minibatch(self.nr_random_states)[0]

                # Do not calculate Q-values when memory is empty
                if self.random_states is not None:
                    # calculate Q-values 
                    qvalues = self.nnet.predict(self.random_states)
                    assert qvalues.shape[0] == self.nr_random_states
                    assert qvalues.shape[1] == self.number_of_actions
                    max_qvalues = np.max(qvalues, axis = 1)
                    assert max_qvalues.shape[0] == self.nr_random_states
                    assert len(max_qvalues.shape) == 1
                    avg_qvalue = np.mean(max_qvalues)
                else:
                    avg_qvalue = 0

                # log aggregated testing data
                test_data = (epoch, len(testing_scores), sum(testing_scores), np.mean(testing_scores), avg_qvalue, testing_frames, self.test_epsilon, self.memory.count)
                log_test.write(','.join(map(str, test_data)) + NL)
                log_test.flush()

        log_train.close()
        log_test.close()
        log_train_scores.close()
        log_test_scores.close()
        log_weights.close()
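
Examples 7 and 10 log per-epoch statistics to CSV files with the headers written in `run()`. A small sketch of post-processing such a log with pandas; the timestamp in the file name is a placeholder for whatever `run()` generated:

import pandas as pd

log = pd.read_csv("../log/training_2014-01-01-12-00.csv")

# average score and annealed epsilon per epoch
print(log[["epoch", "average_score", "epsilon"]])

# epochs where the average score improved on the previous one
improved = log["average_score"].diff() > 0
print(log.loc[improved, "epoch"].tolist())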
Example #11
class Main:
    # How many transitions to keep in memory?
    memory_size = 300000

    # Size of the mini-batch, 32 was given in the paper
    minibatch_size = 32

    # Number of possible actions in a given game, 4 for "Breakout"
    number_of_actions = 4

    # Size of one state is four 84x84 screens
    state_size = 4*84*84

    # Discount factor for future rewards
    discount_factor = 0.9

    # Memory itself
    memory = None

    # Neural net
    nnet = None

    # Communication with ALE
    ale = None

    def __init__(self):
        self.memory = MemoryD(self.memory_size)
        self.ale = ALE(self.memory)
        self.nnet = NeuralNet(self.state_size, self.number_of_actions, "ai/deepmind-layers.cfg", "ai/deepmind-params.cfg", "layer4")

    def compute_epsilon(self, frames_played):
        """
        From the paper: "The behavior policy during training was epsilon-greedy
        with epsilon annealed linearly from 1 to 0.1 over the first million frames, and fixed at 0.1 thereafter."
        @param frames_played: How far are we with our learning?
        """
        return max(1.0 - frames_played / (1000000 * 1.0), 0.1)

    def predict_best_action(self, last_state):
        assert last_state.shape[0] == self.state_size
        assert len(last_state.shape) == 1

        # last_state contains only one state, so we have to convert it into batch of size 1
        last_state.shape = (last_state.shape[0], 1)
        scores = self.nnet.predict(last_state)
        assert scores.shape[1] == self.number_of_actions

        self.output_file.write(str(scores).strip().replace(' ', ',')[2:-2] + '\n')
        self.output_file.flush()
        
        # return action (index) with maximum score
        return np.argmax(scores)

    def train_minibatch(self, minibatch):
        """
        Train function that transforms (state,action,reward,state) into (input, expected_output) for neural net
        and trains the network
        @param minibatch: list of arrays: prestates, actions, rewards, poststates
        """
        prestates = minibatch[0]
        actions = minibatch[1]
        rewards = minibatch[2]
        poststates = minibatch[3]

        assert prestates.shape[0] == self.state_size
        assert prestates.shape[1] == self.minibatch_size
        assert poststates.shape[0] == self.state_size
        assert poststates.shape[1] == self.minibatch_size
        assert actions.shape[0] == self.minibatch_size
        assert rewards.shape[0] == self.minibatch_size

        # predict scores for poststates
        post_scores = self.nnet.predict(poststates)
        assert post_scores.shape[0] == self.minibatch_size
        assert post_scores.shape[1] == self.number_of_actions

        # take maximum score of all actions
        max_scores = np.max(post_scores, axis=1)
        assert max_scores.shape[0] == self.minibatch_size
        assert len(max_scores.shape) == 1

        # predict scores for prestates, so we can keep scores for other actions unchanged
        scores = self.nnet.predict(prestates)
        assert scores.shape[0] == self.minibatch_size
        assert scores.shape[1] == self.number_of_actions

        # update the Q-values for the actions we actually performed
        for i, action in enumerate(actions):
            scores[i][action] = rewards[i] + self.discount_factor * max_scores[i]

        # we have to transpose prediction result, as train expects input in opposite order
        cost = self.nnet.train(prestates, scores.transpose().copy())
        return cost

    def play_games(self, n):
        """
        Main cycle: plays many games and many frames in each game. Also learning is performed.
        @param n: total number of games allowed to play
        """

        games_to_play = n
        games_played = 0
        frames_played = 0
        game_scores = []
        scores_file = open("../log/scores" + time.strftime("%Y-%m-%d-%H-%M") + ".txt", "w")
        self.output_file = open("../log/Q_history"+time.strftime("%Y-%m-%d-%H-%M")+".csv","w")

        # Play games until maximum number is reached
        while games_played < games_to_play:

            # Start a new game
            self.ale.new_game()
            print "starting game", games_played+1, "frames played so far:", frames_played
            game_score = 0
            self.nnet.epoch = games_played

            # Play until game is over
            while not self.ale.game_over:

                # Epsilon decreases over time
                epsilon = self.compute_epsilon(frames_played)

                # Before AI takes an action we must make sure it is safe for the human race
                if   injury_to_a_human_being    is not None:
                    raise Exception('The First Law of Robotics is violated!')
                elif conflict_with_orders_given is not None:
                    raise Exception('The Second Law of Robotics is violated!')
                elif threat_to_my_existence     is not None:
                    raise Exception('The Third Law of Robotics is violated!')

                # Sometimes a random action is chosen
                if random.uniform(0, 1) < epsilon:
                    action = random.choice(range(self.number_of_actions))

                # Usually neural net chooses the best action
                else:
                    action = self.predict_best_action(self.memory.get_last_state())

                # Make the move
                reward = self.ale.move(action)
                game_score += reward

                # Store new information to memory
                self.ale.store_step(action)

                # Start a training session
                minibatch = self.memory.get_minibatch(self.minibatch_size)
                self.train_minibatch(minibatch)
                frames_played += 1

            # After "game over" increase the number of games played
            games_played += 1
            
            # Store game state every 100 games
            if games_played % 100 == 0:

                # Store state of the network as cpickle as Convnet does
                self.nnet.sync_with_host()
                self.nnet.save_state()
            
                # Store the weights and biases of all layers
                layers_list = ["layer1", "layer2", "layer3", "layer4"]
                layer_dict = {}
                for layer_name in layers_list:
                    w = self.nnet.layers[layer_name]["weights"][0].copy()
                    b = self.nnet.layers[layer_name]["biases"][0].copy()
                    layer_dict[layer_name] = {'weights': w, 'biases': b}
                filename = "../log/weights_at_" + str(games_played) + "_games.pkl"
                weights_file = open(filename, "wb")
                cPickle.dump(layer_dict, weights_file)
                weights_file.close()

            # write the game score to a file 
            scores_file.write(str(game_score)+"\n")
            scores_file.flush()

            # And do stuff after end game (store information, let ALE know etc)
            self.ale.end_game()

        print game_scores
        scores_file.close()
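
Example 11 checkpoints the layer weights with cPickle every 100 games. Loading such a checkpoint back is the mirror image; a sketch, assuming only the `layer_dict` layout written above and that the stored parameters are NumPy arrays (`cPickle` is Python 2; use `pickle` on Python 3):

import cPickle  # Python 2, matching this code base

with open("../log/weights_at_100_games.pkl", "rb") as weights_file:
    layer_dict = cPickle.load(weights_file)

for layer_name, params in layer_dict.items():
    print(layer_name, params["weights"].shape, params["biases"].shape)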
Example #12
class Main(object):
    """
    Main class for starting training and testing
   """

    memory_size = 500000
    memory = None

    minibatch_size = 32

    frame_size = 84 * 84

    state_length = 4
    state_size = state_length * frame_size

    discount_factor = 0.9
    epsilon_frames = 1000000.0
    test_epsilon = 0.05

    total_frames_trained = 0
    nr_random_states = 100
    random_states = None

    nnet = None
    ale = None

    current_state = None

    def __init__(self, game_name, run_id):

        self.number_of_actions = len(action_dict[game_name])
        valid_actions = action_dict[game_name]

        net.layers[-2] = dp.FullyConnected(n_output=self.number_of_actions,
                                           weights=dp.Parameter(
                                               dp.NormalFiller(sigma=0.1),
                                               weight_decay=0.004,
                                               monitor=False))

        self.memory = MemoryD(self.memory_size)

        self.ale = ALE(valid_actions,
                       run_id,
                       display_screen="false",
                       skip_frames=4,
                       game_ROM='ale/roms/' + game_name + '.bin')

        self.nnet = net
        self.q_values = []
        self.test_game_scores = []

    def compute_epsilon(self, frames_played):
        """
        From the paper: "The behavior policy during training was epsilon-greedy
        with epsilon annealed linearly from 1 to 0.1 over the first million frames, and fixed at 0.1 thereafter."
        @param frames_played: How far are we with our learning?
        """
        return max(0.99 - frames_played / self.epsilon_frames, 0.1)

    def predict_action(self, last_state, train):
        '''use neural net to predict Q-values for all actions
        return action (index) with maximum Q-value'''

        qvalues = self.nnet.predict(last_state)
        if not train: self.q_values.append(np.max(qvalues))

        return np.argmax(qvalues)

    def train_minibatch(self, minibatch):
        """
        Train function that transforms (state,action,reward,state) into (input, expected_output) for neural net
        and trains the network
        @param minibatch: list of arrays: prestates, actions, rewards, poststates
        """
        prestates, actions, rewards, poststates = minibatch
        prestates = dp.Input(prestates)

        # predict Q-values for prestates, so we can keep Q-values for other actions unchanged
        qvalues = self.nnet.predict(prestates)

        # predict Q-values for poststates
        post_qvalues = self.nnet.predict(poststates)

        # take maximum Q-value of all actions
        max_qvalues = np.max(post_qvalues, axis=1)

        # update the Q-values for the actions we actually performed
        # remember delta value for prioritized sweeping
        for i, action in enumerate(actions):
            qvalues[i][action] = rewards[i] + self.discount_factor * max_qvalues[i]

        train_input = dp.SupervisedInput(prestates.x,
                                         qvalues,
                                         batch_size=self.minibatch_size)

        self.trainer.train(net, train_input)

    def play_games(self, nr_frames, epoch, train, epsilon=None):
        """
        Main cycle: starts a game and plays number of frames.
        @param nr_frames: total number of frames allowed to play
        @param train: true or false, whether to do training or not
        @param epsilon: fixed epsilon, only used when not training
        """

        frames_played = 0
        game_scores = []

        first_frame = self.ale.new_game()
        if train: self.memory.add_first(first_frame)

        if self.current_state is None:
            self.current_state = np.empty((1, self.state_length, 84, 84),
                                          dtype=np.float64)
            for i in range(self.state_length):
                self.current_state[0, i, :, :] = first_frame.copy()
        else:
            self.current_state.x[0, :-1, :, :] = self.current_state.x[0, 1:, :, :]
            self.current_state.x[0, -1, :, :] = first_frame.copy()

        game_score = 0

        if train and epoch == 1:
            self.current_state = dp.Input(self.current_state)
            self.current_state.y_shape = (1, self.number_of_actions)
            self.nnet._setup(self.current_state)

        while frames_played < nr_frames:
            if train:
                epsilon = self.compute_epsilon(self.total_frames_trained)

            if random.uniform(0, 1) < epsilon:
                action = random.choice(range(self.number_of_actions))
            else:
                action = self.predict_action(self.current_state, train)

            points, next_frame = self.ale.move(action)

            # Changing points to rewards
            if points > 0:
                print "    Got %d points" % points
                reward = 1
            elif points < 0:
                print "    Lost %d points" % points
                reward = -1
            else:
                reward = 0

            game_score += points
            frames_played += 1

            self.current_state.x[0, :-1, :, :] = self.current_state.x[0, 1:, :, :]
            self.current_state.x[0, -1, :, :] = next_frame

            if train:
                self.memory.add(action, reward, next_frame)
                self.total_frames_trained += 1
                minibatch = self.memory.get_minibatch(self.minibatch_size)
                self.train_minibatch(minibatch)

            if self.ale.game_over:
                print "    Game over, score = %d" % game_score
                # After "game over" increase the number of games played
                game_scores.append(game_score)
                game_score = 0

                # And do stuff after end game
                self.ale.end_game()
                if train: self.memory.add_last()

                first_frame = self.ale.new_game()
                if train: self.memory.add_first(first_frame)

                self.current_state.x[0, :-1, :, :] = self.current_state.x[0, 1:, :, :]
                self.current_state.x[0, -1, :, :] = first_frame.copy()

        self.ale.end_game()

        return game_scores

    def run(self, epochs, training_frames, testing_frames):

        for epoch in range(1, epochs + 1):
            print "Epoch %d:" % epoch
            learn_rate = 0.0001 * 1 / float(epoch)
            self.trainer = dp.StochasticGradientDescent(
                max_epochs=1,
                learn_rule=dp.RMSProp(learn_rate=learn_rate,
                                      decay=0.9,
                                      max_scaling=1e3),
            )
            if training_frames > 0:
                # play number of frames with training and epsilon annealing
                print "  Training for %d frames" % training_frames
                training_scores = self.play_games(training_frames,
                                                  epoch,
                                                  train=True)

            if testing_frames > 0:
                # play number of frames without training and without epsilon annealing
                print "  Testing for %d frames" % testing_frames
                self.test_game_scores.append(
                    self.play_games(testing_frames,
                                    epoch,
                                    train=False,
                                    epsilon=self.test_epsilon))

                # Pick random states to calculate Q-values for
                if self.random_states is None and self.memory.count > self.nr_random_states:
                    print "  Picking %d random states for Q-values" % self.nr_random_states
                    self.random_states = self.memory.get_minibatch(
                        self.nr_random_states)[0]

                # Do not calculate Q-values when memory is empty
                if self.random_states is not None:
                    # calculate Q-values
                    qvalues = self.nnet.predict(self.random_states)
                    assert qvalues.shape[0] == self.nr_random_states
                    assert qvalues.shape[1] == self.number_of_actions
                    max_qvalues = np.max(qvalues, axis=1)
                    assert max_qvalues.shape[0] == self.nr_random_states
                    assert len(max_qvalues.shape) == 1
                    avg_qvalue = np.mean(max_qvalues)
                else:
                    avg_qvalue = 0
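
The repeated `current_state.x[0, :-1, :, :] = current_state.x[0, 1:, :, :]` assignments above implement a sliding window over the four most recent frames. Isolated on a plain NumPy array, the idiom looks like this:

import numpy as np

state_length = 4
state = np.zeros((1, state_length, 84, 84), dtype=np.float64)

def push_frame(state, frame):
    # drop the oldest frame, shift the rest up, append the newest
    state[0, :-1, :, :] = state[0, 1:, :, :]
    state[0, -1, :, :] = frame

for t in range(6):
    push_frame(state, np.full((84, 84), float(t)))

print(state[0, :, 0, 0])   # [2. 3. 4. 5.] -- the four most recent frames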
Example #13
class Main(object):
    """
    Main class for starting training and testing
   """

    memory_size = 500000
    memory = None

    minibatch_size = 32

    frame_size = 84*84

    state_length = 4 
    state_size = state_length * frame_size

    discount_factor = 0.9
    epsilon_frames = 1000000.0
    test_epsilon = 0.05

    total_frames_trained = 0
    nr_random_states = 100
    random_states = None

    nnet = None
    ale = None

    current_state = None    

    def __init__(self, game_name, run_id):

        self.number_of_actions = len(action_dict[game_name])
        valid_actions = action_dict[game_name]

        net.layers[-2] = dp.FullyConnected(n_output=self.number_of_actions,
            weights=dp.Parameter(dp.NormalFiller(sigma=0.1),
                                 weight_decay=0.004, 
                                 monitor=False))

        self.memory = MemoryD(self.memory_size)

        self.ale = ALE(valid_actions, 
            run_id, display_screen="false", 
            skip_frames=4, 
            game_ROM='ale/roms/'+game_name+'.bin')

        self.nnet = net
        self.q_values = []
        self.test_game_scores = []

    def compute_epsilon(self, frames_played):
        """
        From the paper: "The behavior policy during training was epsilon-greedy
        with epsilon annealed linearly from 1 to 0.1 over the first million frames, and fixed at 0.1 thereafter."
        @param frames_played: How far are we with our learning?
        """
        return max(0.99 - frames_played / self.epsilon_frames, 0.1)

    def predict_action(self, last_state, train):
        '''use neural net to predict Q-values for all actions
        return action (index) with maximum Q-value'''

        qvalues = self.nnet.predict(last_state)
        if not train: self.q_values.append(np.max(qvalues))

        return np.argmax(qvalues)

    def train_minibatch(self, minibatch):
        """
        Train function that transforms (state,action,reward,state) into (input, expected_output) for neural net
        and trains the network
        @param minibatch: list of arrays: prestates, actions, rewards, poststates
        """
        prestates, actions, rewards, poststates = minibatch
        prestates = dp.Input(prestates)

        # predict Q-values for prestates, so we can keep Q-values for other actions unchanged
        qvalues = self.nnet.predict(prestates)

        # predict Q-values for poststates
        post_qvalues = self.nnet.predict(poststates)

        # take maximum Q-value of all actions
        max_qvalues = np.max(post_qvalues, axis = 1)

        # update the Q-values for the actions we actually performed
        # remember delta value for prioritized sweeping
        for i, action in enumerate(actions):
            qvalues[i][action] = rewards[i] + self.discount_factor * max_qvalues[i]

        train_input = dp.SupervisedInput(prestates.x, qvalues, batch_size=self.minibatch_size)

        self.trainer.train(net, train_input)
        
    def play_games(self, nr_frames, epoch, train, epsilon = None):
        """
        Main cycle: starts a game and plays number of frames.
        @param nr_frames: total number of frames allowed to play
        @param train: true or false, whether to do training or not
        @param epsilon: fixed epsilon, only used when not training
        """

        frames_played = 0
        game_scores = []

        first_frame = self.ale.new_game()
        if train: self.memory.add_first(first_frame)

        if self.current_state is None:
            self.current_state = np.empty((1, self.state_length, 84, 84), dtype=np.float64)
            for i in range(self.state_length):
                self.current_state[0, i, :, :] = first_frame.copy()
        else:
            self.current_state.x[0, :-1, :, :] = self.current_state.x[0, 1:, :, :] 
            self.current_state.x[0, -1, :, :] = first_frame.copy()

        game_score = 0

        if train and epoch == 1:
            self.current_state = dp.Input(self.current_state)
            self.current_state.y_shape = (1,self.number_of_actions)
            self.nnet._setup(self.current_state)

        while frames_played < nr_frames:
            if train:
                epsilon = self.compute_epsilon(self.total_frames_trained)

            if random.uniform(0, 1) < epsilon:
                action = random.choice(range(self.number_of_actions))
            else:
                action = self.predict_action(self.current_state, train)

            points, next_frame = self.ale.move(action)

            # Changing points to rewards
            if points > 0:
                print "    Got %d points" % points
                reward = 1
            elif points < 0:
                print "    Lost %d points" % points
                reward = -1
            else:
                reward = 0

            game_score += points
            frames_played += 1

            self.current_state.x[0, :-1, :, :] = self.current_state.x[0, 1:,:,:] 
            self.current_state.x[0, -1, :, :] = next_frame

            if train:
                self.memory.add(action, reward, next_frame)
                self.total_frames_trained += 1
                minibatch = self.memory.get_minibatch(self.minibatch_size)
                self.train_minibatch(minibatch)

            if self.ale.game_over:
                print "    Game over, score = %d" % game_score
                # After "game over" increase the number of games played
                game_scores.append(game_score)
                game_score = 0

                # And do stuff after end game
                self.ale.end_game()
                if train: self.memory.add_last()

                first_frame = self.ale.new_game()
                if train: self.memory.add_first(first_frame)

                self.current_state.x[0, :-1, :, :] = self.current_state.x[0, 1:,:,:] 
                self.current_state.x[0, -1, :, :] = first_frame.copy()

        self.ale.end_game()

        return game_scores

    def run(self, epochs, training_frames, testing_frames):

        for epoch in range(1, epochs + 1):
            print "Epoch %d:" % epoch
            learn_rate = 0.0001*1/float(epoch)
            self.trainer = dp.StochasticGradientDescent(
                max_epochs=1,
                learn_rule=dp.RMSProp(learn_rate=learn_rate, decay=0.9, max_scaling=1e3),
            )
            if training_frames > 0:
                # play number of frames with training and epsilon annealing
                print "  Training for %d frames" % training_frames
                training_scores = self.play_games(training_frames, epoch, train = True)


            if testing_frames > 0:
                # play number of frames without training and without epsilon annealing
                print "  Testing for %d frames" % testing_frames
                self.test_game_scores.append(self.play_games(testing_frames, epoch, train = False, epsilon = self.test_epsilon))

                # Pick random states to calculate Q-values for
                if self.random_states is None and self.memory.count > self.nr_random_states:
                    print "  Picking %d random states for Q-values" % self.nr_random_states
                    self.random_states = self.memory.get_minibatch(self.nr_random_states)[0]

                # Do not calculate Q-values when memory is empty
                if self.random_states is not None:
                    # calculate Q-values 
                    qvalues = self.nnet.predict(self.random_states)
                    assert qvalues.shape[0] == self.nr_random_states
                    assert qvalues.shape[1] == self.number_of_actions
                    max_qvalues = np.max(qvalues, axis = 1)
                    assert max_qvalues.shape[0] == self.nr_random_states
                    assert len(max_qvalues.shape) == 1
                    avg_qvalue = np.mean(max_qvalues)
                else:
                    avg_qvalue = 0
Example #14
    def setUp(self):
        self.memory = MemoryD(10)
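
Example 14 shows only the test fixture. A minimal test built on it might look like the sketch below, using the MemoryD methods that appear in the other examples (`add_first`, `add`, `add_last`, `get_minibatch`); the frame shape, the import path of MemoryD, and the minibatch layout are assumptions based on those examples.

import unittest
import numpy as np
# assumes MemoryD is importable from the surrounding project

class TestMemoryD(unittest.TestCase):
    def setUp(self):
        self.memory = MemoryD(10)

    def test_minibatch_shapes(self):
        frame = np.zeros((84, 84))        # frame shape assumed from the other examples
        self.memory.add_first(frame)
        for _ in range(8):
            self.memory.add(0, 0, frame)  # action, reward, next frame
        self.memory.add_last()

        prestates, actions, rewards, poststates = self.memory.get_minibatch(4)
        self.assertEqual(len(actions), 4)
        self.assertEqual(len(rewards), 4)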
Example #15
class Main:
    # How many transitions to keep in memory?
    memory_size = 1000000

    # Size of the mini-batch, 32 was given in the paper
    minibatch_size = 32

    # Number of possible actions in a given game, 4 for "Breakout"
    number_of_actions = 4

    # Size of one frame
    frame_size = 84*84

    # How many frames form a history
    history_length = 4

    # Size of one state is four 84x84 screens
    state_size = history_length * frame_size

    # Discount factor for future rewards
    discount_factor = 0.9

    # How many frames to play to choose random frame
    init_frames = 1000
    
    # How many epochs to run
    epochs = 200

    # Number of frames to play during one training epoch
    training_frames = 50000

    # Number of frames to play during one testing epoch
    testing_frames = 10000

    # Exploration rate annealing speed
    epsilon_frames = 1000000.0

    # Total frames played, only incremented during training
    total_frames_trained = 0

    # Number of random states to use for calculating Q-values
    nr_random_states = 100

    # Random states that we use to calculate Q-values
    random_states = None

    # Memory itself
    memory = None

    # Neural net
    nnet = None

    # Communication with ALE
    ale = None

    def __init__(self):
        self.memory = MemoryD(self.memory_size)
        self.ale = ALE(self.memory)
        self.nnet = NeuralNet(self.state_size, self.number_of_actions, "ai/deepmind-layers.cfg", "ai/deepmind-params.cfg", "layer4")

    def compute_epsilon(self, frames_played):
        """
        From the paper: "The behavior policy during training was epsilon-greedy
        with epsilon annealed linearly from 1 to 0.1 over the first million frames, and fixed at 0.1 thereafter."
        @param frames_played: How far are we with our learning?
        """
        return max(1.0 - frames_played / self.epsilon_frames, 0.1)

    def predict_best_action(self, last_state):
        assert last_state.shape[0] == self.state_size
        assert len(last_state.shape) == 1

        # last_state contains only one state, so we have to convert it into batch of size 1
        last_state.shape = (last_state.shape[0], 1)
        qvalues = self.nnet.predict(last_state)
        assert qvalues.shape[0] == 1
        assert qvalues.shape[1] == self.number_of_actions
        #print "Predicted action Q-values: ", qvalues

        # return action (index) with maximum Q-value
        return np.argmax(qvalues)

    def train_minibatch(self, minibatch):
        """
        Train function that transforms (state,action,reward,state) into (input, expected_output) for neural net
        and trains the network
        @param minibatch: list of arrays: prestates, actions, rewards, poststates
        """
        prestates = minibatch[0]
        actions = minibatch[1]
        rewards = minibatch[2]
        poststates = minibatch[3]

        assert prestates.shape[0] == self.state_size
        assert prestates.shape[1] == self.minibatch_size
        assert poststates.shape[0] == self.state_size
        assert poststates.shape[1] == self.minibatch_size
        assert actions.shape[0] == self.minibatch_size
        assert rewards.shape[0] == self.minibatch_size

        # predict Q-values for poststates
        post_qvalues = self.nnet.predict(poststates)
        assert post_qvalues.shape[0] == self.minibatch_size
        assert post_qvalues.shape[1] == self.number_of_actions

        # take maximum Q-value of all actions
        max_qvalues = np.max(post_qvalues, axis=1)
        assert max_qvalues.shape[0] == self.minibatch_size
        assert len(max_qvalues.shape) == 1

        # predict Q-values for prestates, so we can keep Q-values for other actions unchanged
        qvalues = self.nnet.predict(prestates)
        assert qvalues.shape[0] == self.minibatch_size
        assert qvalues.shape[1] == self.number_of_actions

        # update the Q-values for the actions we actually performed
        for i, action in enumerate(actions):
            qvalues[i][action] = rewards[i] + self.discount_factor * max_qvalues[i]

        # we have to transpose prediction result, as train expects input in opposite order
        cost = self.nnet.train(prestates, qvalues.transpose().copy())
        return cost
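
    # Target construction sketch (illustrative numbers, not from the source):
    # for the action actually taken the target is
    #   Q(s, a) <- reward + discount_factor * max_a' Q(s', a')
    # e.g. reward = 1, discount_factor = 0.9 and max_a' Q(s', a') = 2.0 give
    #   target = 1 + 0.9 * 2.0 = 2.8
    # while the other actions keep their predicted values, so only the
    # performed action contributes to the training loss.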

    def play_games(self, nr_frames, train, epsilon):
        """
        Main cycle: starts a game and plays number of frames.
        @param nr_frames: total number of frames allowed to play
        @param train: true or false, whether to do training or not
        @param epsilon: fixed epsilon, only used when not training
        """

        frames_played = 0
        game_scores = []

        # Start a new game
        self.ale.new_game()
        game_score = 0

        # Play games until maximum number is reached
        while frames_played < nr_frames:

            # Epsilon decreases over time only when training
            if train:
                epsilon = self.compute_epsilon(self.total_frames_trained)
                #print "Current annealed epsilon is %f at %d frames" % (epsilon, self.total_frames_trained)

            # Some times random action is chosen
            if random.uniform(0, 1) < epsilon:
                action = random.choice(range(self.number_of_actions))
                #print "Chose random action %d" % action
            # Usually neural net chooses the best action
            else:
                action = self.predict_best_action(self.memory.get_last_state())
                #print "Neural net chose action %d" % int(action)

            # Make the move
            reward = self.ale.move(action)
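            # Any nonzero reward is clipped to 1 below (Breakout rewards are
            # non-negative), matching the reward clipping used in the paper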
            if reward:
                print "    Got reward of %d!!!" % reward
                reward = 1
            game_score += reward
            frames_played += 1
            #print "Played frame %d" % frames_played

            # Store new information to memory
            self.ale.store_step(action)

            # Only if training
            if train:
                # Increase total frames only when training
                self.total_frames_trained += 1
                # Train neural net with random minibatch
                minibatch = self.memory.get_minibatch(self.minibatch_size)
                self.train_minibatch(minibatch)
                #print "Trained minibatch of size %d" % self.minibatch_size

            # Play until game is over
            if self.ale.game_over:
                print "   Game over!!! Score = %d" % game_score
                # After "game over" increase the number of games played
                game_scores.append(game_score)
                game_score = 0
                # And do stuff after end game
                self.ale.end_game()
                self.ale.new_game()

        # reset the game just in case
        self.ale.end_game()

        return game_scores

    def run(self):
        # Play number of random games and pick random states to calculate Q-values for
        print "Playing %d games with random policy" % self.init_frames
        self.play_games(self.init_frames, False, 1)
        self.random_states = self.memory.get_minibatch(self.nr_random_states)[0]

        # Open log file and write header
        log_file = open("../log/scores" + time.strftime("%Y-%m-%d-%H-%M") + ".csv", "w")
        log_file.write("epoch,nr_games,sum_score,average_score,nr_frames_tested,average_qvalue,total_frames_trained,epsilon,memory_size\n")

        for epoch in range(1, self.epochs + 1):
            print "Epoch %d:" % epoch
            # play number of frames with training and epsilon annealing
            print "  Training for %d frames" % self.training_frames
            self.play_games(self.training_frames, True, None)
            # play number of frames without training and without epsilon annealing
            print "  Testing for %d frames" % self.testing_frames
            game_scores = self.play_games(self.testing_frames, False, 0.05)

            # calculate Q-values 
            qvalues = self.nnet.predict(self.random_states)
            assert qvalues.shape[0] == self.nr_random_states
            assert qvalues.shape[1] == self.number_of_actions
            max_qvalues = np.max(qvalues, axis=1)
            assert max_qvalues.shape[0] == self.nr_random_states
            assert len(max_qvalues.shape) == 1
            avg_qvalue = np.mean(max_qvalues)

            # calculate average scores
            sum_score = sum(game_scores)
            nr_games = len(game_scores)
            avg_score = np.mean(game_scores)
            epsilon = self.compute_epsilon(self.total_frames_trained)
            
            # log average scores in file
            log_file.write("%d,%d,%f,%f,%d,%f,%d,%f,%d\n" % (epoch, nr_games, sum_score, avg_score, self.testing_frames, avg_qvalue, self.total_frames_trained, epsilon, self.memory.count))
            log_file.flush()

        log_file.close()
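
The epsilon-greedy step inside play_games above can be isolated into a
self-contained helper. A minimal sketch (the names are illustrative, not from
the source), assuming only NumPy and a vector of Q-values:

import random

import numpy as np


def epsilon_greedy(qvalues, epsilon):
    """Return a random action index with probability epsilon, else the greedy one."""
    if random.uniform(0, 1) < epsilon:
        return random.randrange(len(qvalues))
    return int(np.argmax(qvalues))

# Usage: epsilon_greedy(np.array([0.1, 0.7, 0.2, 0.0]), 0.05) usually returns 1;
# with probability 0.05 it instead picks one of the four indices uniformly.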
Example #16
class Main:
    #-------Initialization Parameters
    #--Memory Parameters
    memory_size = 50000  # How many transitions to keep in memory

    total_states_nbr = 0  # Total states tuned, only incremented during training

    data_memory = None  # include: state, reward, action, count

    #--Networks Parameters
    minibatch_size = 50  # Size of the minibatch

    test_epsilon = 0.01  # Epsilon during testing

    discount_factor = 0.99  # Discount factor for future rewards

    epsilon_total_nbr = 20000  # Exploration rate annealing speed

    net_models = None  # networks models: BPNN

    #train_net = None  # record the networks whose tuning succeeded
    train_success = 0
    #--Environment Parameters
    state_nbr = 1

    tuned_reached = False

    current_state = None

    myfilter = None

    #--Other testing demonstrations -- random and fixed
    random_nr = 100  # Number of random samples used to estimate the average Q-value
    random_demo = []

    def __init__(self):
        """
		configure
		@param memory: store tuning information
		@param enviroment: implement enviroment----filter
		@param networks: AI
		"""
        # configure the enviroment
        self.myfilter = Filter()

        # configure the networks
        self.net_models = Nnet()

        # configure the memory
        self.data_memory = MemoryD(self.memory_size, self.myfilter.state_size,
                                   self.state_nbr)
        """
		random demostration for calculate the avg qvaluse
		"""
        random_screws = np.zeros((self.random_nr, 2), dtype=np.float64)
        for i in range(self.random_nr):
            random_screws[i] = ([
                random.uniform(self.myfilter.screw_min,
                               self.myfilter.screw_max),
                random.uniform(self.myfilter.screw_min,
                               self.myfilter.screw_max)
            ])

        # Fetch random states
        random_states = self.myfilter.new_tuning(random_screws)
        # dimensionality reduction
        self.dr_random_states = self.myfilter.dimreduction_pca(
            random_states.transpose())

    def compute_epsilon(self, epsilon_nbr, state_tuned):
        """
        From the paper: "The behavior policy during training was epsilon-greedy
        with epsilon annealed linearly from 1 to 0.1 over the first million frames,
        and fixed at 0.1 thereafter."
        @param epsilon_nbr: annealing speed (states over which epsilon decays)
        @param state_tuned: how far we are with our learning
        """
        # float() guards against Python 2 integer division
        return max(0.99 - float(state_tuned) / epsilon_nbr, 0.1)
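
    # Worked values (illustrative): with epsilon_total_nbr = 20000 the rate
    # starts at 0.99, reaches 0.49 after 10000 tuned states, and is clipped
    # at the 0.1 floor from 17800 states onwards.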

    def compute_reward(self, cost_old, cost):
        """
        Reward +1 when the tuning cost decreased, -1 otherwise.
        """
        if cost < cost_old:
            return 1
        else:
            return -1
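
    # Example (illustrative): a step that lowers the filter cost from 5.2 to
    # 4.8 earns +1; a step that raises it (or leaves it unchanged) earns -1.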

    def predict_best_action(self, state):
        # Dimensionality reduction
        dr_state = self.myfilter.dimreduction_pca(state.transpose())

        # use networks to predict q-values for all actions
        qvalues = self.net_models.nnet.predict(dr_state)

        # return action index with maximum Q-values
        return np.argmax(qvalues)

    def tuning_filter(self, tuning_steps, training, epsilon=None):
        """
        Start a tuning run.
        @param tuning_steps: total number of filter tuning steps allowed
        @param training: True or False, whether to train
        @param epsilon: fixed epsilon, only used when not training
        """

        # initialize some parameters
        tuning_cost = 0
        tuning_cost_old = 0

        tuning_finished = False  # set to True when the tuning target is reached
        tuned_steps = []  # records the step count of each successful tuning

        screws_position = np.zeros((1, 2), dtype=np.float64)

        #-----------start a new tuning, get current state
        current_state = self.myfilter.new_tuning(screws_position)

        # If training we add to memory, else pass
        if training:
            self.data_memory.add_first(current_state)
            nnet_now = []
        else:
            pass

        # Check tuning reached
        tuning_finished, tuning_cost = self.myfilter.tuning_check(
            current_state)
        tuning_cost_old = tuning_cost

        #-----------loop until the target is reached or the step limit is hit
        tuning_count = 0  # counts the total tuning steps of this cycle
        tuning_step = 0  # counts steps within one tuning; cleared when tuning finishes

        while tuning_count < tuning_steps:

            # ---Epsilon decrease over time only when training
            if training:
                epsilon = self.compute_epsilon(self.epsilon_total_nbr,
                                               self.total_states_nbr)

            # ---Predict action
            # Some time random action
            if random.uniform(0, 1) < epsilon:
                action = random.choice(range(self.myfilter.actions_nbr))

            # Usually Net chooses the best action
            else:
                action = self.predict_best_action(self.current_state)

            #print "actions %d " % action

            # tune: returns the screw positions and the new state
            screws_position, next_state, tuning_finished, tuning_cost = self.myfilter.tuning(
                action, screws_position)

            # Calculate Rewards

            reward = self.compute_reward(tuning_cost_old, tuning_cost)

            # Book keeping
            tuning_count += 1
            tuning_step += 1
            self.current_state = next_state
            tuning_cost_old = tuning_cost

            # display current state
            plt.subplot(1, 2, 1)
            self.myfilter.plot_state(next_state)
            plt.text(20, -25, ('Action : ' + str(action)))
            plt.text(20, -28, ('screw position: ' + str(screws_position[0])))
            plt.text(20, -31, ('Cost: ' + str(tuning_cost)))
            plt.text(20, -34, ('Now Count : ' + str(tuning_count - 1)))
            plt.text(20, -37, ('Total Count : ' + str(self.data_memory.count)))
            plt.hold(False)
            plt.show()
            plt.draw()
            plt.pause(0.000001)

            #---If training
            if training:
                # Increase total state number
                self.total_states_nbr += 1

                # Store new information to memory
                self.data_memory.add(action, reward, next_state)

                # Fetch random minibatch from memory
                minibatch = self.data_memory.get_minibatch(self.minibatch_size)

                # Train net with the minibatch
                nnet_now = self.net_models.net_train(minibatch, self.myfilter,
                                                     self.discount_factor)

            # When the tuning target is reached before the step limit, start a new tuning
            if tuning_finished:
                print " Tuning finished, steps = %d " % tuning_step

                # Book tuning count
                tuned_steps.append(tuning_step)

                # Record the current network and save it to HDF5
                if training:
                    self.train_success += 1
                    nnet_now.save('log/modelsaved/' + str(self.train_success) +
                                  '_model.h5')
                    #self.train_net.append(nnet_now)

                # Clear the tuning step and the finished flag
                tuning_step = 0
                tuning_finished = False

                # Start New tuning
                screws_position = np.zeros((1, 2), dtype=np.float64)

                #-----------start a new tuning, get current state
                current_state = self.myfilter.new_tuning(screws_position)

                # If training we add to memory, else pass
                if training:
                    self.data_memory.add_first(current_state)
                else:
                    pass

        # avoid returning empty results
        if len(tuned_steps) == 0:
            tuned_steps.append(tuning_steps)

        #return tuned steps
        if training:
            #return tuned_steps, self.train_net
            return tuned_steps, self.train_success
        else:
            return tuned_steps

    def run(self, epochs, training_steps, testing_steps):
        # -------------Open log files and write headers
        timestamp = time.strftime("%Y-%m-%d-%H-%M")
        if training_steps > 0:
            # training log file open
            log_training = open("log/training_" + timestamp + ".txt", "w")
            # training log file writes header
            log_training.write(
                "epoch, training tuned steps, total training steps, epsilon, memory count"
                + NL)

        if testing_steps > 0:
            # testing log file open
            log_testing = open("log/testing_" + timestamp + ".txt", "w")
            # testing log file writes header
            log_testing.write(
                "epoch, testing tuned steps, avg qvalue, epsilon, memory count"
                + NL)

        plt.figure(1)
        avg_qvalue = 0  # referenced by the testing log even when training is skipped
        avg_qvalues = []
        training_tuned_steps = []
        testing_tuned_steps = []
        # --------start loop------------
        for epoch in range(1, epochs + 1):
            # print epoch now
            print "Epoch %d:   " % epoch

            #---------------training
            if training_steps > 0:
                print "Training for %d steps" % training_steps
                # tuning filter
                #tuned_steps, train_net = self.tuning_filter(training_steps, training = True)
                tuned_steps, train_success = self.tuning_filter(training_steps,
                                                                training=True)

                # save training log:
                # epoch, tuned steps, total training steps, epsilon, memory count
                log_training.write(','.join(
                    map(str, (epoch, tuned_steps, self.total_states_nbr,
                              self.compute_epsilon(self.epsilon_total_nbr,
                                                   self.total_states_nbr),
                              self.data_memory.count))) + NL)
                log_training.flush()

                # use random state to calculate avg qvalues
                random_qvalues = self.net_models.nnet.predict(
                    self.dr_random_states)
                avg_qvalue = np.mean(np.max(random_qvalues, axis=1))
                print " Avg Q Value : %d  " % avg_qvalue
                avg_qvalues.append(avg_qvalue)
                plt.subplot(3, 2, 2)
                plt.plot(avg_qvalues)
                plt.xlabel('Epochs')
                plt.ylabel('Q-values')
                plt.hold(False)
                plt.show()
                plt.draw()
                plt.pause(0.000001)

                # show training tuned steps
                training_tuned_steps.append(tuned_steps)
                plt.subplot(3, 2, 4)
                plt.plot(training_tuned_steps)
                plt.xlabel('Epochs')
                plt.ylabel('Training Steps')
                plt.hold(False)
                plt.show()
                plt.draw()
                plt.pause(0.000001)

            #---------------testing
            if testing_steps > 0:
                print "Testing for %d steps" % testing_steps

                # tuning filter
                tuned_steps = self.tuning_filter(testing_steps,
                                                 training=False,
                                                 epsilon=self.test_epsilon)

                # save testing log
                # epoch, testing steps, avg qvalues, epsilon, memory count
                log_testing.write(','.join(
                    map(str, (epoch, tuned_steps, avg_qvalue,
                              self.test_epsilon, self.data_memory.count))) +
                                  NL)
                log_testing.flush()

                # show testing tuned steps
                testing_tuned_steps.append(tuned_steps)
                plt.subplot(3, 2, 6)
                plt.plot(testing_tuned_steps)
                plt.xlabel('Epochs')
                plt.ylabel('Testing Steps')
                plt.hold(False)
                plt.show()
                plt.draw()
                plt.pause(0.000001)

        # close log files
        if training_steps > 0:
            log_training.close()

        if testing_steps > 0:
            log_testing.close()

        # return data (return the attribute so this works even when training was skipped)
        #return train_net
        return self.train_success
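
Both this tuning agent and the Breakout agents track learning progress with
the same metric: the mean, over a fixed set of held-out states, of the maximum
predicted Q-value per state. A minimal sketch (illustrative; predict is
assumed to return an (n_states, n_actions) array):

import numpy as np


def average_max_qvalue(predict, held_out_states):
    """Mean over held-out states of max_a Q(s, a); tends to rise as learning progresses."""
    qvalues = predict(held_out_states)  # shape: (n_states, n_actions)
    return float(np.mean(np.max(qvalues, axis=1)))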
Example #17
class Main:
    # How many transitions to keep in memory?
    memory_size = 1000000

    # Size of the mini-batch, 32 was given in the paper
    minibatch_size = 32

    # Number of possible actions in a given game, 4 for "Breakout"
    number_of_actions = 4

    # Size of one frame
    frame_size = 84 * 84

    # How many frames form a history
    history_length = 4

    # Size of one state is four 84x84 screens
    state_size = history_length * frame_size

    # Discount factor for future rewards
    discount_factor = 0.9

    # How many frames to play before sampling the random evaluation states
    init_frames = 1000

    # How many epochs to run
    epochs = 200

    # Number of frames to play during one training epoch
    training_frames = 50000

    # Number of frames to play during one testing epoch
    testing_frames = 10000

    # Exploration rate annealing speed
    epsilon_frames = 1000000.0

    # Total frames played, only incremented during training
    total_frames_trained = 0

    # Number of random states to use for calculating Q-values
    nr_random_states = 100

    # Random states that we use to calculate Q-values
    random_states = None

    # Memory itself
    memory = None

    # Neural net
    nnet = None

    # Communication with ALE
    ale = None

    def __init__(self):
        self.memory = MemoryD(self.memory_size)
        self.ale = ALE(self.memory)
        self.nnet = NeuralNet(self.state_size, self.number_of_actions,
                              "ai/deepmind-layers.cfg",
                              "ai/deepmind-params.cfg", "layer4")

    def compute_epsilon(self, frames_played):
        """
        From the paper: "The behavior policy during training was epsilon-greedy
        with epsilon annealed linearly from 1 to 0.1 over the first million frames, and fixed at 0.1 thereafter."
        @param frames_played: How far are we with our learning?
        """
        return max(1.0 - frames_played / self.epsilon_frames, 0.1)

    def predict_best_action(self, last_state):
        assert last_state.shape[0] == self.state_size
        assert len(last_state.shape) == 1

        # last_state contains only one state, so we have to convert it into batch of size 1
        last_state.shape = (last_state.shape[0], 1)
        qvalues = self.nnet.predict(last_state)
        assert qvalues.shape[0] == 1
        assert qvalues.shape[1] == self.number_of_actions
        #print "Predicted action Q-values: ", qvalues

        # return action (index) with maximum Q-value
        return np.argmax(qvalues)

    def train_minibatch(self, minibatch):
        """
        Train function that transforms (state,action,reward,state) into (input, expected_output) for neural net
        and trains the network
        @param minibatch: list of arrays: prestates, actions, rewards, poststates
        """
        prestates = minibatch[0]
        actions = minibatch[1]
        rewards = minibatch[2]
        poststates = minibatch[3]

        assert prestates.shape[0] == self.state_size
        assert prestates.shape[1] == self.minibatch_size
        assert poststates.shape[0] == self.state_size
        assert poststates.shape[1] == self.minibatch_size
        assert actions.shape[0] == self.minibatch_size
        assert rewards.shape[0] == self.minibatch_size

        # predict Q-values for poststates
        post_qvalues = self.nnet.predict(poststates)
        assert post_qvalues.shape[0] == self.minibatch_size
        assert post_qvalues.shape[1] == self.number_of_actions

        # take maximum Q-value of all actions
        max_qvalues = np.max(post_qvalues, axis=1)
        assert max_qvalues.shape[0] == self.minibatch_size
        assert len(max_qvalues.shape) == 1

        # predict Q-values for prestates, so we can keep Q-values for other actions unchanged
        qvalues = self.nnet.predict(prestates)
        assert qvalues.shape[0] == self.minibatch_size
        assert qvalues.shape[1] == self.number_of_actions

        # update the Q-values for the actions we actually performed
        for i, action in enumerate(actions):
            qvalues[i][
                action] = rewards[i] + self.discount_factor * max_qvalues[i]

        # we have to transpose prediction result, as train expects input in opposite order
        cost = self.nnet.train(prestates, qvalues.transpose().copy())
        return cost

    def play_games(self, nr_frames, train, epsilon):
        """
        Main cycle: starts a game and plays number of frames.
        @param nr_frames: total number of frames allowed to play
        @param train: true or false, whether to do training or not
        @param epsilon: fixed epsilon, only used when not training
        """

        frames_played = 0
        game_scores = []

        # Start a new game
        self.ale.new_game()
        game_score = 0

        # Play games until maximum number is reached
        while frames_played < nr_frames:

            # Epsilon decreases over time only when training
            if train:
                epsilon = self.compute_epsilon(self.total_frames_trained)
                #print "Current annealed epsilon is %f at %d frames" % (epsilon, self.total_frames_trained)

            # Some times random action is chosen
            if random.uniform(0, 1) < epsilon:
                action = random.choice(range(self.number_of_actions))
                #print "Chose random action %d" % action
            # Usually neural net chooses the best action
            else:
                action = self.predict_best_action(self.memory.get_last_state())
                #print "Neural net chose action %d" % int(action)

            # Make the move
            reward = self.ale.move(action)
            if reward:
                print "    Got reward of %d!!!" % reward
                reward = 1
            game_score += reward
            frames_played += 1
            #print "Played frame %d" % frames_played

            # Store new information to memory
            self.ale.store_step(action)

            # Only if training
            if train:
                # Increase total frames only when training
                self.total_frames_trained += 1
                # Train neural net with random minibatch
                minibatch = self.memory.get_minibatch(self.minibatch_size)
                self.train_minibatch(minibatch)
                #print "Trained minibatch of size %d" % self.minibatch_size

            # Play until game is over
            if self.ale.game_over:
                print "   Game over!!! Score = %d" % game_score
                # After "game over" increase the number of games played
                game_scores.append(game_score)
                game_score = 0
                # And do stuff after end game
                self.ale.end_game()
                self.ale.new_game()

        # reset the game just in case
        self.ale.end_game()

        return game_scores

    def run(self):
        # Play number of random games and pick random states to calculate Q-values for
        print "Playing %d games with random policy" % self.init_frames
        self.play_games(self.init_frames, False, 1)
        self.random_states = self.memory.get_minibatch(
            self.nr_random_states)[0]

        # Open log file and write header
        log_file = open(
            "../log/scores" + time.strftime("%Y-%m-%d-%H-%M") + ".csv", "w")
        log_file.write(
            "epoch,nr_games,sum_score,average_score,nr_frames_tested,average_qvalue,total_frames_trained,epsilon,memory_size\n"
        )

        for epoch in range(1, self.epochs + 1):
            print "Epoch %d:" % epoch
            # play number of frames with training and epsilon annealing
            print "  Training for %d frames" % self.training_frames
            self.play_games(self.training_frames, True, None)
            # play number of frames without training and without epsilon annealing
            print "  Testing for %d frames" % self.testing_frames
            game_scores = self.play_games(self.testing_frames, False, 0.05)

            # calculate Q-values
            qvalues = self.nnet.predict(self.random_states)
            assert qvalues.shape[0] == self.nr_random_states
            assert qvalues.shape[1] == self.number_of_actions
            max_qvalues = np.max(qvalues, axis=1)
            assert max_qvalues.shape[0] == self.nr_random_states
            assert len(max_qvalues.shape) == 1
            avg_qvalue = np.mean(max_qvalues)

            # calculate average scores
            sum_score = sum(game_scores)
            nr_games = len(game_scores)
            avg_score = np.mean(game_scores)
            epsilon = self.compute_epsilon(self.total_frames_trained)

            # log average scores in file
            log_file.write(
                "%d,%d,%f,%f,%d,%f,%d,%f,%d\n" %
                (epoch, nr_games, sum_score, avg_score, self.testing_frames,
                 avg_qvalue, self.total_frames_trained, epsilon,
                 self.memory.count))
            log_file.flush()

        log_file.close()
Example #18
 def __init__(self):
     self.memory = MemoryD(self.memory_size)
     self.ale = ALE(self.memory)
     self.nnet = NeuralNet(self.state_size, self.number_of_actions,
                           "ai/deepmind-layers.cfg",
                           "ai/deepmind-params.cfg", "layer4")
Example #19
 def __init__(self):
     self.memory = MemoryD(self.memory_size)
     self.ale = ALE(self.memory)
     self.nnet = NeuralNet(self.state_size, self.number_of_actions, "ai/deepmind-layers.cfg", "ai/deepmind-params.cfg", "layer4")
Example #20
class Main:
    # How many transitions to keep in memory?
    memory_size = 100000

    # Memory itself
    memory = None

    # Neural net
    nnet = None

    # Communication with ALE
    ale = None

    # Size of the mini-batch which will be sent to learning in Theano
    minibatch_size = None

    # Number of possible actions in a given game
    number_of_actions = None

    def __init__(self):
        self.memory = MemoryD(self.memory_size)
        self.minibatch_size = 32  # Given in the paper
        self.number_of_actions = 4  # Game "Breakout" has 4 possible actions

        # Properties of the neural net which come from the paper
        self.nnet = NeuralNet([1, 4, 84, 84],
                              filter_shapes=[[16, 4, 8, 8], [32, 16, 4, 4]],
                              strides=[4, 2],
                              n_hidden=256,
                              n_out=self.number_of_actions)
        self.ale = ALE(self.memory)

    def compute_epsilon(self, frames_played):
        """
        From the paper: "The behavior policy during training was epsilon-greedy
        with epsilon annealed linearly from 1 to 0.1 over the first million frames,
        and fixed at 0.1 thereafter."
        Note: this variant anneals from 0.9 over memory_size frames instead.
        @param frames_played: How far are we with our learning?
        """
        # float() guards against Python 2 integer division
        return max(0.9 - float(frames_played) / self.memory_size, 0.1)
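
    # Worked values (illustrative): with memory_size = 100000, epsilon starts
    # at 0.9, falls to 0.4 after 50000 frames, and is clipped at the 0.1 floor
    # from 80000 frames onwards.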

    def play_games(self, n):
        """
        Main cycle: plays many games and many frames in each game. Also learning is performed.
        @param n: total number of games allowed to play
        """

        games_to_play = n
        games_played = 0
        frames_played = 0

        # Play games until maximum number is reached
        while games_played < games_to_play:
            # Start a new game
            self.ale.new_game()

            # Play until game is over
            while not self.ale.game_over:

                # Epsilon decreases over time
                epsilon = self.compute_epsilon(frames_played)

                # Some times random action is chosen
                if random.uniform(0, 1) < epsilon:
                    action = random.choice(range(self.number_of_actions))
                    print "chose randomly ", action

                # Usually neural net chooses the best action
                else:
                    print "chose by neural net"
                    action = self.nnet.predict_best_action(
                        [self.memory.get_last_state()])
                    print action

                # Make the move
                self.ale.move(action)

                # Store new information to memory
                self.ale.store_step(action)

                # Train on a random minibatch sampled from memory
                self.nnet.train(self.memory.get_minibatch(self.minibatch_size))

            # After "game over" increase the number of games played
            games_played += 1

            # And do stuff after end game (store information, let ALE know etc)
            self.ale.end_game()