Example #1
    def __init__(self, input_width, input_height, num_actions,
                 num_frames, discount, learning_rate, rho,
                 rms_epsilon, momentum, clip_delta, freeze_interval,
                 batch_size, update_rule,
                 batch_accumulator, state_count, input_scale=255.0):
                     
        self.state_count=state_count
        self.input_width = input_width
        self.input_height = input_height
        self.num_actions = num_actions
        self.num_frames = num_frames
        self.batch_size = batch_size
        self.discount = discount
        self.rho = rho
        self.lr = learning_rate
        self.rms_epsilon = rms_epsilon
        self.momentum = momentum
        self.clip_delta = clip_delta
        self.freeze_interval = freeze_interval

        self.update_counter = 0
        
        self.l_out = self.build_nature_network_dnn(input_width, input_height,
                                        num_actions, num_frames, batch_size)
        
        if self.freeze_interval > 0:
            self.next_l_out = self.build_nature_network_dnn(input_width,
                                                 input_height, num_actions,
                                                 num_frames, batch_size)
            self.reset_q_hat()

        states = T.matrix('states')
        next_states = T.matrix('next_states')
        rewards = T.col('rewards')
        actions = T.icol('actions')
        terminals = T.icol('terminals')

        # buffer of inputs for the whole batch
        self.states_shared = theano.shared(
            np.zeros((batch_size, state_count),
                     dtype=theano.config.floatX))

        # buffer for the state that each sample transitions into
        self.next_states_shared = theano.shared(
            np.zeros((batch_size, state_count),
                     dtype=theano.config.floatX))

        # one reward for each episode, so what about individual actions?
        self.rewards_shared = theano.shared(
            np.zeros((batch_size, 1), dtype=theano.config.floatX),
            broadcastable=(False, True))

        # one chosen action for each episode
        self.actions_shared = theano.shared(
            np.zeros((batch_size, 1), dtype='int32'),
            broadcastable=(False, True))

        # ?? probably 0 and 1, whether it is the last value or not
        self.terminals_shared = theano.shared(
            np.zeros((batch_size, 1), dtype='int32'),
            broadcastable=(False, True))

        # takes q_vals and next q_vals and returns the differences for the batch; all of this only for the first pass

        q_vals = lasagne.layers.get_output(self.l_out, states / input_scale)
        if self.freeze_interval > 0:
            next_q_vals = lasagne.layers.get_output(self.next_l_out,
                                                    next_states / input_scale)
        else:
            next_q_vals = lasagne.layers.get_output(self.l_out,
                                                    next_states / input_scale)
            next_q_vals = theano.gradient.disconnected_grad(next_q_vals)

        target = (rewards +
                  (T.ones_like(terminals) - terminals) *
                  self.discount * T.max(next_q_vals, axis=1, keepdims=True))
        diff = target - q_vals[T.arange(batch_size),
                               actions.reshape((-1,))].reshape((-1, 1))
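        # The target above is the standard one-step Q-learning target,
        #     target = r + (1 - terminal) * discount * max_a' Q(s', a'),
        # and diff is the TD error of the action actually taken in each
        # batch row (q_vals indexed by the stored action).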

        # unclear
        if self.clip_delta > 0:
            diff = diff.clip(-self.clip_delta, self.clip_delta)

        if batch_accumulator == 'sum':
            loss = T.sum(diff ** 2)
        elif batch_accumulator == 'mean':
            loss = T.mean(diff ** 2)
        else:
            raise ValueError("Bad accumulator: {}".format(batch_accumulator))


#
        params = lasagne.layers.helper.get_all_params(self.l_out)
        givens = {
            states: self.states_shared,
            next_states: self.next_states_shared,
            rewards: self.rewards_shared,
            actions: self.actions_shared,
            terminals: self.terminals_shared
        }
        if update_rule == 'deepmind_rmsprop':
            updates = deepmind_rmsprop(loss, params, self.lr, self.rho,
                                       self.rms_epsilon)
        elif update_rule == 'rmsprop':
            updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho,
                                              self.rms_epsilon)

        elif update_rule == 'adam':
            updates = lasagne.updates.adam(loss, params, self.lr, self.rho,
                                           self.rho, self.rms_epsilon)

        elif update_rule == 'adagrad':
            updates = lasagne.updates.adagrad(loss, params, self.lr,
                                              self.rms_epsilon)
        elif update_rule == 'sgd':
            updates = lasagne.updates.sgd(loss, params, self.lr)
            
        elif update_rule == 'momentum':
            updates = lasagne.updates.momentum(loss, params, self.lr, self.momentum)

        else:
            raise ValueError("Unrecognized update: {}".format(update_rule))

        if self.momentum > 0:
            updates = lasagne.updates.apply_momentum(updates, None,
                                                     self.momentum)

        self._train = theano.function([], [loss, q_vals], updates=updates,
                                      givens=givens)
        self._q_vals = theano.function([], q_vals,
                                       givens={states: self.states_shared})
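
A minimal usage sketch for the class above (an added illustration, not part of the original example): net is assumed to be an already-constructed instance, and the zero arrays are placeholders for a sampled replay minibatch with the shapes the shared buffers expect.

    import numpy as np

    # assumes theano.config.floatX == 'float32' and an existing instance net
    batch_size, state_count = 32, 4   # must match what the instance was built with

    states      = np.zeros((batch_size, state_count), dtype='float32')
    next_states = np.zeros((batch_size, state_count), dtype='float32')
    actions     = np.zeros((batch_size, 1), dtype='int32')
    rewards     = np.zeros((batch_size, 1), dtype='float32')
    terminals   = np.zeros((batch_size, 1), dtype='int32')   # 1 where the episode ended

    net.states_shared.set_value(states)
    net.next_states_shared.set_value(next_states)
    net.actions_shared.set_value(actions)
    net.rewards_shared.set_value(rewards)
    net.terminals_shared.set_value(terminals)

    loss, q_vals = net._train()                # one update step on the loaded minibatch
    greedy = np.argmax(net._q_vals(), axis=1)  # greedy actions for the loaded states

Because the data is fed through givens, _train and _q_vals take no arguments; each training step is simply "load the minibatch into the shared buffers, then call _train()".
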
Example #2
    def __init__(self, stateSize, actionSize, numFrames, batchSize, discount,
                 rho, momentum, learningRate, rmsEpsilon, rng, updateRule,
                 batchAccumulator, freezeInterval):
        self.stateSize = stateSize
        self.actionSize = actionSize
        self.numFrames = numFrames
        self.batchSize = batchSize
        self.discount = discount
        self.rho = rho
        self.momentum = momentum
        self.learningRate = learningRate
        self.rmsEpsilon = rmsEpsilon
        self.rng = rng
        self.updateRule = updateRule
        self.batchAccumulator = batchAccumulator
        self.freezeInterval = freezeInterval

        lasagne.random.set_rng(self.rng)

        self.updateCounter = 0

        self.lOut = self.buildNetwork(self.stateSize, self.actionSize,
                                      self.numFrames, self.batchSize)

        if self.freezeInterval > 0:
            self.nextLOut = self.buildNetwork(self.stateSize, self.actionSize,
                                              self.numFrames, self.batchSize)
            self.resetQHat()

        states = T.ftensor3('states')
        nextStates = T.ftensor3('nextStates')
        rewards = T.fcol('rewards')
        actions = T.icol('actions')
        terminals = T.icol('terminals')

        # Shared variables for training from a minibatch of replayed
        # state transitions, each consisting of numFrames + 1 (due to
        # overlap) states, along with the chosen action and resulting
        # reward and terminal status.
        self.states_shared = theano.shared(
            numpy.zeros((self.batchSize, self.numFrames + 1, self.stateSize),
                        dtype=theano.config.floatX))
        self.rewards_shared = theano.shared(numpy.zeros(
            (self.batchSize, 1), dtype=theano.config.floatX),
                                            broadcastable=(False, True))
        self.actions_shared = theano.shared(numpy.zeros((self.batchSize, 1),
                                                        dtype='int32'),
                                            broadcastable=(False, True))
        self.terminals_shared = theano.shared(numpy.zeros((self.batchSize, 1),
                                                          dtype='int32'),
                                              broadcastable=(False, True))

        # Shared variable for a single state, to calculate qVals
        self.state_shared = theano.shared(
            numpy.zeros((self.numFrames, self.stateSize),
                        dtype=theano.config.floatX))

        qVals = lasagne.layers.get_output(self.lOut, states)

        if self.freezeInterval > 0:
            nextQVals = lasagne.layers.get_output(self.nextLOut, nextStates)
        else:
            nextQVals = lasagne.layers.get_output(self.lOut, nextStates)
            nextQVals = theano.gradient.disconnected_grad(nextQVals)

        # Cast terminals to floatX
        terminalsX = terminals.astype(theano.config.floatX)
        # T.eq(a, b) returns a variable representing the logical
        # equality (a == b)
        actionmask = T.eq(
            T.arange(self.actionSize).reshape((1, -1)), actions.reshape(
                (-1, 1))).astype(theano.config.floatX)
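        # E.g. with actionSize = 3 and actions = [[2], [0]], actionmask is
        # [[0, 0, 1], [1, 0, 0]]; multiplying it with qVals and summing over
        # axis 1 below picks out Q(s, a) for the action taken in each row.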

        target = (rewards + (T.ones_like(terminalsX) - terminalsX) *
                  self.discount * T.max(nextQVals, axis=1, keepdims=True))
        output = (qVals * actionmask).sum(axis=1).reshape((-1, 1))
        diff = target - output

        # no if clip delta, since clip-delta=0

        loss = (diff**2)

        if self.batchAccumulator == 'sum':
            loss = T.sum(loss)
        elif self.batchAccumulator == 'mean':
            loss = T.mean(loss)
        else:
            raise ValueError('Bad accumulator: {}'.format(self.batchAccumulator))

        params = lasagne.layers.helper.get_all_params(self.lOut)
        train_givens = {
            states: self.states_shared[:, :-1],
            nextStates: self.states_shared[:, 1:],
            rewards: self.rewards_shared,
            actions: self.actions_shared,
            terminals: self.terminals_shared
        }
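        # The shared buffer stores numFrames + 1 consecutive states per sample
        # (see the comment above), so dropping the last column yields the
        # current stacked states and dropping the first yields the successor
        # states, without storing the overlap twice.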

        if self.updateRule == 'rmsprop':
            updates = lasagne.updates.rmsprop(loss, params, self.learningRate,
                                              self.rho, self.rmsEpsilon)

        elif self.updateRule == 'deepmind_rmsprop':
            updates = deepmind_rmsprop(loss, params, self.learningRate,
                                       self.rho, self.rmsEpsilon)
        else:
            raise ValueError('Unrecognized update: {}'.format(updateRule))

        if self.momentum > 0:
            updates = lasagne.updates.apply_momentum(updates, None,
                                                     self.momentum)

        self._train = theano.function([], [loss],
                                      updates=updates,
                                      givens=train_givens)
        q_givens = {
            states: self.state_shared.reshape(
                (1, self.numFrames, self.stateSize))
        }

        self._q_vals = theano.function([], qVals[0], givens=q_givens)
Example #3
        target = reward + self.discount * next_q_val[next_action]
        diff = target - q_vals[action]
        loss = 0.5 * diff ** 2
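        # Note that this target bootstraps from Q(s', a') for the action that
        # was actually taken next (next_action), i.e. an on-policy SARSA-style
        # update, rather than the max over next-state Q-values used elsewhere.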

        params = lasagne.layers.helper.get_all_params(self.l_out)  
        givens = {
            state: self.state_shared,
            action: self.action_shared,
            reward: self.reward_shared,
            next_state: self.next_state_shared,
            next_action: self.next_action_shared
        }

        if update_rule == 'deepmind_rmsprop':
            updates = deepmind_rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon)
        elif update_rule == 'rmsprop':
            updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho, self.rms_epsilon)
        elif update_rule == 'sgd':
            updates = lasagne.updates.sgd(loss, params, self.lr)
        else:
            raise ValueError("Unrecognized update: {}".format(update_rule))

        if self.momentum > 0:
            updates = lasagne.updates.apply_momentum(updates, None, self.momentum)

        self._train = theano.function([], [loss, q_vals], updates=updates, givens=givens)
        self._q_vals = theano.function([], q_vals, givens={state: self.state_shared})

    def build_network(self, network_type, input_width, input_height, output_dim, num_frames):
        if network_type == "large":
Example #4
    def __init__(self,
                 input_width,
                 input_height,
                 num_actions,
                 num_frames,
                 discount,
                 learning_rate,
                 rho,
                 rms_epsilon,
                 momentum,
                 clip_delta,
                 freeze_interval,
                 batch_size,
                 update_rule,
                 batch_accumulator,
                 state_count,
                 input_scale=255.0):

        self.state_count = state_count
        self.input_width = input_width
        self.input_height = input_height
        self.num_actions = num_actions
        self.num_frames = num_frames
        self.batch_size = batch_size
        self.discount = discount
        self.rho = rho
        self.lr = learning_rate
        self.rms_epsilon = rms_epsilon
        self.momentum = momentum
        self.clip_delta = clip_delta
        self.freeze_interval = freeze_interval

        self.update_counter = 0

        self.l_out = self.build_nature_network_dnn(input_width, input_height,
                                                   num_actions, num_frames,
                                                   batch_size)

        if self.freeze_interval > 0:
            self.next_l_out = self.build_nature_network_dnn(
                input_width, input_height, num_actions, num_frames, batch_size)
            self.reset_q_hat()

        states = T.matrix('states')
        next_states = T.matrix('next_states')
        rewards = T.col('rewards')
        actions = T.icol('actions')
        terminals = T.icol('terminals')

        # buffer of inputs for the whole batch
        self.states_shared = theano.shared(
            np.zeros((batch_size, state_count), dtype=theano.config.floatX))

        # buffer for the state that each sample transitions into
        self.next_states_shared = theano.shared(
            np.zeros((batch_size, state_count), dtype=theano.config.floatX))

        # one reward for each episode, so what about individual actions?
        self.rewards_shared = theano.shared(np.zeros(
            (batch_size, 1), dtype=theano.config.floatX),
                                            broadcastable=(False, True))

        # one chosen action for each episode
        self.actions_shared = theano.shared(np.zeros((batch_size, 1),
                                                     dtype='int32'),
                                            broadcastable=(False, True))

        # ?? probably 0 and 1, whether it is the last value or not
        self.terminals_shared = theano.shared(np.zeros((batch_size, 1),
                                                       dtype='int32'),
                                              broadcastable=(False, True))

        # takes q_vals and next q_vals and returns the differences for the batch; all of this only for the first pass

        q_vals = lasagne.layers.get_output(self.l_out, states / input_scale)
        if self.freeze_interval > 0:
            next_q_vals = lasagne.layers.get_output(self.next_l_out,
                                                    next_states / input_scale)
        else:
            next_q_vals = lasagne.layers.get_output(self.l_out,
                                                    next_states / input_scale)
            next_q_vals = theano.gradient.disconnected_grad(next_q_vals)

        target = (rewards + (T.ones_like(terminals) - terminals) *
                  self.discount * T.max(next_q_vals, axis=1, keepdims=True))
        diff = target - q_vals[T.arange(batch_size),
                               actions.reshape((-1, ))].reshape((-1, 1))

        # unclear
        if self.clip_delta > 0:
            diff = diff.clip(-self.clip_delta, self.clip_delta)

        if batch_accumulator == 'sum':
            loss = T.sum(diff**2)
        elif batch_accumulator == 'mean':
            loss = T.mean(diff**2)
        else:
            raise ValueError("Bad accumulator: {}".format(batch_accumulator))

#
        params = lasagne.layers.helper.get_all_params(self.l_out)
        givens = {
            states: self.states_shared,
            next_states: self.next_states_shared,
            rewards: self.rewards_shared,
            actions: self.actions_shared,
            terminals: self.terminals_shared
        }
        if update_rule == 'deepmind_rmsprop':
            updates = deepmind_rmsprop(loss, params, self.lr, self.rho,
                                       self.rms_epsilon)
        elif update_rule == 'rmsprop':
            updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho,
                                              self.rms_epsilon)

        elif update_rule == 'adam':
            updates = lasagne.updates.adam(loss, params, self.lr, self.rho,
                                           self.rho, self.rms_epsilon)

        elif update_rule == 'adagrad':
            updates = lasagne.updates.adagrad(loss, params, self.lr,
                                              self.rms_epsilon)
        elif update_rule == 'sgd':
            updates = lasagne.updates.sgd(loss, params, self.lr)

        elif update_rule == 'momentum':
            updates = lasagne.updates.momentum(loss, params, self.lr,
                                               self.momentum)

        else:
            raise ValueError("Unrecognized update: {}".format(update_rule))

        if self.momentum > 0:
            updates = lasagne.updates.apply_momentum(updates, None,
                                                     self.momentum)

        self._train = theano.function([], [loss, q_vals],
                                      updates=updates,
                                      givens=givens)
        self._q_vals = theano.function([],
                                       q_vals,
                                       givens={states: self.states_shared})
Example #5
    def __init__(self, input_width, input_height, avail_actions, num_actions,
                 num_frames, discount, learning_rate, rho,
                 rms_epsilon, momentum, clip_delta, freeze_interval,
                 batch_size, network_type, update_rule,
                 batch_accumulator, rng, train_all, input_scale=255.0):

        self.input_width = input_width
        self.input_height = input_height
        self.avail_actions = avail_actions
        self.num_actions = num_actions
        self.num_frames = num_frames
        self.batch_size = batch_size
        self.discount = discount
        self.rho = rho
        self.lr = learning_rate
        self.rms_epsilon = rms_epsilon
        self.momentum = momentum
        self.clip_delta = clip_delta
        self.freeze_interval = freeze_interval
        self.rng = rng
        self.train_all = train_all

        lasagne.random.set_rng(self.rng)

        self.update_counter = 0

        print "num_actions: " + str(num_actions)
        self.l_out = self.build_network(network_type, input_width, input_height,
                                        num_actions, num_frames, batch_size)
        if self.freeze_interval > 0:
            self.next_l_out = self.build_network(network_type, input_width,
                                                 input_height, num_actions,
                                                 num_frames, batch_size)
            self.reset_q_hat()

        states = T.tensor4('states')
        next_states = T.tensor4('next_states')
        rewards = T.col('rewards')
        actions = T.icol('actions')
        terminals = T.icol('terminals')

        self.states_shared = theano.shared(
            np.zeros((batch_size, num_frames, input_height, input_width),
                     dtype=theano.config.floatX))

        self.next_states_shared = theano.shared(
            np.zeros((batch_size, num_frames, input_height, input_width),
                     dtype=theano.config.floatX))

        self.rewards_shared = theano.shared(
            np.zeros((batch_size, 1), dtype=theano.config.floatX),
            broadcastable=(False, True))

        self.actions_shared = theano.shared(
            np.zeros((batch_size, 1), dtype='int32'),
            broadcastable=(False, True))

        self.terminals_shared = theano.shared(
            np.zeros((batch_size, 1), dtype='int32'),
            broadcastable=(False, True))

        q_vals = lasagne.layers.get_output(self.l_out, states / input_scale)

        if self.freeze_interval > 0:
            next_q_vals = lasagne.layers.get_output(self.next_l_out,
                                                    next_states / input_scale)
        else:
            next_q_vals = lasagne.layers.get_output(self.l_out,
                                                    next_states / input_scale)
            next_q_vals = theano.gradient.disconnected_grad(next_q_vals)

        target = (rewards +
                  (T.ones_like(terminals) - terminals) *
                  self.discount * T.max(next_q_vals, axis=1, keepdims=True))
        diff = target - q_vals[T.arange(batch_size),
                               actions.reshape((-1,))].reshape((-1, 1))

        if self.clip_delta > 0:
            # If we simply take the squared clipped diff as our loss,
            # then the gradient will be zero whenever the diff exceeds
            # the clip bounds. To avoid this, we extend the loss
            # linearly past the clip point to keep the gradient constant
            # in that regime.
            #
            # This is equivalent to declaring d loss/d q_vals to be
            # equal to the clipped diff, then backpropagating from
            # there, which is what the DeepMind implementation does.
            quadratic_part = T.minimum(abs(diff), self.clip_delta)
            linear_part = abs(diff) - quadratic_part
            loss = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part
        else:
            loss = 0.5 * diff ** 2
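        # Worked check: with clip_delta = 1 and |diff| = 3, quadratic_part = 1
        # and linear_part = 2, so loss = 0.5 + 2 = 2.5, which is exactly the
        # Huber loss with delta = 1; its gradient w.r.t. diff is capped at +/-1.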

        if batch_accumulator == 'sum':
            loss = T.sum(loss)
        elif batch_accumulator == 'mean':
            loss = T.mean(loss)
        else:
            raise ValueError("Bad accumulator: {}".format(batch_accumulator))

        params = lasagne.layers.helper.get_all_params(self.l_out)
        givens = {
            states: self.states_shared,
            next_states: self.next_states_shared,
            rewards: self.rewards_shared,
            actions: self.actions_shared,
            terminals: self.terminals_shared
        }
        if update_rule == 'deepmind_rmsprop':
            updates = deepmind_rmsprop(loss, params, self.lr, self.rho,
                                       self.rms_epsilon)
        elif update_rule == 'rmsprop':
            updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho,
                                              self.rms_epsilon)
        elif update_rule == 'sgd':
            updates = lasagne.updates.sgd(loss, params, self.lr)
        else:
            raise ValueError("Unrecognized update: {}".format(update_rule))

        if self.momentum > 0:
            updates = lasagne.updates.apply_momentum(updates, None,
                                                     self.momentum)

        self._train = theano.function([], [loss, q_vals], updates=updates,
                                      givens=givens)
        self._q_vals = theano.function([], q_vals,
                                       givens={states: self.states_shared})
Example #6
def main(game_name, network_type, updates_method,
         target_network_update_frequency, initial_epsilon, final_epsilon,
         test_epsilon, final_exploration_frame, replay_start_size,
         deepmind_rmsprop_epsilon, deepmind_rmsprop_learning_rate,
         deepmind_rmsprop_rho, rmsprop_epsilon, rmsprop_learning_rate,
         rmsprop_rho, phi_type, phi_method, epoch_size, n_training_epochs,
         n_test_epochs, visualize, record_dir, show_mood, replay_memory_size,
         no_replay, repeat_action, skip_n_frames_after_lol,
         max_actions_per_game, weights_dir, algo_initial_state_file,
         log_frequency, theano_verbose):
    args = locals()

    if theano_verbose:
        theano.config.compute_test_value = 'warn'
        theano.config.exception_verbosity = 'high'
        theano.config.optimizer = 'fast_compile'

    if game_name == 'simple_breakout':
        game = simple_breakout.SimpleBreakout()

        class P(object):
            def __init__(self):
                self.screen_size = (12, 12)

            def __call__(self, frames):
                return frames

        phi = P()
    else:
        ale = ag.init(game=game_name,
                      display_screen=(visualize == 'ale'),
                      record_dir=record_dir)
        game = ag.ALEGame(ale)
        if phi_type == '4':
            phi = ag.Phi4(method=phi_method)
        elif phi_type == '1':
            phi = ag.Phi(method=phi_method)
        else:
            raise RuntimeError("Unknown phi: {phi}".format(phi=phi_type))

    if network_type == 'nature':
        build_network = network.build_nature
    elif network_type == 'nature_with_pad':
        build_network = network.build_nature_with_pad
    elif network_type == 'nips':
        build_network = network.build_nips
    elif network_type == 'nature_with_pad_he':
        build_network = network.build_nature_with_pad_he
    elif hasattr(network_type, '__call__'):
        build_network = network_type
    else:
        raise RuntimeError(
            "Unknown network: {network}".format(network=network_type))

    if updates_method == 'deepmind_rmsprop':
        updates = \
            lambda loss, params: u.deepmind_rmsprop(loss, params,
                                                          learning_rate=deepmind_rmsprop_learning_rate,
                                                          rho=deepmind_rmsprop_rho,
                                                          epsilon=deepmind_rmsprop_epsilon)
    elif updates_method == 'rmsprop':
        updates = \
            lambda loss, params: lasagne.updates.rmsprop(loss, params,
                                                         learning_rate=rmsprop_learning_rate,
                                                         rho=rmsprop_rho,
                                                         epsilon=rmsprop_epsilon)
    else:
        raise RuntimeError(
            "Unknown updates: {updates}".format(updates=updates_method))

    replay_memory = dqn.ReplayMemory(
        size=replay_memory_size) if not no_replay else None
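    # Note: updates above is a (loss, params) -> update-dict factory, and
    # replay_memory is None when no_replay is set; both are handed to
    # dqn.DQNAlgo below.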

    def create_algo():
        algo = dqn.DQNAlgo(game.n_actions(),
                           replay_memory=replay_memory,
                           build_network=build_network,
                           updates=updates,
                           screen_size=phi.screen_size)

        algo.replay_start_size = replay_start_size
        algo.final_epsilon = final_epsilon
        algo.initial_epsilon = initial_epsilon

        algo.log_frequency = log_frequency
        algo.target_network_update_frequency = target_network_update_frequency
        algo.final_exploration_frame = final_exploration_frame
        return algo

    algo_train = create_algo()
    algo_test = create_algo()
    algo_test.final_epsilon = test_epsilon
    algo_test.initial_epsilon = test_epsilon
    algo_test.epsilon = test_epsilon

    import Queue
    algo_train.mood_q = Queue.Queue() if show_mood else None

    if show_mood is not None:
        import Queue
        algo_train.mood_q = Queue.Queue()
        if show_mood == 'plot':
            plot = Plot()
        elif show_mood == "log":
            plot = Log()
        else:
            raise RuntimeError("Unknown show_mood: {}".format(show_mood))

        def worker():
            while True:
                item = algo_train.mood_q.get()
                plot.show(item)
                algo_train.mood_q.task_done()

        import threading
        t = threading.Thread(target=worker)
        t.daemon = True
        t.start()
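        # The worker thread above simply drains mood_q and hands each item to
        # the plot/log sink; daemon=True keeps it from blocking interpreter
        # exit once training is done.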

    print(str(algo_train))

    if visualize != 'q':
        visualizer = q.GameNoVisualizer()
    else:
        if game_name == 'simple_breakout':
            visualizer = simple_breakout.SimpleBreakoutVisualizer(algo_train)
        else:
            visualizer = ag.ALEGameVisualizer(phi.screen_size)

    teacher = q.Teacher(game=game,
                        algo=algo_train,
                        game_visualizer=visualizer,
                        phi=phi,
                        repeat_action=repeat_action,
                        max_actions_per_game=max_actions_per_game,
                        skip_n_frames_after_lol=skip_n_frames_after_lol,
                        tester=False)

    tester = q.Teacher(game=game,
                       algo=algo_test,
                       game_visualizer=visualizer,
                       phi=phi,
                       repeat_action=repeat_action,
                       max_actions_per_game=max_actions_per_game,
                       skip_n_frames_after_lol=skip_n_frames_after_lol,
                       tester=True)

    q.teach_and_test(teacher,
                     tester,
                     n_epochs=n_training_epochs,
                     frames_to_test_on=n_test_epochs * epoch_size,
                     epoch_size=epoch_size,
                     state_dir=weights_dir,
                     algo_initial_state_file=algo_initial_state_file)
Example #7
    def __init__(self, input_width, input_height, num_actions,
                 num_frames, discount, learning_rate, rho,
                 rms_epsilon, momentum, clip_delta, freeze_interval,
                 batch_size, network_type, update_rule,
                 batch_accumulator, rng, eta,
                 params_share=True, double_learning=False,
                 annealing=False, temp=1.0, input_scale=255.0):

        self.input_width = input_width
        self.input_height = input_height
        self.num_actions = num_actions
        self.num_frames = num_frames
        self.batch_size = batch_size
        self.discount = discount
        self.rho = rho
        self.lr = learning_rate
        self.rms_epsilon = rms_epsilon
        self.momentum = momentum
        self.clip_delta = clip_delta
        self.freeze_interval = freeze_interval
        self.rng = rng
        self.eta = eta
        self.params_share = params_share
        self.double_learning = double_learning
        self.annealing = annealing
        self.temp0 = temp

        lasagne.random.set_rng(self.rng)

        self.update_counter = 0

        self.l_out, self.l_feature, self.l_init = self.build_network(network_type, input_width, input_height,
                                        num_actions, num_frames, batch_size)

        if self.freeze_interval > 0:
            self.next_l_out, self.next_l_feature, self.next_l_init = self.build_network(network_type, input_width,
                                                 input_height, num_actions,
                                                 num_frames, batch_size)
            self.reset_q_hat_share()

        states = T.tensor4('states')
        next_states = T.tensor4('next_states')
        rewards = T.col('rewards')
        actions = T.icol('actions')
        terminals = T.icol('terminals')
        exp_temp = T.scalar('exploration tuning')

        # Shared variables for training from a minibatch of replayed
        # state transitions, each consisting of num_frames + 1 (due to
        # overlap) images, along with the chosen action and resulting
        # reward and terminal status.
        self.imgs_shared = theano.shared(
            np.zeros((batch_size, num_frames + 1, input_height, input_width),
                     dtype=theano.config.floatX))
        self.rewards_shared = theano.shared(
            np.zeros((batch_size, 1), dtype=theano.config.floatX),
            broadcastable=(False, True))
        self.actions_shared = theano.shared(
            np.zeros((batch_size, 1), dtype='int32'),
            broadcastable=(False, True))
        self.terminals_shared = theano.shared(
            np.zeros((batch_size, 1), dtype='int32'),
            broadcastable=(False, True))
        self.exp_temp_shared = theano.shared(np.float32(self.temp0))  # default without annealing

        # Shared variable for a single state, to calculate q_vals.
        self.state_shared = theano.shared(
            np.zeros((num_frames, input_height, input_width),
                     dtype=theano.config.floatX))

        q_vals = lasagne.layers.get_output(self.l_out, states / input_scale)
        feature_vals = lasagne.layers.get_output(self.l_feature, states / input_scale)
        q_params = lasagne.layers.get_all_params(self.l_out)
        q_params_vals = lasagne.layers.get_all_param_values(self.l_out)
        if self.params_share:
            w_pi = q_params[-2]
            b_pi = q_params[-1]
        else:
            params_init = lasagne.layers.get_all_param_values(self.l_init)
            w_pi = theano.shared(params_init[-2])
            b_pi = theano.shared(params_init[-1])

        pi_vals = T.nnet.softmax(exp_temp * (T.dot(feature_vals, w_pi) + b_pi))
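        # pi_vals is a softmax (Boltzmann) policy over the last feature layer:
        #     pi(a | s) ~ exp(exp_temp * (phi(s) . w_pi + b_pi)),
        # where exp_temp is the exploration temperature fed in through
        # exp_temp_shared and w_pi/b_pi are either tied to the Q output
        # layer's parameters (params_share=True) or a separate copy.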
        
        if self.freeze_interval > 0:
            next_q_vals = lasagne.layers.get_output(self.next_l_out,
                                                    next_states / input_scale)
            if self.double_learning:
                next_feature_vals = lasagne.layers.get_output(self.l_feature,
                                                              next_states / input_scale)
                next_q_params = lasagne.layers.get_all_params(self.l_out)
                next_q_params_vals = lasagne.layers.get_all_param_values(self.l_out)
                if self.params_share:
                    next_w_pi = next_q_params[-2]
                    next_b_pi = next_q_params[-1]
                else:
                    next_params_init = lasagne.layers.get_all_param_values(self.l_init)
                    next_w_pi = theano.shared(next_params_init[-2])
                    next_b_pi = theano.shared(next_params_init[-1])
                next_pi_vals = T.nnet.softmax(exp_temp * (T.dot(next_feature_vals, next_w_pi) + next_b_pi))
                next_pi_vals = theano.gradient.disconnected_grad(next_pi_vals)
            else:
                next_feature_vals = lasagne.layers.get_output(self.next_l_feature,
                                                              next_states / input_scale)
                next_q_params = lasagne.layers.get_all_params(self.next_l_out)
                next_q_params_vals = lasagne.layers.get_all_param_values(self.next_l_out)
                if self.params_share:
                    next_w_pi = next_q_params[-2]
                    next_b_pi = next_q_params[-1]
                else:
                    next_params_init = lasagne.layers.get_all_param_values(self.next_l_init)
                    next_w_pi = theano.shared(next_params_init[-2])
                    next_b_pi = theano.shared(next_params_init[-1])

                next_pi_vals = T.nnet.softmax(exp_temp * (T.dot(next_feature_vals, next_w_pi) + next_b_pi))
        else:
            next_q_vals = lasagne.layers.get_output(self.l_out,
                                                    next_states / input_scale)
            next_q_vals = theano.gradient.disconnected_grad(next_q_vals)

        terminalsX = terminals.astype(theano.config.floatX)
        actionmask = T.eq(T.arange(num_actions).reshape((1, -1)),
                          actions.reshape((-1, 1))).astype(theano.config.floatX)

        target = (rewards + (T.ones_like(terminalsX) - terminalsX) *
                 self.discount * T.sum(next_q_vals * next_pi_vals, axis=1, keepdims=True))
        output = (q_vals * actionmask).sum(axis=1).reshape((-1, 1))
        diff = target - output

        if self.clip_delta > 0:
            # If we simply take the squared clipped diff as our loss,
            # then the gradient will be zero whenever the diff exceeds
            # the clip bounds. To avoid this, we extend the loss
            # linearly past the clip point to keep the gradient constant
            # in that regime.
            # 
            # This is equivalent to declaring d loss/d q_vals to be
            # equal to the clipped diff, then backpropagating from
            # there, which is what the DeepMind implementation does.
            quadratic_part = T.minimum(abs(diff), self.clip_delta)
            linear_part = abs(diff) - quadratic_part
            loss = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part
        else:
            loss = 0.5 * diff ** 2

        if batch_accumulator == 'sum':
            loss = T.sum(loss)
        elif batch_accumulator == 'mean':
            loss = T.mean(loss)
        else:
            raise ValueError("Bad accumulator: {}".format(batch_accumulator))

        if self.params_share:
            params = lasagne.layers.helper.get_all_params(self.l_out)
        else:
            params = lasagne.layers.helper.get_all_params(self.l_out)
            params.append(next_w_pi)
            params.append(next_b_pi)

        train_givens = {
            states: self.imgs_shared[:, :-1],
            next_states: self.imgs_shared[:, 1:],
            rewards: self.rewards_shared,
            actions: self.actions_shared,
            terminals: self.terminals_shared,
            exp_temp: self.exp_temp_shared
        }
        if update_rule == 'deepmind_rmsprop':
            updates = deepmind_rmsprop(loss, params, self.lr, self.rho,
                                       self.rms_epsilon)
        elif update_rule == 'rmsprop':
            updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho,
                                              self.rms_epsilon)
        elif update_rule == 'sgd':
            updates = lasagne.updates.sgd(loss, params, self.lr)
        else:
            raise ValueError("Unrecognized update: {}".format(update_rule))

        if self.momentum > 0:
            updates = lasagne.updates.apply_momentum(updates, None,
                                                     self.momentum)

        self._train = theano.function([], [loss], updates=updates,
                                      givens=train_givens)
        q_givens = {
            states: self.state_shared.reshape((1,
                                               self.num_frames,
                                               self.input_height,
                                               self.input_width))
        }

        pi_givens = {
            states: self.state_shared.reshape((1,
                                               self.num_frames,
                                               self.input_height,
                                               self.input_width)),
            exp_temp: self.exp_temp_shared
        }

        self._q_vals = theano.function([], q_vals[0], givens=q_givens)
        self._pi_vals = theano.function([], pi_vals[0], givens=pi_givens)

        grad_fc_w = T.grad(loss, self.l_out.W)
        self._grad = theano.function([], outputs=grad_fc_w,
                                     givens=train_givens)
Example #8
    def __init__(self, input_width, input_height, num_channels, num_actions,
                 num_frames, discount, learning_rate, rho,
                 rms_epsilon, momentum, clip_delta, freeze_interval,
                 batch_size, network_type, update_rule,
                 batch_accumulator, rng, network_params, input_scale=255.0):

        self.input_width = input_width
        self.input_height = input_height
        self.num_channels = num_channels
        self.num_actions = num_actions
        self.num_frames = num_frames
        self.batch_size = batch_size
        self.discount = discount
        self.rho = rho
        self.lr = learning_rate
        self.rms_epsilon = rms_epsilon
        self.momentum = momentum
        self.clip_delta = clip_delta
        self.freeze_interval = freeze_interval
        self.rng = rng

        self.lstm = None
        self.next_lstm = None

        logging.debug('network parameters: %s', network_params)
        self.network_params = network_params

        lasagne.random.set_rng(self.rng)

        self.update_counter = 0

        networks = self.build_network(network_type, num_channels, input_width, input_height,
                                        num_actions, num_frames, None)
        if isinstance(networks, tuple):
            self.l_out = networks[0]
            self.lstm = networks[1]
        else:
            self.l_out = networks

        # theano.compile.function_dump('network.dump', self.l_out)
        if self.freeze_interval > 0:
            next_networks = self.build_network(network_type, num_channels, input_width,
                                                 input_height, num_actions,
                                                 num_frames, None)

            if isinstance(next_networks, tuple):
                self.next_l_out = next_networks[0]
                self.next_lstm = next_networks[1]
            else:
                self.next_l_out = next_networks

            self.reset_q_hat()

        # These really need to be floats for now; that makes sense if they
        # are used directly in the computations.
        btensor5 = T.TensorType(theano.config.floatX, (False,) * 5)
        states = btensor5('states')
        next_states = btensor5('next_states')
        rewards = T.col('rewards')
        actions = T.icol('actions')
        terminals = T.icol('terminals')

        # Apparently needed for some layers with a variable input size
        # Weird, because the others just allow a None batch size,
        # but let's just play safe for now
        # For now, it should always look exactly like states
        # (n_batch, n_time_steps)
        # mask = T.imatrix('mask')

        self.states_shared = theano.shared(
            np.zeros((batch_size, num_frames, num_channels, input_height, input_width),
                     dtype=theano.config.floatX), name='states')

        self.next_states_shared = theano.shared(
            np.zeros((batch_size, num_frames, num_channels, input_height, input_width),
                     dtype=theano.config.floatX), name='next_states')

        self.rewards_shared = theano.shared(
            np.zeros((batch_size, 1), dtype=theano.config.floatX),
            broadcastable=(False, True), name='rewards')

        self.actions_shared = theano.shared(
            np.zeros((batch_size, 1), dtype='int32'),
            broadcastable=(False, True), name='actions')

        self.terminals_shared = theano.shared(
            np.zeros((batch_size, 1), dtype='int32'),
            broadcastable=(False, True))

        # self.mask_shared = theano.shared(np.ones((batch_size, num_frames),
        #     dtype='int32'))

        # lstmout = lasagne.layers.get_output(self.lstm, states / input_scale)

        q_vals = lasagne.layers.get_output(self.l_out, states / input_scale)
                # mask_input=mask)

        if self.freeze_interval > 0:
            next_q_vals = lasagne.layers.get_output(self.next_l_out,
                                                    next_states / input_scale
                                                    )
                                                    # mask_input=mask)
        else:
            next_q_vals = lasagne.layers.get_output(self.l_out,
                                                    next_states / input_scale
                                                    )
                                                    # mask_input=mask)
            next_q_vals = theano.gradient.disconnected_grad(next_q_vals)

        target = (rewards +
                  (T.ones_like(terminals) - terminals) *
                  self.discount * T.max(next_q_vals, axis=1, keepdims=True))
        diff = target - q_vals[T.arange(target.shape[0]),
                               actions.reshape((-1,))].reshape((-1, 1))

        if self.clip_delta > 0:
            # If we simply take the squared clipped diff as our loss,
            # then the gradient will be zero whenever the diff exceeds
            # the clip bounds. To avoid this, we extend the loss
            # linearly past the clip point to keep the gradient constant
            # in that regime.
            #
            # This is equivalent to declaring d loss/d q_vals to be
            # equal to the clipped diff, then backpropagating from
            # there, which is what the DeepMind implementation does.
            quadratic_part = T.minimum(abs(diff), self.clip_delta)
            linear_part = abs(diff) - quadratic_part
            loss = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part
        else:
            loss = 0.5 * diff ** 2

        if batch_accumulator == 'sum':
            loss = T.sum(loss)
        elif batch_accumulator == 'mean':
            loss = T.mean(loss)
        else:
            raise ValueError("Bad accumulator: {}".format(batch_accumulator))

        params = lasagne.layers.helper.get_all_params(self.l_out)
        # print params
        givens = {
            states: self.states_shared,
            next_states: self.next_states_shared,
            rewards: self.rewards_shared,
            actions: self.actions_shared,
            terminals: self.terminals_shared
        }
        if update_rule == 'deepmind_rmsprop':
            update_for = lambda params: deepmind_rmsprop(loss, params, self.lr, self.rho,
                                       self.rms_epsilon)
        elif update_rule == 'rmsprop':
            update_for = lambda params: lasagne.updates.rmsprop(loss, params, self.lr, self.rho,
                                              self.rms_epsilon)
        elif update_rule == 'sgd':
            update_for = lambda params: lasagne.updates.sgd(loss, params, self.lr)
        else:
            raise ValueError("Unrecognized update: {}".format(update_rule))

        updates = update_for(params)

        if self.momentum > 0:
            updates = lasagne.updates.apply_momentum(updates, None,
                                                     self.momentum)

        # # Super mega shady stuff
        # # Somehow an update sneaks in for cell and hid. Kill it with fire
        if self.lstm:
            delete_keys = [k for k, v in updates.items() if k.name in ['cell', 'hid']]
            # print delete_keys
            for key in delete_keys:
                del updates[key]

        self._train = theano.function([], [loss, q_vals], updates=updates,
                                      givens=givens)
        self._q_vals = theano.function([], q_vals,
                                       givens={states: self.states_shared})
Example #9
    def __init__(self, input_width, input_height, num_actions,
                 num_frames, discount, learning_rate, rho,
                 rms_epsilon, momentum, clip_delta, freeze_interval,
                 use_double, batch_size, network_type, update_rule,
                 batch_accumulator, rng, input_scale=255.0):

        self.input_width = input_width
        self.input_height = input_height
        self.num_actions = num_actions
        self.num_frames = num_frames
        self.batch_size = batch_size
        self.discount = discount
        self.rho = rho
        self.lr = learning_rate
        self.rms_epsilon = rms_epsilon
        self.momentum = momentum
        self.clip_delta = clip_delta
        self.freeze_interval = freeze_interval
        self.use_double = use_double
        self.rng = rng

        # Using Double DQN is pointless without periodic freezing
        if self.use_double:
            assert self.freeze_interval > 0
            # pass

        lasagne.random.set_rng(self.rng)

        self.update_counter = 0

        self.l_out = self.build_network(network_type, input_width, input_height,
                                        num_actions, num_frames, batch_size)
        if self.freeze_interval > 0:
            self.next_l_out = self.build_network(network_type, input_width,
                                                 input_height, num_actions,
                                                 num_frames, batch_size)
            self.reset_q_hat()

        states = T.tensor4('states')
        next_states = T.tensor4('next_states')
        rewards = T.col('rewards')
        actions = T.icol('actions')
        terminals = T.icol('terminals')

        self.states_shared = theano.shared(
            np.zeros((batch_size, num_frames, input_height, input_width),
                     dtype=theano.config.floatX))

        self.next_states_shared = theano.shared(
            np.zeros((batch_size, num_frames, input_height, input_width),
                     dtype=theano.config.floatX))

        self.rewards_shared = theano.shared(
            np.zeros((batch_size, 1), dtype=theano.config.floatX),
            broadcastable=(False, True))

        self.actions_shared = theano.shared(
            np.zeros((batch_size, 1), dtype='int32'),
            broadcastable=(False, True))

        self.terminals_shared = theano.shared(
            np.zeros((batch_size, 1), dtype='int32'),
            broadcastable=(False, True))

        q_vals = lasagne.layers.get_output(self.l_out, states / input_scale)
        
        if self.freeze_interval > 0:
            # Nature. If using periodic freezing
            next_q_vals = lasagne.layers.get_output(self.next_l_out,
                                                    next_states / input_scale)
        else:
            # NIPS
            next_q_vals = lasagne.layers.get_output(self.l_out,
                                                    next_states / input_scale)
            next_q_vals = theano.gradient.disconnected_grad(next_q_vals)

        if self.use_double:
            # Double DQN: select the greedy action with the online network on
            # the next states, then evaluate it with the frozen target network.
            next_q_vals_online = lasagne.layers.get_output(
                self.l_out, next_states / input_scale)
            maxaction = T.argmax(next_q_vals_online, axis=1)
            temptargets = next_q_vals[T.arange(batch_size),
                                      maxaction].reshape((-1, 1))
            target = (rewards +
                      (T.ones_like(terminals) - terminals) *
                      self.discount * temptargets)
        else:
            target = (rewards +
                      (T.ones_like(terminals) - terminals) *
                      self.discount * T.max(next_q_vals, axis=1, keepdims=True))
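        # i.e. target = r + (1 - terminal) * discount *
        #      Q_target(s', argmax_a Q_online(s', a))  when use_double is set,
        # and the standard max over the target network's Q-values otherwise.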
        diff = target - q_vals[T.arange(batch_size),
                               actions.reshape((-1,))].reshape((-1, 1))

        if self.clip_delta > 0:
            # If we simply take the squared clipped diff as our loss,
            # then the gradient will be zero whenever the diff exceeds
            # the clip bounds. To avoid this, we extend the loss
            # linearly past the clip point to keep the gradient constant
            # in that regime.
            # 
            # This is equivalent to declaring d loss/d q_vals to be
            # equal to the clipped diff, then backpropagating from
            # there, which is what the DeepMind implementation does.
            quadratic_part = T.minimum(abs(diff), self.clip_delta)
            linear_part = abs(diff) - quadratic_part
            loss = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part
        else:
            loss = 0.5 * diff ** 2

        if batch_accumulator == 'sum':
            loss = T.sum(loss)
        elif batch_accumulator == 'mean':
            loss = T.mean(loss)
        else:
            raise ValueError("Bad accumulator: {}".format(batch_accumulator))

        params = lasagne.layers.helper.get_all_params(self.l_out)  
        givens = {
            states: self.states_shared,
            next_states: self.next_states_shared,
            rewards: self.rewards_shared,
            actions: self.actions_shared,
            terminals: self.terminals_shared
        }
        if update_rule == 'deepmind_rmsprop':
            updates = deepmind_rmsprop(loss, params, self.lr, self.rho,
                                       self.rms_epsilon)
        elif update_rule == 'rmsprop':
            updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho,
                                              self.rms_epsilon)
        elif update_rule == 'sgd':
            updates = lasagne.updates.sgd(loss, params, self.lr)
        else:
            raise ValueError("Unrecognized update: {}".format(update_rule))

        if self.momentum > 0:
            updates = lasagne.updates.apply_momentum(updates, None,
                                                     self.momentum)

        def inspect_inputs(i, node, fn):
            if ('maxand' not in str(node).lower() and '12345' not in str(node)):
                return
            print i, node, "input(s) value(s):", [input[0] for input in fn.inputs],
            raw_input('press enter')

        def inspect_outputs(i, node, fn):
            if ('maxand' not in str(node).lower() and '12345' not in str(node)):
                return
            if '12345' in str(node):
                print "output(s) value(s):", [np.asarray(output[0]) for output in fn.outputs]
            else:
                print "output(s) value(s):", [output[0] for output in fn.outputs]
            raw_input('press enter')

        if False:
            self._train = theano.function([], [loss, q_vals], updates=updates,
                                          givens=givens, mode=theano.compile.MonitorMode(
                            pre_func=inspect_inputs,
                            post_func=inspect_outputs))
            theano.printing.debugprint(target)
        else:
            self._train = theano.function([], [loss, q_vals], updates=updates,
                                          givens=givens)
        if False:
            self._q_vals = theano.function([], q_vals,
                                           givens={states: self.states_shared}, mode=theano.compile.MonitorMode(
                            pre_func=inspect_inputs,
                            post_func=inspect_outputs))
        else:
            self._q_vals = theano.function([], q_vals,
                                           givens={states: self.states_shared})
Example #10
def main(game_name, network_type, updates_method,
         target_network_update_frequency,
         initial_epsilon, final_epsilon, test_epsilon, final_exploration_frame, replay_start_size,
         deepmind_rmsprop_epsilon, deepmind_rmsprop_learning_rate, deepmind_rmsprop_rho,
         rmsprop_epsilon, rmsprop_learning_rate, rmsprop_rho,
         phi_type, phi_method,
         epoch_size, n_training_epochs, n_test_epochs,
         visualize, record_dir, show_mood,
         replay_memory_size, no_replay,
         repeat_action, skip_n_frames_after_lol, max_actions_per_game,
         weights_dir, algo_initial_state_file,
         log_frequency, theano_verbose):
    args = locals()

    if theano_verbose:
        theano.config.compute_test_value = 'warn'
        theano.config.exception_verbosity = 'high'
        theano.config.optimizer = 'fast_compile'

    if game_name == 'simple_breakout':
        game = simple_breakout.SimpleBreakout()
        class P(object):
            def __init__(self):
                self.screen_size = (12, 12)

            def __call__(self, frames):
                return frames
        phi = P()
    else:
        ale = ag.init(game=game_name, display_screen=(visualize == 'ale'), record_dir=record_dir)
        game = ag.ALEGame(ale)
        if phi_type == '4':
            phi = ag.Phi4(method=phi_method)
        elif phi_type == '1':
            phi = ag.Phi(method=phi_method)
        else:
            raise RuntimeError("Unknown phi: {phi}".format(phi=phi_type))

    if network_type == 'nature':
        build_network = network.build_nature
    elif network_type == 'nature_with_pad':
        build_network = network.build_nature_with_pad
    elif network_type == 'nips':
        build_network = network.build_nips
    elif network_type == 'nature_with_pad_he':
        build_network = network.build_nature_with_pad_he
    elif hasattr(network_type, '__call__'):
        build_network = network_type
    else:
        raise RuntimeError("Unknown network: {network}".format(network=network_type))


    if updates_method == 'deepmind_rmsprop':
        updates = \
            lambda loss, params: u.deepmind_rmsprop(loss, params,
                                                          learning_rate=deepmind_rmsprop_learning_rate,
                                                          rho=deepmind_rmsprop_rho,
                                                          epsilon=deepmind_rmsprop_epsilon)
    elif updates_method == 'rmsprop':
        updates = \
            lambda loss, params: lasagne.updates.rmsprop(loss, params,
                                                         learning_rate=rmsprop_learning_rate,
                                                         rho=rmsprop_rho,
                                                         epsilon=rmsprop_epsilon)
    else:
        raise RuntimeError("Unknown updates: {updates}".format(updates=updates_method))

    replay_memory = dqn.ReplayMemory(size=replay_memory_size) if not no_replay else None

    def create_algo():
        algo = dqn.DQNAlgo(game.n_actions(),
                               replay_memory=replay_memory,
                               build_network=build_network,
                               updates=updates,
                               screen_size=phi.screen_size)

        algo.replay_start_size = replay_start_size
        algo.final_epsilon = final_epsilon
        algo.initial_epsilon = initial_epsilon

        algo.log_frequency = log_frequency
        algo.target_network_update_frequency = target_network_update_frequency
        algo.final_exploration_frame = final_exploration_frame
        return algo

    algo_train = create_algo()
    algo_test = create_algo()
    algo_test.final_epsilon = test_epsilon
    algo_test.initial_epsilon = test_epsilon
    algo_test.epsilon = test_epsilon


    import Queue
    algo_train.mood_q = Queue.Queue() if show_mood else None

    if show_mood is not None:
        if show_mood == 'plot':
            plot = Plot()
        elif show_mood == "log":
            plot = Log()
        else:
            raise RuntimeError("Unknown show_mood: {show_mood}".format(show_mood=show_mood))

        def worker():
            while True:
                item = algo_train.mood_q.get()
                plot.show(item)
                algo_train.mood_q.task_done()

        import threading
        t = threading.Thread(target=worker)
        t.daemon = True
        t.start()

    print(str(algo_train))

    if visualize != 'q':
        visualizer = q.GameNoVisualizer()
    else:
        if game_name == 'simple_breakout':
            visualizer = simple_breakout.SimpleBreakoutVisualizer(algo_train)
        else:
            visualizer = ag.ALEGameVisualizer(phi.screen_size)

    teacher = q.Teacher(game=game,
                        algo=algo_train,
                        game_visualizer=visualizer,
                        phi=phi,
                        repeat_action=repeat_action,
                        max_actions_per_game=max_actions_per_game,
                        skip_n_frames_after_lol=skip_n_frames_after_lol,
                        tester=False)

    tester = q.Teacher(game=game,
                        algo=algo_test,
                        game_visualizer=visualizer,
                        phi=phi,
                        repeat_action=repeat_action,
                        max_actions_per_game=max_actions_per_game,
                        skip_n_frames_after_lol=skip_n_frames_after_lol,
                        tester=True)

    q.teach_and_test(teacher, tester, n_epochs=n_training_epochs,
                     frames_to_test_on=n_test_epochs * epoch_size,
                     epoch_size=epoch_size,
                     state_dir=weights_dir,
                     algo_initial_state_file=algo_initial_state_file)
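main() only stores initial_epsilon, final_epsilon and final_exploration_frame on the algo objects; the schedule DQNAlgo derives from them is not shown here. A common choice, and only an assumption about this code, is linear annealing, sketched with a hypothetical helper:

def annealed_epsilon(frame, initial_epsilon, final_epsilon, final_exploration_frame):
    # Decay linearly from initial_epsilon to final_epsilon over the first
    # final_exploration_frame frames, then hold final_epsilon (sketch only).
    if frame >= final_exploration_frame:
        return final_epsilon
    frac = float(frame) / final_exploration_frame
    return initial_epsilon + frac * (final_epsilon - initial_epsilon)

# e.g. annealed_epsilon(500000, 1.0, 0.1, 1000000) -> 0.55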
예제 #11
0
    def __init__(self, input_width, input_height, num_actions,
                 num_frames, discount, learning_rate, rho,
                 rms_epsilon, momentum, clip_delta, freeze_interval,
                 batch_size, network_type, update_rule,
                 batch_accumulator, rng, input_scale=255.0):

        self.input_width = input_width
        self.input_height = input_height
        self.num_actions = num_actions
        self.num_frames = num_frames
        self.batch_size = batch_size
        self.discount = discount
        self.rho = rho
        self.lr = learning_rate
        self.rms_epsilon = rms_epsilon
        self.momentum = momentum
        self.clip_delta = clip_delta
        self.freeze_interval = freeze_interval
        self.rng = rng

        lasagne.random.set_rng(self.rng)

        self.update_counter = 0

        self.l_out = self.build_network( network_type, input_width, input_height,
                                         num_actions, num_frames, batch_size )
        if self.freeze_interval > 0:
            self.next_l_out = self.build_network( network_type, input_width, input_height,
                                                  num_actions, num_frames, batch_size )
            self.reset_q_hat( )

        states, next_states = T.tensor4( 'states' ), T.tensor4( 'next_states' )
        actions, rewards = T.icol( 'actions' ), T.col( 'rewards' )
        terminals = T.icol( 'terminals' )

        self.states_shared = theano.shared( np.zeros( ( batch_size, num_frames, input_height, input_width ),
                                                      dtype = theano.config.floatX ) )
        self.next_states_shared = theano.shared( np.zeros( ( batch_size, num_frames, input_height, input_width ),
                                                           dtype = theano.config.floatX ) )
        self.rewards_shared = theano.shared( np.zeros( ( batch_size, 1 ), dtype = theano.config.floatX ),
                                             broadcastable = ( False, True ) )
        self.actions_shared = theano.shared( np.zeros( ( batch_size, 1 ), dtype = 'int32' ),
                                             broadcastable = ( False, True ) )
        self.terminals_shared = theano.shared( np.zeros( ( batch_size, 1 ), dtype = 'int32' ),
                                               broadcastable = ( False, True ) )
## Get learned Q-values
        q_vals_test = lasagne.layers.get_output( self.l_out, states / input_scale, deterministic = True )
        # q_vals_test = theano.gradient.disconnected_grad( q_vals_test )

        q_vals_train = lasagne.layers.get_output( self.l_out, states / input_scale, deterministic = False )
        
        if self.freeze_interval > 0:
            target_q_vals = lasagne.layers.get_output( self.next_l_out,
                                                       next_states / input_scale, deterministic = True)
        else:
            target_q_vals = lasagne.layers.get_output( self.l_out,
                                                       next_states / input_scale, deterministic = True)
            target_q_vals = theano.gradient.disconnected_grad( target_q_vals )
## The target depends on the received rewards and the discounted future
##   reward stream for the given action in the current state.
        target = ( rewards + ( T.ones_like( terminals ) - terminals ) *
                             self.discount * T.max( target_q_vals, axis = 1, keepdims = True ) )
##  target - b x 1, where b is batch size.
##  q_vals - b x A, where A is the number of outputs of the Q-net
## Theano differentiates indexed (and reduced) arrays in a clever manner:
##  it sets all left out gradients to zero. THIS IS CORRECT!
## \nabla_\theta diff = - 1_{a = a_j} \nabla Q( s, a_j, \theta) \,.
        diff = target - q_vals_train[ T.arange( batch_size ), actions.reshape( ( -1, ) ) ].reshape( ( -1, 1 ) )

        if self.clip_delta > 0:
            # If we simply take the squared clipped diff as our loss,
            # then the gradient will be zero whenever the diff exceeds
            # the clip bounds. To avoid this, we extend the loss
            # linearly past the clip point to keep the gradient constant
            # in that regime.
            # 
            # This is equivalent to declaring d loss/d q_vals to be
            # equal to the clipped diff, then backpropagating from
            # there, which is what the DeepMind implementation does.
            quadratic_part = T.minimum(abs(diff), self.clip_delta)
            linear_part = abs(diff) - quadratic_part
            loss = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part
        else:
            loss = 0.5 * diff ** 2

        if batch_accumulator == 'sum':
            loss = T.sum(loss)
        elif batch_accumulator == 'mean':
            loss = T.mean(loss)
        else:
            raise ValueError("Bad accumulator: {}".format(batch_accumulator))

        params = lasagne.layers.helper.get_all_params(self.l_out)  
        givens = {
            states: self.states_shared,
            next_states: self.next_states_shared,
            rewards: self.rewards_shared,
            actions: self.actions_shared,
            terminals: self.terminals_shared
        }
        if update_rule == 'deepmind_rmsprop':
            updates = deepmind_rmsprop(loss, params, self.lr, self.rho,
                                       self.rms_epsilon)
        elif update_rule == 'rmsprop':
            updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho,
                                              self.rms_epsilon)
        elif update_rule == 'sgd':
            updates = lasagne.updates.sgd(loss, params, self.lr)
        else:
            raise ValueError("Unrecognized update: {}".format(update_rule))

        if self.momentum > 0:
            updates = lasagne.updates.apply_momentum(updates, None, self.momentum)

        self._train = theano.function([], loss, updates=updates, givens=givens)
        self._q_vals = theano.function([], q_vals_test, givens={states: self.states_shared})
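The comment inside the clip_delta branch above argues that extending the loss linearly past the clip point keeps d loss / d diff equal to the clipped diff. A standalone NumPy sketch of that construction (independent of this class) makes the property easy to check:

import numpy as np

def clipped_loss_and_grad(diff, clip_delta):
    # Same piecewise loss as above: quadratic inside the clip range, linear outside.
    quadratic_part = np.minimum(np.abs(diff), clip_delta)
    linear_part = np.abs(diff) - quadratic_part
    loss = 0.5 * quadratic_part ** 2 + clip_delta * linear_part
    grad = np.clip(diff, -clip_delta, clip_delta)  # d loss / d diff
    return loss, grad

diffs = np.array([-3.0, -0.5, 0.2, 2.0])
loss, grad = clipped_loss_and_grad(diffs, 1.0)
# loss -> [2.5, 0.125, 0.02, 1.5]; grad -> [-1.0, -0.5, 0.2, 1.0]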
예제 #12
0
    def __init__(self,
                 model,
                 slen,
                 gamma=0.995,
                 n_hidden1=25,
                 n_hidden2=25,
                 learning_rate=0.0002,
                 freeze_interval=1000,
                 momentum=0.0,
                 learner_type="DDQN",
                 minibatch_size=20,
                 train_interval=100):
        self.freeze_interval = freeze_interval
        self.freeze_counter = 0
        self.slen = slen
        #train data
        self.minibatch_size = minibatch_size
        self.train_interval = train_interval
        self.train_set_x = theano.shared(numpy.zeros(
            [minibatch_size * train_interval, slen],
            dtype=theano.config.floatX),
                                         borrow=True)
        self.train_set_y = theano.shared(numpy.zeros(
            [minibatch_size * train_interval, slen],
            dtype=theano.config.floatX),
                                         borrow=True)
        #variables
        self.index = T.lscalar()
        self.s = T.matrix('s')  # the data is presented as rasterized images
        self.sp = T.matrix('sp')  #s prime
        self.rng = numpy.random.RandomState(None)
        if learner_type == "DDQN":
            self.classifier = MLP_DDQN(rng=self.rng,
                                       input1=self.s,
                                       input2=self.sp,
                                       n_in=slen,
                                       n_hidden1=n_hidden1,
                                       n_hidden2=n_hidden2,
                                       n_out=3,
                                       model=model,
                                       gamma=gamma)
        elif learner_type == "DQN":
            self.classifier = MLP_DQN(rng=self.rng,
                                      input1=self.s,
                                      input2=self.sp,
                                      n_in=slen,
                                      n_hidden1=n_hidden1,
                                      n_hidden2=n_hidden2,
                                      n_out=3,
                                      model=model,
                                      gamma=gamma)
        self.cost_v = self.classifier.cost_v
        self.cost = self.classifier.cost
        self.rmsprop = RMSProp(self.classifier.params)
        self.gparams = [
            T.grad(self.cost, param) for param in self.classifier.params
        ]
        #        self.updates_no_m = self.rmsprop.updates(self.classifier.params,self.gparams,learning_rate,0.0)
        #        self.updates = self.rmsprop.updates(self.classifier.params,self.gparams,learning_rate,momentum)
        self.updates = deepmind_rmsprop(self.gparams, self.classifier.params,
                                        learning_rate, momentum, 1e-4)
        self.model = (self.classifier.Wh1.get_value(borrow=True),
                      self.classifier.Wh2.get_value(borrow=True),
                      self.classifier.bh1.get_value(borrow=True),
                      self.classifier.bh2.get_value(borrow=True),
                      self.classifier.OW.get_value(borrow=True),
                      self.classifier.Ob.get_value(borrow=True))
        self.model_to_save = (self.classifier.Wh1.get_value(borrow=True),
                              self.classifier.Wh2.get_value(borrow=True),
                              self.classifier.bh1.get_value(borrow=True),
                              self.classifier.bh2.get_value(borrow=True),
                              self.classifier.OW.get_value(borrow=True),
                              self.classifier.Ob.get_value(borrow=True))
        self.to_save_id = 0
        self.saved = True
        self.train_model_prioritize = theano.function(
            inputs=[self.index],
            outputs=self.cost_v,
            updates=self.updates,
            givens={
                self.s:
                self.train_set_x[self.index * minibatch_size:(self.index + 1) *
                                 minibatch_size],
                self.sp:
                self.train_set_y[self.index * minibatch_size:(self.index + 1) *
                                 minibatch_size]
            })

        self.train_model = theano.function(
            inputs=[self.index],
            outputs=self.cost,
            updates=self.updates,
            givens={
                self.s:
                self.train_set_x[self.index * minibatch_size:(self.index + 1) *
                                 minibatch_size],
                self.sp:
                self.train_set_y[self.index * minibatch_size:(self.index + 1) *
                                 minibatch_size]
            })
        self.report_action = theano.function(inputs=[self.s],
                                             outputs=self.classifier.aidx,
                                             allow_input_downcast=True)
        self.action = theano.function(inputs=[self.s],
                                      outputs=T.argmax(self.classifier.Qs),
                                      allow_input_downcast=True)
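train_model and train_model_prioritize above take only a minibatch index and slice their data out of the pre-loaded shared buffers train_set_x / train_set_y, so host-to-device copies happen once per refill rather than once per step. A hedged usage sketch, assuming an instance named learner built with the constructor above and zero-filled placeholder data:

import numpy as np
import theano

# Hypothetical driver loop; "learner" and the zero-filled data are placeholders.
rows = learner.minibatch_size * learner.train_interval
learner.train_set_x.set_value(np.zeros((rows, learner.slen), dtype=theano.config.floatX))
learner.train_set_y.set_value(np.zeros((rows, learner.slen), dtype=theano.config.floatX))

for idx in range(learner.train_interval):
    cost = learner.train_model(idx)  # uses rows idx*minibatch_size:(idx+1)*minibatch_size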
예제 #13
0
    def __init__(self, input_width, input_height, num_actions,
                 num_frames, discount, learning_rate, rho,
                 rms_epsilon, momentum, clip_delta, freeze_interval,
                 batch_size, network_type, update_rule, lambda_reg,
                 batch_accumulator, pretrained_net, rng, input_scale=255.0):

        self.input_width = input_width
        self.input_height = input_height
        self.num_actions = num_actions
        self.num_frames = num_frames
        self.batch_size = batch_size
        self.discount = discount
        self.rho = rho
        self.lr = learning_rate
        self.rms_epsilon = rms_epsilon
        self.momentum = momentum
        self.clip_delta = clip_delta
        self.freeze_interval = freeze_interval
        self.rng = rng
        self.lambda_reg = lambda_reg

        lasagne.random.set_rng(self.rng)

        self.update_counter = 0

        self.l_in, self.l_act_in, self.l_out, self.pred_z, self.true_z = \
                                        self.build_network(network_type, \
                                        input_width, input_height, num_actions,\
                                        num_frames, batch_size)

        if self.freeze_interval > 0:
            self.next_l_in, self.next_l_act_in, self.next_l_out, _d, _d = \
                                self.build_network(network_type, input_width, \
                                input_height, num_actions, num_frames, batch_size)
            self.reset_q_hat()

        states = T.tensor4('states')
        next_states = T.tensor4('next_states')
        rewards = T.col('rewards')
        actions = T.imatrix('actions')
        terminals = T.icol('terminals')

        # Shared variables for training from a minibatch of replayed
        # state transitions, each consisting of num_frames + 1 (due to
        # overlap) images, along with the chosen action and resulting
        # reward and terminal status.
        self.imgs_shared = theano.shared(
            np.zeros((batch_size, num_frames*2+1, input_height, input_width),
                     dtype=theano.config.floatX))
        self.rewards_shared = theano.shared(
            np.zeros((batch_size, 1), dtype=theano.config.floatX),
            broadcastable=(False, True))
        self.actions_shared = theano.shared(
            np.zeros((batch_size, num_frames), dtype='int32')
            )
        self.terminals_shared = theano.shared(
            np.zeros((batch_size, 1), dtype='int32'),
            broadcastable=(False, True))

        # Shared variable for a single state, to calculate q_vals.
        self.state_shared = theano.shared(
            np.zeros((num_frames*2, input_height, input_width),
                     dtype=theano.config.floatX))

        q_vals, z_pred, z_true = lasagne.layers.get_output(
                                    [self.l_out, self.pred_z, self.true_z],
                                    inputs = {self.l_in: states / input_scale,
                                        self.l_act_in: actions}
                                )
        
        if self.freeze_interval > 0:
            next_q_vals = lasagne.layers.get_output(
                                    self.next_l_out,
                                    {self.next_l_in: next_states / input_scale, 
                                     self.next_l_act_in: actions}
                                    )
        else:
            next_q_vals = lasagne.layers.get_output(
                                    self.l_out,
                                    {self.l_in: next_states / input_scale, 
                                     self.l_act_in: actions}
                                    )
            next_q_vals = theano.gradient.disconnected_grad(next_q_vals)

        terminalsX = terminals.astype(theano.config.floatX)
        actionmask = T.eq(T.arange(num_actions).reshape((1, -1)),
                actions[:, 0].reshape((-1, 1))).astype(theano.config.floatX)

        target = (rewards +
                  (T.ones_like(terminalsX) - terminalsX) *
                  self.discount * T.max(next_q_vals, axis=1, keepdims=True))
        output = (q_vals * actionmask).sum(axis=1).reshape((-1, 1))
        diff = target - output
        diff_reg = z_true - z_pred

        if self.clip_delta > 0:
            # If we simply take the squared clipped diff as our loss,
            # then the gradient will be zero whenever the diff exceeds
            # the clip bounds. To avoid this, we extend the loss
            # linearly past the clip point to keep the gradient constant
            # in that regime.
            # 
            # This is equivalent to declaring d loss/d q_vals to be
            # equal to the clipped diff, then backpropagating from
            # there, which is what the DeepMind implementation does.
            quadratic_part = T.minimum(abs(diff), self.clip_delta)
            linear_part = abs(diff) - quadratic_part
            loss = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part
        else:
            loss = 0.5 * diff ** 2

        loss = loss + 0.5 * self.lambda_reg * (diff_reg ** 2).sum(axis=1)

        if batch_accumulator == 'sum':
            loss = T.sum(loss)
        elif batch_accumulator == 'mean':
            loss = T.mean(loss)
        else:
            raise ValueError("Bad accumulator: {}".format(batch_accumulator))

        params = lasagne.layers.helper.get_all_params([self.l_out, self.pred_z, self.true_z])  
        train_givens = {
            states: self.imgs_shared[:, :-1],
            next_states: self.imgs_shared[:, 1:],
            rewards: self.rewards_shared,
            actions: self.actions_shared,
            terminals: self.terminals_shared
        }

        if update_rule == 'deepmind_rmsprop':
            updates = deepmind_rmsprop(loss, params, self.lr, self.rho,
                                       self.rms_epsilon)
        elif update_rule == 'rmsprop':
            updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho,
                                              self.rms_epsilon)
        elif update_rule == 'sgd':
            updates = lasagne.updates.sgd(loss, params, self.lr)
        else:
            raise ValueError("Unrecognized update: {}".format(update_rule))

        if self.momentum > 0:
            updates = lasagne.updates.apply_momentum(updates, None,
                                                     self.momentum)

        self._train = theano.function([], [loss], updates=updates,
                                      givens=train_givens)
        q_givens = {
            states: self.state_shared.reshape((1,
                                               self.num_frames*2,
                                               self.input_height,
                                               self.input_width))
        }
        self._q_vals = theano.function([], q_vals[0], givens=q_givens)
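Rather than fancy-indexing q_vals with the chosen actions, this variant selects Q(s, a) through a one-hot actionmask. The same trick in plain NumPy, outside Theano, for reference:

import numpy as np

q_vals = np.array([[1.0, 2.0, 3.0],
                   [4.0, 5.0, 6.0]])   # (batch, num_actions)
actions = np.array([2, 0])             # chosen action per row
mask = (np.arange(q_vals.shape[1]).reshape(1, -1) ==
        actions.reshape(-1, 1)).astype(q_vals.dtype)
chosen_q = (q_vals * mask).sum(axis=1).reshape(-1, 1)
# chosen_q -> [[3.0], [4.0]]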
예제 #14
0
        elif o in ("--dqn.network",):
            if a == 'nature':
                d["dqn.network"] = network.build_nature
            elif a == 'nature_with_pad':
                d["dqn.network"] = network.build_nature_with_pad
            elif a == 'nips':
                d["dqn.network"] = network.build_nips
            elif a == 'nature_dnn':
                d["dqn.network"] = network.build_nature_dnn
            elif a == 'nips_dnn':
                d["dqn.network"] = network.build_nips_dnn
        elif o in ("--dqn.updates",):
            import updates
            if a == 'deepmind_rmsprop':
                d["dqn.updates"] = \
                    lambda loss, params: updates.deepmind_rmsprop(loss, params, learning_rate=.00025, rho=.95, epsilon=.01)
            elif a == 'rmsprop':
                d["dqn.updates"] = \
                    lambda loss, params: lasagne.updates.rmsprop(loss, params, learning_rate=.0002, rho=.95, epsilon=1e-6)
        else:
            assert False, "unhandled option"

    import pprint
    pp = pprint.PrettyPrinter(depth=2)
    print(optlist)
    print(args)
    print(sys.argv)
    print("")
    pp.pprint(d)

    main(**d)
예제 #15
0
    def __init__(self,
                 input_width,
                 input_height,
                 num_actions,
                 num_frames,
                 discount,
                 learning_rate,
                 rho,
                 rms_epsilon,
                 momentum,
                 clip_delta,
                 freeze_interval,
                 batch_size,
                 network_type,
                 update_rule,
                 batch_accumulator,
                 rng,
                 input_scale=255.0,
                 double=False,
                 transition_length=4):

        if double:
            print('USING DOUBLE DQN')
        self.input_width = input_width
        self.input_height = input_height
        self.num_actions = num_actions
        self.num_frames = num_frames
        self.batch_size = batch_size
        self.discount = discount
        self.rho = rho
        self.lr = learning_rate
        self.rms_epsilon = rms_epsilon
        self.momentum = momentum
        self.clip_delta = clip_delta
        self.freeze_interval = freeze_interval
        self.rng = rng

        lasagne.random.set_rng(self.rng)

        self.update_counter = 0

        self.l_out = self.build_network(network_type, input_width,
                                        input_height, num_actions, num_frames,
                                        batch_size)
        if self.freeze_interval > 0:
            self.next_l_out = self.build_network(network_type, input_width,
                                                 input_height, num_actions,
                                                 num_frames, batch_size)
            self.reset_q_hat()

        states = T.tensor4('states_t')
        actions = T.icol('actions_t')
        target = T.col('evaluation_t')

        self.states_shared = theano.shared(
            np.zeros((batch_size, num_frames, input_height, input_width),
                     dtype=theano.config.floatX))
        self.actions_shared = theano.shared(np.zeros((batch_size, 1),
                                                     dtype='int32'),
                                            broadcastable=(False, True))
        self.target_shared = theano.shared(np.zeros(
            (batch_size, 1), dtype=theano.config.floatX),
                                           broadcastable=(False, True))

        self.states_transition_shared = theano.shared(
            np.zeros((batch_size, transition_length * 2, num_frames,
                      input_height, input_width),
                     dtype=theano.config.floatX))
        self.states_one_shared = theano.shared(
            np.zeros((num_frames, input_height, input_width),
                     dtype=theano.config.floatX))

        q_vals = lasagne.layers.get_output(self.l_out, states / input_scale)
        """get Q(s)   batch_size = 1 """
        q1_givens = {
            states:
            self.states_one_shared.reshape(
                (1, self.num_frames, self.input_height, self.input_width))
        }
        self._q1_vals = theano.function([], q_vals[0], givens=q1_givens)
        """get Q(s)   batch_size = batch size """
        q_batch_givens = {
            states:
            self.states_shared.reshape((self.batch_size, self.num_frames,
                                        self.input_height, self.input_width))
        }
        self._q_batch_vals = theano.function([], q_vals, givens=q_batch_givens)

        action_mask = T.eq(
            T.arange(num_actions).reshape((1, -1)), actions.reshape(
                (-1, 1))).astype(theano.config.floatX)

        q_s_a = (q_vals * action_mask).sum(axis=1).reshape((-1, 1))
        """ get Q(s,a)   batch_size = batch size """
        q_s_a_givens = {
            states:
            self.states_shared.reshape((self.batch_size, self.num_frames,
                                        self.input_height, self.input_width)),
            actions:
            self.actions_shared
        }
        self._q_s_a_vals = theano.function([], q_s_a, givens=q_s_a_givens)

        if self.freeze_interval > 0:
            q_target_vals = lasagne.layers.get_output(self.next_l_out,
                                                      states / input_scale)
        else:
            q_target_vals = lasagne.layers.get_output(self.l_out,
                                                      states / input_scale)
            q_target_vals = theano.gradient.disconnected_grad(q_target_vals)

        if not double:
            q_target = T.max(q_target_vals, axis=1)
        else:
            greedy_actions = T.argmax(q_vals, axis=1)
            q_target_mask = T.eq(
                T.arange(num_actions).reshape((1, -1)),
                greedy_actions.reshape((-1, 1)).astype(theano.config.floatX))
            q_target = (q_target_vals * q_target_mask).sum(axis=1).reshape(
                (-1, 1))
        """get Q target Q'(s,a') for a batch of transitions  batch size = batch_size * transition length"""
        q_target_transition_givens = {
            states:
            self.states_transition_shared.reshape(
                (batch_size * transition_length * 2, self.num_frames,
                 self.input_height, self.input_width))
        }
        self._q_target = theano.function([],
                                         q_target.reshape(
                                             (batch_size,
                                              transition_length * 2)),
                                         givens=q_target_transition_givens)
        """get Q target_vals Q'(s) for a batch of transitions  batch size = batch_size * transition length"""
        self._q_target_vals = theano.function(
            [],
            q_target_vals.reshape(
                (batch_size, transition_length * 2, num_actions)),
            givens=q_target_transition_givens)

        diff = q_s_a - target

        if self.clip_delta > 0:
            # If we simply take the squared clipped diff as our loss,
            # then the gradient will be zero whenever the diff exceeds
            # the clip bounds. To avoid this, we extend the loss
            # linearly past the clip point to keep the gradient constant
            # in that regime.
            #
            # This is equivalent to declaring d loss/d q_vals to be
            # equal to the clipped diff, then backpropagating from
            # there, which is what the DeepMind implementation does.
            quadratic_part = T.minimum(abs(diff), self.clip_delta)
            linear_part = abs(diff) - quadratic_part
            loss = 0.5 * quadratic_part**2 + self.clip_delta * linear_part
        else:
            loss = 0.5 * diff**2

        if batch_accumulator == 'sum':
            loss = T.sum(loss)
        elif batch_accumulator == 'mean':
            loss = T.mean(loss)
        else:
            raise ValueError("Bad accumulator: {}".format(batch_accumulator))

        params = lasagne.layers.helper.get_all_params(self.l_out)

        if update_rule == 'deepmind_rmsprop':
            updates = deepmind_rmsprop(loss, params, self.lr, self.rho,
                                       self.rms_epsilon)
        elif update_rule == 'rmsprop':
            updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho,
                                              self.rms_epsilon)
        elif update_rule == 'sgd':
            updates = lasagne.updates.sgd(loss, params, self.lr)
        else:
            raise ValueError("Unrecognized update: {}".format(update_rule))

        if self.momentum > 0:
            updates = lasagne.updates.apply_momentum(updates, None,
                                                     self.momentum)
        """Q(s,a) target train()"""
        train_givens = {
            states: self.states_shared,
            actions: self.actions_shared,
            target: self.target_shared
        }
        self._train = theano.function([], [loss],
                                      updates=updates,
                                      givens=train_givens,
                                      on_unused_input='warn')

        self._train2 = theano.function([], [loss],
                                       updates=updates,
                                       givens=train_givens,
                                       on_unused_input='warn')
예제 #16
0
        elif o in ("--dqn.no_replay",):
            d["dqn.no_replay"] = True
        elif o in ("--dqn.network",):
            if a == 'nature':
                d["dqn.network"] = network.build_nature
            elif a == 'nips':
                d["dqn.network"] = network.build_nips
            elif a == 'nature_dnn':
                d["dqn.network"] = network.build_nature_dnn
            elif a == 'nips_dnn':
                d["dqn.network"] = network.build_nips_dnn
        elif o in ("--dqn.updates",):
            import updates
            if a == 'deepmind_rmsprop':
                d["dqn.updates"] = \
                    lambda loss, params: updates.deepmind_rmsprop(loss, params, learning_rate=.00025, rho=.95, epsilon=.1)
            elif a == 'rmsprop':
                d["dqn.updates"] = \
                    lambda loss, params: lasagne.updates.rmsprop(loss, params, learning_rate=.0002, rho=.95, epsilon=1e-6)
        else:
            assert False, "unhandled option"

    import pprint
    pp = pprint.PrettyPrinter(depth=2)
    print(optlist)
    print(args)
    print(sys.argv)
    print("")
    pp.pprint(d)

    main(**d)
예제 #17
0
    def __init__(self,
                 input_width,
                 input_height,
                 num_actions,
                 num_frames,
                 discount,
                 learning_rate,
                 rho,
                 rms_epsilon,
                 momentum,
                 freeze_interval,
                 batch_size,
                 network_type,
                 update_rule,
                 batch_accumulator,
                 input_scale=255.0):

        self.input_width = input_width
        self.input_height = input_height
        self.num_actions = num_actions
        self.num_frames = num_frames
        self.batch_size = batch_size
        self.gamma = discount
        self.rho = rho
        self.lr = learning_rate
        self.rms_epsilon = rms_epsilon
        self.momentum = momentum
        self.freeze_interval = freeze_interval

        self.update_counter = 0

        self.l_out = self.build_network(network_type, input_width,
                                        input_height, num_actions, num_frames,
                                        batch_size)
        if self.freeze_interval > 0:
            self.next_l_out = self.build_network(network_type, input_width,
                                                 input_height, num_actions,
                                                 num_frames, batch_size)
            self.reset_q_hat()

        states = T.tensor4('states')
        next_states = T.tensor4('next_states')
        rewards = T.col('rewards')
        actions = T.icol('actions')
        #terminals = T.icol('terminals')

        self.states_shared = theano.shared(
            np.zeros((batch_size, num_frames, input_height, input_width),
                     dtype=theano.config.floatX))

        self.next_states_shared = theano.shared(
            np.zeros((batch_size, num_frames, input_height, input_width),
                     dtype=theano.config.floatX))

        self.rewards_shared = theano.shared(np.zeros(
            (batch_size, 1), dtype=theano.config.floatX),
                                            broadcastable=(False, True))

        self.actions_shared = theano.shared(np.zeros((batch_size, 1),
                                                     dtype='int32'),
                                            broadcastable=(False, True))

        # self.terminals_shared = theano.shared(
        #     np.zeros((batch_size, 1), dtype='int32'),
        #     broadcastable=(False,True))

        q_vals = lasagne.layers.get_output(self.l_out, states / input_scale)
        if self.freeze_interval > 0:
            next_q_vals = lasagne.layers.get_output(self.next_l_out,
                                                    next_states / input_scale)
        else:
            next_q_vals = lasagne.layers.get_output(self.l_out,
                                                    next_states / input_scale)
            next_q_vals = theano.gradient.disconnected_grad(next_q_vals)

        target = rewards + self.gamma * T.max(
            next_q_vals, axis=1, keepdims=True)
        diff = target - q_vals[T.arange(batch_size),
                               actions.reshape((-1, ))].reshape((-1, 1))

        if batch_accumulator == 'sum':
            loss = T.sum(diff**2)
        elif batch_accumulator == 'mean':
            loss = T.mean(diff**2)
        else:
            raise ValueError("Bad accumulator: {}".format(batch_accumulator))

        params = lasagne.layers.helper.get_all_params(self.l_out)
        givens = {
            states: self.states_shared,
            next_states: self.next_states_shared,
            rewards: self.rewards_shared,
            actions: self.actions_shared,
            #terminals: self.terminals_shared
        }
        if update_rule == 'deepmind_rmsprop':
            updates = deepmind_rmsprop(loss, params, self.lr, self.rho,
                                       self.rms_epsilon)
        elif update_rule == 'rmsprop':
            updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho,
                                              self.rms_epsilon)
        elif update_rule == 'sgd':
            updates = lasagne.updates.sgd(loss, params, self.lr)
        else:
            raise ValueError("Unrecognized update: {}".format(update_rule))

        if self.momentum > 0:
            updates = lasagne.updates.apply_momentum(updates, None,
                                                     self.momentum)

        self._train = theano.function([], [loss, q_vals],
                                      updates=updates,
                                      givens=givens)
        self._q_vals = theano.function([],
                                       q_vals,
                                       givens={states: self.states_shared})
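In this variant the terminals column is commented out, so the target always bootstraps from the next state's maximum Q-value, even across episode boundaries; the other examples in this collection multiply that term by (1 - terminals). A small NumPy illustration of the difference:

import numpy as np

rewards    = np.array([[1.0], [0.0]])
max_next_q = np.array([[5.0], [5.0]])
terminals  = np.array([[0], [1]])      # second transition ends the episode
gamma = 0.99

target_unmasked = rewards + gamma * max_next_q                    # as in this example
target_masked   = rewards + (1 - terminals) * gamma * max_next_q  # as in the other examples
# target_unmasked -> [[5.95], [4.95]]; target_masked -> [[5.95], [0.0]]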
예제 #18
0
File: q_network.py  Project: torgeha/dqn
    def __init__(self, input_width, input_height, num_actions,
                 num_frames, discount, learning_rate, rho,
                 rms_epsilon, momentum, clip_delta, freeze_interval,
                 batch_size, network_type, update_rule,
                 batch_accumulator, rng, input_scale=255.0):

        self.input_width = input_width
        self.input_height = input_height
        self.num_actions = num_actions
        self.num_frames = num_frames
        self.batch_size = batch_size
        self.discount = discount
        self.rho = rho
        self.lr = learning_rate
        self.rms_epsilon = rms_epsilon
        self.momentum = momentum
        self.clip_delta = clip_delta
        self.freeze_interval = freeze_interval
        self.rng = rng

        # print "NETWORK---------------------------"
        # print "input width ", self.input_width
        # print "input height", self.input_height
        # print "num actiuons", self.num_actions
        # print "num frames", self.num_frames
        # print "batch size", self.batch_size
        # print "discount", self.discount
        # print "rho", self.rho
        # print "lr", self.lr
        # print "rms_epsilon", self.rms_epsilon
        # print "momentum", self.momentum
        # print "clip_delta", self.clip_delta
        # print "freeze_ intercal", self.freeze_interval
        # print "rng", self.rng

        lasagne.random.set_rng(self.rng)

        self.update_counter = 0

        self.l_out = self.build_network(network_type, input_width, input_height,
                                        num_actions, num_frames, batch_size)
        if self.freeze_interval > 0:
            self.next_l_out = self.build_network(network_type, input_width,
                                                 input_height, num_actions,
                                                 num_frames, batch_size)
            self.reset_q_hat()

        states = T.tensor4('states')
        next_states = T.tensor4('next_states')
        rewards = T.col('rewards')
        actions = T.icol('actions')
        terminals = T.icol('terminals')

        # Shared variables for training from a minibatch of replayed state transitions,
        # each consisting of num_frames + 1 (due to overlap) images, along with
        # the chosen action and resulting reward and terminal status.
        self.imgs_shared = theano.shared(
            np.zeros((batch_size, num_frames + 1, input_height, input_width),
                     dtype=theano.config.floatX))

        self.rewards_shared = theano.shared(
            np.zeros((batch_size, 1), dtype=theano.config.floatX),
            broadcastable=(False, True))

        self.actions_shared = theano.shared(
            np.zeros((batch_size, 1), dtype='int32'),
            broadcastable=(False, True))

        self.terminals_shared = theano.shared(
            np.zeros((batch_size, 1), dtype='int32'),
            broadcastable=(False, True))

        # Shared variable for a single state, to calculate q_vals
        self.state_shared = theano.shared(
            np.zeros((num_frames, input_height, input_width),
                     dtype=theano.config.floatX))

        q_vals = lasagne.layers.get_output(self.l_out, states / input_scale)
        
        if self.freeze_interval > 0:
            next_q_vals = lasagne.layers.get_output(self.next_l_out,
                                                    next_states / input_scale)
        else:
            next_q_vals = lasagne.layers.get_output(self.l_out,
                                                    next_states / input_scale)
            next_q_vals = theano.gradient.disconnected_grad(next_q_vals)

        terminalsX = terminals.astype(theano.config.floatX)
        actionmask = T.eq(T.arange(num_actions).reshape((1, -1)),
                          actions.reshape((-1, 1))).astype(theano.config.floatX)

        target = (rewards +
                  (T.ones_like(terminalsX) - terminalsX) *
                  self.discount * T.max(next_q_vals, axis=1, keepdims=True))
        output = (q_vals * actionmask).sum(axis=1).reshape((-1, 1))
        diff = target - output

        if self.clip_delta > 0:
            # If we simply take the squared clipped diff as our loss,
            # then the gradient will be zero whenever the diff exceeds
            # the clip bounds. To avoid this, we extend the loss
            # linearly past the clip point to keep the gradient constant
            # in that regime.
            # 
            # This is equivalent to declaring d loss/d q_vals to be
            # equal to the clipped diff, then backpropagating from
            # there, which is what the DeepMind implementation does.
            quadratic_part = T.minimum(abs(diff), self.clip_delta)
            linear_part = abs(diff) - quadratic_part
            loss = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part
        else:
            loss = 0.5 * diff ** 2

        if batch_accumulator == 'sum':
            loss = T.sum(loss)
        elif batch_accumulator == 'mean':
            loss = T.mean(loss)
        else:
            raise ValueError("Bad accumulator: {}".format(batch_accumulator))

        params = lasagne.layers.helper.get_all_params(self.l_out)
        train_givens = {
            states: self.imgs_shared[:, :-1],
            next_states: self.imgs_shared[:, 1:],
            rewards: self.rewards_shared,
            actions: self.actions_shared,
            terminals: self.terminals_shared
        }
        if update_rule == 'deepmind_rmsprop':
            updates = deepmind_rmsprop(loss, params, self.lr, self.rho,
                                       self.rms_epsilon)
        elif update_rule == 'rmsprop':
            updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho,
                                              self.rms_epsilon)
        elif update_rule == 'sgd':
            updates = lasagne.updates.sgd(loss, params, self.lr)
        else:
            raise ValueError("Unrecognized update: {}".format(update_rule))

        if self.momentum > 0:
            updates = lasagne.updates.apply_momentum(updates, None,
                                                     self.momentum)

        self._train = theano.function([], [loss], updates=updates,
                                      givens=train_givens)

        q_givens = {
            states: self.state_shared.reshape((1,
                                               self.num_frames,
                                               self.input_height,
                                               self.input_width))
        }
        self._q_vals = theano.function([], q_vals[0], givens=q_givens)
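imgs_shared above stores num_frames + 1 frames per sampled transition and derives the state and next-state stacks by slicing in train_givens, instead of keeping two overlapping buffers. The same slicing in NumPy:

import numpy as np

batch_size, num_frames, h, w = 2, 4, 84, 84
imgs = np.zeros((batch_size, num_frames + 1, h, w), dtype='float32')

states      = imgs[:, :-1]   # frames 0 .. num_frames-1
next_states = imgs[:, 1:]    # frames 1 .. num_frames, shifted by one
assert states.shape == next_states.shape == (batch_size, num_frames, h, w)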
예제 #19
0
    def __init__(self, environment, rho, rms_epsilon, momentum, clip_delta, freeze_interval, batchSize, network_type, 
                 update_rule, batch_accumulator, randomState, frame_scale=255.0):
        """ Initialize environment

        Arguments:
            environment - the environment (class Env) 
            num_elements_in_batch - list of k integers for the number of each element kept as belief state
            num_actions - int
            discount - float
            learning_rate - float
            rho, rms_epsilon, momentum - float, float, float
            ...
            network_type - string 
            ...           
        """

        self._environment = environment
        
        self._batchSize = batchSize
        self._inputDimensions = self._environment.inputDimensions()
        self._nActions = self._environment.nActions()
        self._df = 0
        self.rho = rho
        self._lr = 0
        self.rms_epsilon = rms_epsilon
        self.momentum = momentum
        self.clip_delta = clip_delta
        self.freeze_interval = freeze_interval
        self._randomState = randomState
        
        lasagne.random.set_rng(self._randomState)

        self.update_counter = 0
        
        states=[]   # list of symbolic variables, one per element of the k-element belief state
                    # --> [ T.tensor4 if the element's observation is a matrix, T.tensor3 if a vector, T.matrix if a scalar ]
        next_states=[] # same as states, but at t+1
        self.states_shared=[] # list of shared variables, one per element of the k-element belief state
        self.next_states_shared=[] # same as self.states_shared, but at t+1

        for i, dim in enumerate(self._inputDimensions):
            if len(dim) == 3:
                states.append(T.tensor4("%s_%s" % ("state", i)))
                next_states.append(T.tensor4("%s_%s" % ("next_state", i)))

            elif len(dim) == 2:
                states.append(T.tensor3("%s_%s" % ("state", i)))
                next_states.append(T.tensor3("%s_%s" % ("next_state", i)))
                
            elif len(dim) == 1:            
                states.append( T.matrix("%s_%s" % ("state", i)) )
                next_states.append( T.matrix("%s_%s" % ("next_state", i)) )
                
            self.states_shared.append(theano.shared(np.zeros((batchSize,) + dim, dtype=theano.config.floatX) , borrow=False))
            self.next_states_shared.append(theano.shared(np.zeros((batchSize,) + dim, dtype=theano.config.floatX) , borrow=False))
        
        print("Number of observations per state: {}".format(len(self.states_shared)))
        print("For each observation, historySize + ponctualObs_i.shape: {}".format(self._inputDimensions))
                
        rewards = T.col('rewards')
        actions = T.icol('actions')
        terminals = T.icol('terminals')
        thediscount = T.scalar(name='thediscount', dtype=theano.config.floatX)
        thelr = T.scalar(name='thelr', dtype=theano.config.floatX)
        
        self.l_out, self.l_outs_conv, shape_after_conv = self._build(network_type, states)
        
        print("Number of neurons after spatial and temporal convolution layers: {}".format(shape_after_conv))

        self.next_l_out, self.next_l_outs_conv, shape_after_conv = self._build(network_type, next_states)
        self._resetQHat()

        self.rewards_shared = theano.shared(
            np.zeros((batchSize, 1), dtype=theano.config.floatX),
            broadcastable=(False, True))

        self.actions_shared = theano.shared(
            np.zeros((batchSize, 1), dtype='int32'),
            broadcastable=(False, True))

        self.terminals_shared = theano.shared(
            np.zeros((batchSize, 1), dtype='int32'),
            broadcastable=(False, True))


        q_vals = lasagne.layers.get_output(self.l_out)        
        
        next_q_vals = lasagne.layers.get_output(self.next_l_out)
        
        max_next_q_vals=T.max(next_q_vals, axis=1, keepdims=True)
        
        # Note: this reduces to a tensor of ones, so terminal transitions are not
        # masked out of the bootstrap term here (unlike the (1 - terminals) factor
        # used in the other examples).
        T_ones_like=T.ones_like(T.ones_like(terminals) - terminals)
        
        target = rewards + T_ones_like * thediscount * max_next_q_vals

        q_val=q_vals[T.arange(batchSize), actions.reshape((-1,))].reshape((-1, 1))

        diff = target - q_val

        if self.clip_delta > 0:
            # If we simply take the squared clipped diff as our loss,
            # then the gradient will be zero whenever the diff exceeds
            # the clip bounds. To avoid this, we extend the loss
            # linearly past the clip point to keep the gradient constant
            # in that regime.
            # 
            # This is equivalent to declaring d loss/d q_vals to be
            # equal to the clipped diff, then backpropagating from
            # there, which is what the DeepMind implementation does.
            quadratic_part = T.minimum(abs(diff), self.clip_delta)
            linear_part = abs(diff) - quadratic_part
            loss = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part
        else:
            loss = 0.5 * diff ** 2

        if batch_accumulator == 'sum':
            loss = T.sum(loss)
        elif batch_accumulator == 'mean':
            loss = T.mean(loss)
        else:
            raise ValueError("Bad accumulator: {}".format(batch_accumulator))

        params = lasagne.layers.helper.get_all_params(self.l_out)

        for conv_param in self.l_outs_conv:
            for p in lasagne.layers.helper.get_all_params(conv_param):
                params.append(p)
        
            
        givens = {
            rewards: self.rewards_shared,
            actions: self.actions_shared, ## actions not needed!
            terminals: self.terminals_shared
        }
        
        for i, x in enumerate(self.states_shared):
            givens[ states[i] ] = x 
        for i, x in enumerate(self.next_states_shared):
            givens[ next_states[i] ] = x
                
        if update_rule == 'deepmind_rmsprop':
            grads = get_or_compute_grads(loss, params)
            updates = deepmind_rmsprop(loss, params, grads, thelr, self.rho,
                                       self.rms_epsilon)
        elif update_rule == 'rmsprop':
            updates = lasagne.updates.rmsprop(loss, params, thelr, self.rho,
                                              self.rms_epsilon)
        elif update_rule == 'sgd':
            updates = lasagne.updates.sgd(loss, params, thelr)
        else:
            raise ValueError("Unrecognized update: {}".format(update_rule))

        if self.momentum > 0:
            updates = lasagne.updates.apply_momentum(updates, None,
                                                     self.momentum)

        self._train = theano.function([thediscount, thelr], [loss, q_vals], updates=updates,
                                      givens=givens,
                                      on_unused_input='warn')
        givens2={}
        for i, x in enumerate(self.states_shared):
            givens2[ states[i] ] = x 

        self._q_vals = theano.function([], q_vals,
                                      givens=givens2,
                                      on_unused_input='warn')
예제 #20
0
    def __init__(self, input_width, input_height, num_actions,
                 num_frames, discount, learning_rate, rho,
                 rms_epsilon, momentum, clip_delta, freeze_interval,
                 batch_size, network_type, update_rule,
                 batch_accumulator, input_scale=255.0, reward_bias=0.):

        self.input_width = input_width
        self.input_height = input_height
        self.num_actions = num_actions
        self.num_frames = num_frames
        self.batch_size = batch_size
        self.discount = discount
        self.rho = rho
        self.lr = learning_rate
        self.rms_epsilon = rms_epsilon
        self.momentum = momentum
        self.clip_delta = clip_delta
        self.freeze_interval = freeze_interval

        self.update_counter = 0

        self.l_out = self.build_network(network_type, input_width, input_height,
                                        num_actions, num_frames, batch_size)
        if self.freeze_interval > 0:
            self.next_l_out = self.build_network(network_type, input_width,
                                                 input_height, num_actions,
                                                 num_frames, batch_size)
            self.reset_q_hat()

        states = T.tensor4('states')
        next_states = T.tensor4('next_states')
        rewards = T.col('rewards')
        actions = T.icol('actions')
        terminals = T.icol('terminals')

        self.states_shared = theano.shared(
            np.zeros((batch_size, num_frames, input_height, input_width),
                     dtype=theano.config.floatX))

        self.next_states_shared = theano.shared(
            np.zeros((batch_size, num_frames, input_height, input_width),
                     dtype=theano.config.floatX))

        self.rewards_shared = theano.shared(
            np.zeros((batch_size, 1), dtype=theano.config.floatX),
            broadcastable=(False, True))

        self.actions_shared = theano.shared(
            np.zeros((batch_size, 1), dtype='int32'),
            broadcastable=(False, True))

        self.terminals_shared = theano.shared(
            np.zeros((batch_size, 1), dtype='int32'),
            broadcastable=(False, True))

        q_vals = lasagne.layers.get_output(self.l_out, states / input_scale)
        if self.freeze_interval > 0:
            next_q_vals = lasagne.layers.get_output(self.next_l_out,
                                                    next_states / input_scale)
        else:
            next_q_vals = lasagne.layers.get_output(self.l_out,
                                                    next_states / input_scale)
            next_q_vals = theano.gradient.disconnected_grad(next_q_vals)

        target = (rewards + reward_bias +
                  (T.ones_like(terminals) - terminals) *
                  self.discount * T.max(next_q_vals, axis=1, keepdims=True))
        diff = target - q_vals[T.arange(batch_size),
                               actions.reshape((-1,))].reshape((-1, 1))

        if self.clip_delta > 0:
            diff = diff.clip(-self.clip_delta, self.clip_delta)

        if batch_accumulator == 'sum':
            loss = T.sum(diff ** 2)
        elif batch_accumulator == 'mean':
            loss = T.mean(diff ** 2)
        else:
            raise ValueError("Bad accumulator: {}".format(batch_accumulator))

        params = lasagne.layers.helper.get_all_params(self.l_out)
        givens = {
            states: self.states_shared,
            next_states: self.next_states_shared,
            rewards: self.rewards_shared,
            actions: self.actions_shared,
            terminals: self.terminals_shared
        }
        if update_rule == 'deepmind_rmsprop':
            updates = deepmind_rmsprop(loss, params, self.lr, self.rho,
                                       self.rms_epsilon)
        elif update_rule == 'rmsprop':
            updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho,
                                              self.rms_epsilon)
        elif update_rule == 'sgd':
            updates = lasagne.updates.sgd(loss, params, self.lr)
        else:
            raise ValueError("Unrecognized update: {}".format(update_rule))

        if self.momentum > 0:
            updates = lasagne.updates.apply_momentum(updates, None,
                                                     self.momentum)

        self._train = theano.function([], [loss, q_vals], updates=updates,
                                      givens=givens)
        self._q_vals = theano.function([], q_vals,
                                       givens={states: self.states_shared})
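Here the TD error is clipped before squaring, so the gradient with respect to diff saturates to zero once |diff| exceeds clip_delta; the longer comment in the other examples explains why they instead extend the loss linearly to keep a constant gradient. A quick numeric contrast of the two choices:

import numpy as np

def grad_squared_clipped(diff, delta):
    # d/d diff of clip(diff, -delta, delta) ** 2: zero once the clip saturates.
    return 0.0 if abs(diff) > delta else 2.0 * diff

def grad_linear_extension(diff, delta):
    # d/d diff of the piecewise loss used in the other examples: the clipped diff.
    return float(np.clip(diff, -delta, delta))

# At diff = 3.0 with delta = 1.0:
# grad_squared_clipped(3.0, 1.0) -> 0.0; grad_linear_extension(3.0, 1.0) -> 1.0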
예제 #21
0
    def __init__(
        self,
        state_width,
        action_width,
        action_bound,
        num_frames,
        discount,
        learning_rate,
        u_lr,
        rho,
        rms_epsilon,
        momentum,
        clip_delta,
        freeze_interval,
        batch_size,
        network_type,
        update_rule,
        batch_accumulator,
        rng,
    ):
        self.state_width = state_width
        self.action_width = action_width
        self.action_bound = action_bound  # TODO: not used yet
        self.num_frames = num_frames  # this is phi_length
        self.batch_size = batch_size
        self.discount = discount
        self.rho = rho
        self.lr = learning_rate
        self.u_lr = u_lr
        self.rms_epsilon = rms_epsilon
        self.momentum = momentum
        self.clip_delta = clip_delta
        self.freeze_interval = freeze_interval
        self.rng = rng

        lasagne.random.set_rng(self.rng)

        self.update_counter = 0

        ######init u_net######
        """初始化策略网络u_net,包括
        构造state等的符号变量和shared变量,
        构造网络
        给出动作u_acts
        给出网络参数u_params
        """
        states = T.tensor4("states")
        next_states = T.tensor4("next_states")
        rewards = T.col("rewards")
        actions = T.matrix("actions")
        terminals = T.icol("terminals")

        self.states_shared = theano.shared(
            np.zeros((batch_size, num_frames, state_width, 1), dtype=theano.config.floatX)
        )  # matches the four dimensions listed above

        self.next_states_shared = theano.shared(
            np.zeros((batch_size, num_frames, state_width, 1), dtype=theano.config.floatX)
        )

        self.rewards_shared = theano.shared(
            np.zeros((batch_size, 1), dtype=theano.config.floatX), broadcastable=(False, True)
        )

        self.actions_shared = theano.shared(
            np.zeros((batch_size, action_width), dtype=theano.config.floatX), broadcastable=(False, True)
        )
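        # note: broadcastable=(False, True) only makes sense when action_width == 1;
        # for a wider action vector the broadcastable argument would have to be dropped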

        self.terminals_shared = theano.shared(np.zeros((batch_size, 1), dtype="int32"), broadcastable=(False, True))

        self.u_l_out = self.build_u_network(network_type, state_width, 1, action_width, num_frames, batch_size)

        u_acts = lasagne.layers.get_output(self.u_l_out, states)
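        # u_acts = mu(states): the deterministic actions the policy network proposes for the batch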

        u_params = lasagne.layers.helper.get_all_params(self.u_l_out)

        ######------######

        ###### init q_net ######
        """Initialize the critic network q_net:
        build the network,
        expose the critic values q_vals,
        expose the next-step values next_q_vals,
        expose the TD error diff,
        expose the network parameters q_params;
        from these, via the intermediate variable q_loss, derive q_updates.
        """

        self.q_l_out, in_l1, in_l2 = self.build_q_network(
            network_type, state_width, 1, action_width, num_frames, batch_size
        )

        if self.freeze_interval > 0:  # keep a frozen target critic, re-synced every freeze_interval updates
            # build_q_network returns (output_layer, state_input, action_input); the frozen copy needs its own inputs
            self.next_q_l_out, next_in_l1, next_in_l2 = self.build_q_network(
                network_type, state_width, 1, action_width, num_frames, batch_size
            )
            self.reset_q_hat()

        # Inputs are mapped explicitly below; note that both states and actions are inputs,
        # the output must have shape (batch, 1), and the *stored* actions are fed in here (not u_acts)
        q_vals = lasagne.layers.get_output(self.q_l_out, {in_l1: states, in_l2: actions})  # TODO: how exactly to write this line was the open question

        # the bootstrap action must come from the policy evaluated at the *next* states
        next_u_acts = lasagne.layers.get_output(self.u_l_out, next_states)
        if self.freeze_interval > 0:  # evaluate the bootstrap target with the frozen critic
            next_q_vals = lasagne.layers.get_output(self.next_q_l_out, {next_in_l1: next_states, next_in_l2: next_u_acts})
        else:
            next_q_vals = lasagne.layers.get_output(self.q_l_out, {in_l1: next_states, in_l2: next_u_acts})
            next_q_vals = theano.gradient.disconnected_grad(next_q_vals)
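            # with no separate target network, gradients are blocked through next_q_vals
            # so the TD error only trains the critic's prediction of Q(s, a)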

        # delta_t from Eq. (16) of the DPG paper; this part differs markedly from DQN
        diff = (rewards + (T.ones_like(terminals) - terminals) * self.discount * next_q_vals) - q_vals
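        # i.e. diff_i = (r_i + gamma * (1 - terminal_i) * Q'(s'_i, mu(s'_i))) - Q(s_i, a_i)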

        # Eqs. (17) and (18) are written out by hand: rather than one T.grad on a scalar loss,
        # the two gradient factors in the formula are computed separately. (How Eq. (17) is derived is left open here.)

        if self.clip_delta > 0:
            # If we simply take the squared clipped diff as our loss,
            # then the gradient will be zero whenever the diff exceeds
            # the clip bounds. To avoid this, we extend the loss
            # linearly past the clip point to keep the gradient constant
            # in that regime.
            #
            # This is equivalent to declaring d loss/d q_vals to be
            # equal to the clipped diff, then backpropagating from
            # there, which is what the DeepMind implementation does.
            quadratic_part = T.minimum(abs(diff), self.clip_delta)
            linear_part = abs(diff) - quadratic_part
            q_loss = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part
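            # (this piecewise form is the Huber loss with threshold clip_delta:
            #  0.5*d**2 for |d| <= clip_delta, and clip_delta*(|d| - 0.5*clip_delta) beyond it)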
        else:
            q_loss = 0.5 * diff ** 2  # q_loss is just a function of diff (hence of theta); the reward is a constant when differentiating

        if batch_accumulator == "sum":
            q_loss = T.sum(q_loss)  # shape (1)
        elif batch_accumulator == "mean":
            q_loss = T.mean(q_loss)
        else:
            raise ValueError("Bad accumulator: {}".format(batch_accumulator))

        q_params = lasagne.layers.helper.get_all_params(self.q_l_out)
        givens = {
            states: self.states_shared,
            next_states: self.next_states_shared,
            rewards: self.rewards_shared,
            actions: self.actions_shared,
            terminals: self.terminals_shared,
        }

        if update_rule == "deepmind_rmsprop":
            q_updates = deepmind_rmsprop(q_loss, q_params, self.lr, self.rho, self.rms_epsilon)
        elif update_rule == "rmsprop":
            q_updates = lasagne.updates.rmsprop(q_loss, q_params, self.lr, self.rho, self.rms_epsilon)
        elif update_rule == "sgd":
            q_updates = lasagne.updates.sgd(q_loss, q_params, self.lr)
        else:
            raise ValueError("Unrecognized update: {}".format(update_rule))

        ######------######
        """
        先给出u_updates(由于u_updates和q_upates有依赖,放在这里才给出)
        给出总的updates,于是就能够训练了
        给出符号函数_train
        给出网络输出的符号函数get_u_acts和get_q_vals
        """
        # 忽略124-136,重写updates;
        # 比如这里q_loss对q_params求导
        # opdac_rmsprop 完成公式(18)
        if batch_accumulator == "sum":
            acm_u_acts = T.sum(u_acts)  # written crudely for now; acceptable while the action is one-dimensional (result is a scalar)
            acm_q = T.sum(q_vals)
        elif batch_accumulator == "mean":
            acm_u_acts = T.mean(u_acts)
            acm_q = T.mean(q_vals)

        u_updates = opdac_rmsprop(
            acm_q, actions, acm_u_acts, u_params, self.u_lr, False
        )  # TODO: should `states` or `states_shared` be passed here?
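        # opdac_rmsprop is expected to realise Eq. (18), the deterministic policy gradient:
        # roughly grad_theta J ~ E[ grad_a Q(s, a)|_{a=mu(s)} * grad_theta mu(s) ],
        # obtained by chaining d(acm_q)/d(actions) with d(acm_u_acts)/d(u_params)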

        self.get_u_acts = theano.function([], u_acts, givens={states: self.states_shared})

        # this get_q_vals function may well go unused
        self.get_q_vals = theano.function([], q_vals, givens={states: self.states_shared, actions: self.actions_shared})

        # equivalent spelling: updates = OrderedDict(q_updates, **u_updates); either way merges the two dicts
        updates = OrderedDict(list(q_updates.items()) + list(u_updates.items()))

        if self.momentum > 0:
            updates = lasagne.updates.apply_momentum(updates, None, self.momentum)

        # this corresponds to Eqs. (16) and (17)
        self._train = theano.function(
            [], [q_loss, q_vals], updates=updates, givens=givens
        )  # because the inputs come in through `givens`, fresh values can be supplied on every call by refilling the shared buffers