Example #1
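    # Note: these __init__ snippets are excerpted from a larger module. They assume
    # module-level imports along the lines of `import numpy as np`, `import theano`,
    # `import theano.tensor as T`, and `import lasagne`, plus the helper functions
    # they call (build_cnn / build_network, build_loss), which are not shown here.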
    def __init__(self, n_actions, replay_memory, initial_weights_file=None):
        self.mood_q = None
        self.last_q = 0
        self.n_parameter_updates = 0
        self.ignore_feedback = False
        self.alpha = 0.00025
        # update frequency ?
        # gradient momentum ? 0.95
        # squared gradient momentum ? 0.95
        # min squared gradient ? 0.01
        self.save_every_n_frames = 100000  # ~ once per hour

        self.final_exploration_frame = 1000000
        self.replay_start_size = 50000
        self.i_frames = 0

        self.state = None
        self.initial_epsilon = 1
        self.final_epsilon = 0.1
        self.epsilon = self.initial_epsilon
        self.gamma = 0.99
        self.replay_memory = replay_memory

        self.log_frequency = 50

        self.minibatch_size = 32
        # self.replay_memory_size = 1000000

        self.target_network_update_frequency = 10000

        s0_var = T.tensor4("s0", dtype=theano.config.floatX)
        a0_var = T.bmatrix("a0")
        r0_var = T.wcol("r0")
        s1_var = T.tensor4("s1", dtype=theano.config.floatX)
        future_reward_indicator_var = T.bcol("future_reward_indicator")
        self.n_actions = n_actions
        self.a_lookup = np.eye(self.n_actions, dtype=np.int8)

        self.network = build_cnn(n_actions=self.n_actions, input_var=T.cast(s0_var, 'float32') / np.float32(256))
        print("Compiling forward.")
        self.forward = theano.function([s0_var], lasagne.layers.get_output(self.network, deterministic=True))

        self.network_stale = build_cnn(n_actions=self.n_actions, input_var=T.cast(s1_var, 'float32') / np.float32(256))
        print("Compiling forward stale.")
        self.forward_stale = theano.function([s1_var],
                                             lasagne.layers.get_output(self.network_stale, deterministic=True))

        if initial_weights_file is not None:
            with np.load(initial_weights_file) as initial_weights:
                param_values = [initial_weights['arr_%d' % i] for i in range(len(initial_weights.files))]
                lasagne.layers.set_all_param_values(self.network, param_values)

        self._update_network_stale()

        out = lasagne.layers.get_output(self.network)
        out_stale = lasagne.layers.get_output(self.network_stale)
        self.loss, self.err, __y, __q = build_loss(out=out,
                                                   out_stale=out_stale,
                                                   a0_var=a0_var,
                                                   r0_var=r0_var,
                                                   future_reward_indicator_var=future_reward_indicator_var,
                                                   gamma=self.gamma)

        params = lasagne.layers.get_all_params(self.network, trainable=True)
        updates = lasagne.updates.rmsprop(self.loss, params, learning_rate=0.0002, rho=0.95,
                                          epsilon=1e-6)  # TODO RMSPROP in the paper has slightly different definition (see Lua)
        print("Compiling train_fn.")
        self.train_fn = theano.function([s0_var, a0_var, r0_var, s1_var, future_reward_indicator_var],
                                        [self.loss, self.err, T.transpose(__y), T.transpose(__q), out, out_stale],
                                        updates=updates)
        print("Compiling loss_fn.")
        self.loss_fn = theano.function([s0_var, a0_var, r0_var, s1_var, future_reward_indicator_var],
                                       self.loss)
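The build_loss helper called above is not included in these examples. Below is a minimal sketch of what it plausibly computes, assuming the standard DQN target (reward plus discounted best stale-network value, with the indicator masking terminal transitions) and the paper's error clipping; the shapes follow the symbolic variables above (column vectors of length minibatch_size). This is an assumption, not the author's exact implementation.

def build_loss(out, out_stale, a0_var, r0_var, future_reward_indicator_var, gamma):
    # Q(s0, a0): select the taken action's value via the one-hot rows of a0_var.
    q = T.sum(out * a0_var, axis=1, keepdims=True)
    # Target y = r0 + gamma * max_a Q_stale(s1, a); the indicator zeroes the
    # bootstrap term for terminal states.
    y = (r0_var
         + np.float32(gamma) * future_reward_indicator_var
         * T.max(out_stale, axis=1, keepdims=True))
    err = y - q
    # Mean squared error over the minibatch, with the error clipped to [-1, 1]
    # as in the DQN paper (an assumption here).
    loss = T.mean(0.5 * T.sqr(T.clip(err, -1.0, 1.0)))
    return loss, err, y, q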
Example #2
    def __init__(self, n_actions, replay_memory, build_network, updates, screen_size, initial_weights_file=None):
        self.screen_width, self.screen_height = screen_size
        self.mood_q = None
        self.last_q = 0
        self.n_parameter_updates = 0
        self.alpha = 0.00025
        # update frequency ?
        # gradient momentum ? 0.95
        # squared gradient momentum ? 0.95
        # min squared gradient ? 0.01
        self.save_every_n_frames = 100000  # ~ once per hour

        self.final_exploration_frame = 1000000
        self.replay_start_size = 50000
        self.i_action = 0

        self.state = None
        self.initial_epsilon = 1
        self.final_epsilon = 0.1
        self.epsilon = self.initial_epsilon
        self.gamma = 0.99
        self.replay_memory = replay_memory

        self.log_frequency = 1

        self.minibatch_size = 32
        # self.replay_memory_size = 1000000

        self.target_network_update_frequency = 10000

        s0_var = T.tensor4("s0", dtype=theano.config.floatX)
        a0_var = T.bmatrix("a0")
        r0_var = T.wcol("r0")
        s1_var = T.tensor4("s1", dtype=theano.config.floatX)
        future_reward_indicator_var = T.bcol("future_reward_indicator")

        self.n_actions = n_actions
        self.a_lookup = np.eye(self.n_actions, dtype=np.int8)

        self.network = build_network(n_actions=self.n_actions, input_var=T.cast(s0_var, 'float32') / np.float32(256),
                                     screen_size=(self.screen_height, self.screen_width))
        print("Compiling forward.")
        self.forward = theano.function([s0_var], lasagne.layers.get_output(self.network, deterministic=True))

        self.network_stale = build_network(n_actions=self.n_actions, input_var=T.cast(s1_var, 'float32') / np.float32(256),
                                           screen_size=(self.screen_height, self.screen_width))
        print("Compiling forward_stale.")
        self.forward_stale = theano.function([s1_var],
                                             lasagne.layers.get_output(self.network_stale, deterministic=True))

        self._update_network_stale()

        out = lasagne.layers.get_output(self.network)
        out_stale = lasagne.layers.get_output(self.network_stale)
        self.loss, self.err, __y, __q = build_loss(out=out,
                                                   out_stale=out_stale,
                                                   a0_var=a0_var,
                                                   r0_var=r0_var,
                                                   future_reward_indicator_var=future_reward_indicator_var,
                                                   gamma=self.gamma)

        params = lasagne.layers.get_all_params(self.network, trainable=True)

        print("Compiling train_fn.")
        self.train_fn = theano.function([s0_var, a0_var, r0_var, s1_var, future_reward_indicator_var],
                                        [self.loss, self.err, T.transpose(__y), T.transpose(__q), out, out_stale],
                                        updates=updates(self.loss, params))
        print("Compiling loss_fn.")
        self.loss_fn = theano.function([s0_var, a0_var, r0_var, s1_var, future_reward_indicator_var],
                                       self.loss)
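_update_network_stale is likewise not shown. Presumably it copies the online network's weights into the stale (target) network, which the target_network_update_frequency counter would trigger every 10000 parameter updates. A hedged sketch of such a method, assuming that behaviour:

    def _update_network_stale(self):
        # Copy current parameters from self.network into self.network_stale
        # (an assumption about the helper's behaviour, not shown in the source).
        lasagne.layers.set_all_param_values(
            self.network_stale,
            lasagne.layers.get_all_param_values(self.network))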
Example #3
    def __init__(self,
                 n_actions,
                 replay_memory,
                 build_network,
                 updates,
                 screen_size,
                 initial_weights_file=None):
        self.screen_width, self.screen_height = screen_size
        self.mood_q = None
        self.last_q = 0
        self.n_parameter_updates = 0
        self.alpha = 0.00025
        # update frequency ?
        # gradient momentum ? 0.95
        # squared gradient momentum ? 0.95
        # min squared gradient ? 0.01
        self.save_every_n_frames = 100000  # ~ once per hour

        self.final_exploration_frame = 1000000
        self.replay_start_size = 50000
        self.i_action = 0

        self.state = None
        self.initial_epsilon = 1
        self.final_epsilon = 0.1
        self.epsilon = self.initial_epsilon
        self.gamma = 0.99
        self.replay_memory = replay_memory

        self.log_frequency = 1

        self.minibatch_size = 32
        # self.replay_memory_size = 1000000

        self.target_network_update_frequency = 10000

        s0_var = T.tensor4("s0", dtype=theano.config.floatX)
        a0_var = T.bmatrix("a0")
        r0_var = T.wcol("r0")
        s1_var = T.tensor4("s1", dtype=theano.config.floatX)
        future_reward_indicator_var = T.bcol("future_reward_indicator")

        self.n_actions = n_actions
        self.a_lookup = np.eye(self.n_actions, dtype=np.int8)

        self.network = build_network(
            n_actions=self.n_actions,
            input_var=T.cast(s0_var, 'float32') / np.float32(256),
            screen_size=(self.screen_height, self.screen_width))
        print("Compiling forward.")
        self.forward = theano.function([s0_var],
                                       lasagne.layers.get_output(
                                           self.network, deterministic=True))

        self.network_stale = build_network(
            n_actions=self.n_actions,
            input_var=T.cast(s1_var, 'float32') / np.float32(256),
            screen_size=(self.screen_height, self.screen_width))
        print("Compiling forward_stale.")
        self.forward_stale = theano.function([s1_var],
                                             lasagne.layers.get_output(
                                                 self.network_stale,
                                                 deterministic=True))

        self._update_network_stale()

        out = lasagne.layers.get_output(self.network)
        out_stale = lasagne.layers.get_output(self.network_stale)
        self.loss, self.err, __y, __q = build_loss(
            out=out,
            out_stale=out_stale,
            a0_var=a0_var,
            r0_var=r0_var,
            future_reward_indicator_var=future_reward_indicator_var,
            gamma=self.gamma)

        params = lasagne.layers.get_all_params(self.network, trainable=True)

        print("Compiling train_fn.")
        self.train_fn = theano.function(
            [s0_var, a0_var, r0_var, s1_var, future_reward_indicator_var], [
                self.loss, self.err,
                T.transpose(__y),
                T.transpose(__q), out, out_stale
            ],
            updates=updates(self.loss, params))
        print("Compiling loss_fn.")
        self.loss_fn = theano.function(
            [s0_var, a0_var, r0_var, s1_var, future_reward_indicator_var],
            self.loss)
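In examples #2 and #3 the optimizer is injected as an `updates` callable taking (loss, params). A hedged usage sketch that would reproduce example #1's hard-coded RMSProp settings; the class name DQNAgent, the network builder, and every constructor argument other than `updates` are placeholders, not taken from the source:

import functools
import lasagne

rmsprop_updates = functools.partial(
    lasagne.updates.rmsprop, learning_rate=0.0002, rho=0.95, epsilon=1e-6)

agent = DQNAgent(n_actions=4,                   # placeholder action count
                 replay_memory=replay_memory,   # assumed to be constructed elsewhere
                 build_network=build_network,   # a builder taking n_actions, input_var, screen_size
                 updates=rmsprop_updates,
                 screen_size=(84, 84))          # placeholder screen size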