Example #1
    def __init__(self,
                 state_shape,
                 num_actions,
                 epsilon=1.0,
                 epsilon_min=0.1,
                 epsilon_iter=100000,
                 discount=0.99,
                 lrate=1e-3,
                 batch_size=100,
                 q_update_iter=1000,
                 capacity=50000):

        # Validate each argument independently so every check is applied
        if not isinstance(state_shape, tuple):
            raise AssertionError('state_shape must be of type <tuple>.')
        if len(state_shape) == 0:
            raise AssertionError('No state space dimensions provided.')
        if num_actions == 0:
            raise ValueError('Number of actions must be > 0.')
        if epsilon_min is not None:
            assert epsilon_min < epsilon, 'Epsilon(min) must be < epsilon(max).'
        if capacity < batch_size:
            raise ValueError('Replay capacity must be > batch_size.')

        self.state_shape = state_shape
        self.num_actions = num_actions
        self.q_network = build_network(state_shape, num_actions)
        self.q_targets = build_network(state_shape, num_actions)
        self.epsilon = epsilon
        self.epsilon_max = epsilon  # How greedy the policy is
        self.epsilon_min = epsilon_min
        self.epsilon_iter = float(epsilon_iter)
        self.discount = discount
        self.lr = lrate
        self.batch_size = batch_size  # How many samples to draw from buffer
        self.q_update_iter = q_update_iter  # Update the q_target every C iter
        self.step = 0
        self.replay_buffer = ReplayBuffer(capacity, state_shape)

        # Build training and sampling functions
        s0_sym = nn.get_all_layers(self.q_network)[0].input_var
        s1_sym = nn.get_all_layers(self.q_targets)[0].input_var
        a_sym = T.icol('actions')  #(n, 1)
        r_sym = T.col('rewards')
        t_sym = T.col('terminal_state')
        sym_vars = [s0_sym, a_sym, r_sym, s1_sym, t_sym]

        # Training phase uses non-deterministic mapping
        loss = T.sum(self._build_loss(*sym_vars, deterministic=False))
        params = nn.get_all_params(self.q_network, trainable=True)
        updates = lasagne.updates.adam(loss, params, self.lr, beta1=0.9)

        self.train_fn = theano.function(sym_vars, loss, updates=updates)

        # Build function for sampling from DQN
        pred = nn.get_output(self.q_network, deterministic=True)
        self.pred_fn = theano.function([s0_sym], pred)
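The constructor above stores epsilon, epsilon_min, epsilon_iter and a step counter, which suggests a linear epsilon-greedy annealing schedule. A minimal sketch of such a schedule follows; the helper name and the exact decay rule are assumptions for illustration, not part of the original class.

# Hypothetical helper: linear epsilon decay over epsilon_iter steps.
def linear_epsilon(step, eps_max=1.0, eps_min=0.1, eps_iter=100000.0):
    frac = min(step / float(eps_iter), 1.0)
    return eps_max - (eps_max - eps_min) * frac

print(linear_epsilon(0))        # 1.0
print(linear_epsilon(50000))    # 0.55
print(linear_epsilon(200000))   # 0.1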
Example #2
    def __init__(self, state_shape, num_actions, action_scale, lr, tau):

        self.state_shape = state_shape
        self.num_actions = num_actions
        self.action_scale = action_scale
        self.tau = tau

        # Build networks, then initialize their weights to be equal
        sym_s0 = get_symbolic_var(state_shape)('s0')
        sym_s1 = get_symbolic_var(state_shape)('s1')

        self.network = self._build_network(sym_s0)
        self.targets = self._build_network(sym_s1)
        self.update_target(tau=1.0)

        # For making predictions via current and target networks
        a_pred = nn.get_output(self.network)
        self.predict_fn = theano.function([sym_s0], a_pred)
        self.target_fn = theano.function([sym_s1], nn.get_output(self.targets))

        # The policy is updated by following gradients from the critic network.
        # In Theano, this is done by passing the 'known_grads' argument to
        # T.grad without giving an explicit scalar cost.
        action_grads = T.col('action_grads')
        known_grads = {a_pred: action_grads}

        params = nn.get_all_params(self.network, trainable=True)
        grads = [-T.grad(None, p, known_grads=known_grads) for p in params]
        updates = lasagne.updates.adam(grads, params, lr)
        train = theano.function([sym_s0, action_grads], grads, updates=updates)
        self.train_fn = train
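The comment above describes Theano's mechanism for back-propagating externally supplied gradients. A standalone sketch of that pattern follows; the toy actor (a single matrix product) and all shapes are assumptions chosen only for illustration.

import numpy as np
import theano
import theano.tensor as T

# Toy "actor": y = x.dot(W); dL_dy plays the role of the critic's action gradients.
x = T.matrix('x')
W = theano.shared(np.ones((3, 2), dtype=theano.config.floatX), name='W')
y = x.dot(W)

dL_dy = T.matrix('dL_dy')
# cost=None plus known_grads starts back-propagation from the supplied dL/dy.
gW = T.grad(None, wrt=W, known_grads={y: dL_dy})

f = theano.function([x, dL_dy], gW)
print(f(np.ones((4, 3), dtype=theano.config.floatX),
        np.ones((4, 2), dtype=theano.config.floatX)))   # every entry is 4.0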
Example #3
    def test_broadcast_arguments(self):
        m = Module()
        m.random = RandomStreams(utt.fetch_seed())
        low = tensor.vector()
        high = tensor.col()
        out = m.random.uniform(low=low, high=high)
        assert out.ndim == 2
        m.f = Method([low, high], out)
        made = m.make()
        made.random.initialize()

        rng_seed = numpy.random.RandomState(utt.fetch_seed()).randint(2**30)
        numpy_rng = numpy.random.RandomState(int(rng_seed))
        low_vals = [
                numpy.asarray([-5, .5, 0, 1], dtype=config.floatX),
                numpy.asarray([.9], dtype=config.floatX),
                numpy.asarray([-5, .5, 0, 1], dtype=config.floatX) ]
        high_vals = [
                numpy.asarray([[1.]], dtype=config.floatX),
                numpy.asarray([[1.], [1.1], [1.5]], dtype=config.floatX),
                numpy.asarray([[1.], [1.1], [1.5]], dtype=config.floatX) ]

        val0 = made.f(low_vals[0], high_vals[0])
        val1 = made.f(low_vals[1], high_vals[1])
        val2 = made.f(low_vals[2], high_vals[2])

        numpy_val0 = numpy_rng.uniform(low=low_vals[0], high=high_vals[0])
        numpy_val1 = numpy_rng.uniform(low=low_vals[1], high=high_vals[1])
        numpy_val2 = numpy_rng.uniform(low=low_vals[2], high=high_vals[2])

        assert numpy.allclose(val0, numpy_val0)
        assert numpy.allclose(val1, numpy_val1)
        assert numpy.allclose(val2, numpy_val2)
Example #4
 def get_loss_sarsa_function(self):
     #args
     self.states = T.matrix('state')
     self.actions = T.icol('action')
     self.next_states = T.matrix('next_state')
     self.next_actions = T.icol('next_action')
     self.rewards = T.col('reward')
     #q(s,a)
     actionmask = T.eq(
         T.arange(self.nactions).reshape((1, -1)),
         self.actions.reshape((-1, 1))).astype(theano.config.floatX)
     q_action = (get_output(self.network, self.states) *
                 actionmask).sum(axis=1).reshape((-1, 1))
     #q(s_next,a_next)
     next_actionmask = T.eq(
         T.arange(self.nactions).reshape((1, -1)),
         self.next_actions.reshape((-1, 1))).astype(theano.config.floatX)
     next_q_action = (get_output(self.network, self.next_states) *
                      next_actionmask).sum(axis=1).reshape((-1, 1))
     #loss = target - qvalue
     loss = (self.rewards + self.discount * next_q_action - q_action)
     #mse
     mse = 0.5 * loss**2
     #sum loss
     return T.sum(mse)
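The two masks above select Q(s, a) and Q(s', a') from the network output by comparing an arange of action indices against the stored action column. The same trick in plain NumPy, with made-up numbers:

import numpy as np

nactions = 4
actions = np.array([[1], [3], [0]], dtype=np.int32)           # (batch, 1) column
mask = (np.arange(nactions).reshape(1, -1) == actions).astype(np.float32)
q = np.arange(12, dtype=np.float32).reshape(3, 4)             # fake Q-values
q_taken = (q * mask).sum(axis=1).reshape(-1, 1)               # Q(s, a) as a column
print(q_taken)                                                # [[1.], [7.], [8.]]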
Example #6
File: net.py Project: Levoila/CrappyAI
	def __init__(self, args):
		reward = T.col('r')
		action = T.icol('a')
		terminal = T.icol('t')
		discount = T.scalar('gamma')
		learningRate = T.scalar('lr')
		rho = T.scalar('rho')
		epsilon = T.scalar('eps')
		rng = np.random.RandomState(42)
		
		self.batchNb = args.batchSize
		
		#convLayers = [[(8,8),(4,4),64],
		#			  [(4,4),(2,2),128],
		#			  [(3,3),(1,1),256],
		#			  [(3,3),(1,1),512]]
		#fcl = [1024, 6]
		
		convLayers = [[(8,8),(4,4),64],
					  [(4,4),(2,2),128],
					  [(3,3),(1,1),256],
					  [(3,3),(1,1),256]]
		fcl = [1024, args.actionNb]
		self.q1 = NetStruct(convLayers, fcl, (4,100,100), rng, args)
		self.q2 = NetStruct(convLayers, fcl, (4,100,100), rng, args)
		self.q2.setParams(self.q1)
		
		self.states = theano.shared(np.zeros((args.batchSize,4,100,100), dtype='float32'))
		self.states2 = theano.shared(np.zeros((args.batchSize,4,100,100), dtype='float32'))
		self.actions = theano.shared(np.zeros((args.batchSize,1), dtype='int32'), broadcastable=(False,True))
		self.rewards = theano.shared(np.zeros((args.batchSize,1), dtype='float32'), broadcastable=(False,True))
		self.terminals = theano.shared(np.zeros((args.batchSize,1), dtype='int32'), broadcastable=(False,True))
		
		self.learningRate = theano.shared(np.array(args.learningRate, dtype='float32'))
		self.rho = theano.shared(np.array(args.rmsPropRho, dtype='float32'))
		self.epsilon = theano.shared(np.array(args.rmsPropEpsilon, dtype='float32'))
		self.discount = theano.shared(np.array(args.discountFactor, dtype='float32'))
		
		loss = self.QLoss(self.q1.output, self.q2.output, action, reward, terminal, discount)
		
		params = self.q1.getParams()
		
		updates = self.rmsProp(loss, params, rho, epsilon, learningRate)
		self.train_model = theano.function(
			[],
			loss,
			updates=updates,
			givens = { 
					   self.q1.input: self.states,
					   self.q2.input: self.states2,
					   action: self.actions,
					   reward: self.rewards,
					   terminal: self.terminals,
					   discount: self.discount,
					   learningRate: self.learningRate,
					   rho: self.rho,
					   epsilon: self.epsilon
					 }
		)
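Example #6 (like several examples further down) compiles a training function with no explicit inputs by pairing shared variables with givens. A minimal, self-contained sketch of that pattern, with toy shapes chosen for illustration:

import numpy as np
import theano
import theano.tensor as T

# The minibatch lives in a shared variable; `givens` substitutes it for the
# symbolic input at compile time, so the compiled function takes no arguments.
x = T.matrix('x')
x_shared = theano.shared(np.zeros((4, 3), dtype=theano.config.floatX))

f = theano.function([], T.sum(x ** 2), givens={x: x_shared})

x_shared.set_value(np.ones((4, 3), dtype=theano.config.floatX))
print(f())   # 12.0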
Example #7
    def createGradientFunctions(self):
        #Create the Theano variables
        W1,W2,W3,W4,W5,W6,x,eps = T.dmatrices("W1","W2","W3","W4","W5","W6","x","eps")
        #Create biases as cols so they can be broadcasted for minibatches
        b1,b2,b3,b4,b5,b6 = T.dcols("b1","b2","b3","b4","b5","b6")
        z1 = T.col("z1")
        if self.continuous:
            #convolve x
            # no_filters = 100, stride = 4, filter_size = 50

            h_encoder = T.tanh(T.dot(W1,x) + b1)
            #h_encoder = T.dot(W1,x) + b1
        else:   
            h_encoder = T.tanh(T.dot(W1,x) + b1)

        mu_encoder = T.dot(W2,h_encoder) + b2
        log_sigma_encoder = 0.5*(T.dot(W3,h_encoder) + b3)

        #Find the hidden variable z
        z = mu_encoder + T.exp(log_sigma_encoder)*eps

        prior = 0.5* T.sum(1 + 2*log_sigma_encoder - mu_encoder**2 - T.exp(2*log_sigma_encoder))


        #Set up decoding layer
        if self.continuous:
            h_decoder = T.nnet.softplus(T.dot(W4,z) + b4)
            h_dec = T.nnet.softplus(T.dot(W4,z1) + b4)

            #h_decoder = T.dot(W4,z) + b4
            #h_dec = T.dot(W4,z1) + b4

            mu_decoder = T.tanh(T.dot(W5,h_decoder) + b5)
            mu_dec = T.tanh(T.dot(W5,h_dec) + b5)
            log_sigma_decoder = 0.5*(T.dot(W6,h_decoder) + b6)
            logpxz = T.sum(-(0.5 * np.log(2 * np.pi) + log_sigma_decoder) - 0.5 * ((x - mu_decoder) / T.exp(log_sigma_decoder))**2)
            gradvariables = [W1,W2,W3,W4,W5,W6,b1,b2,b3,b4,b5,b6]
        else:
            h_decoder = T.tanh(T.dot(W4,z) + b4)
            y = T.nnet.sigmoid(T.dot(W5,h_decoder) + b5)
            logpxz = -T.nnet.binary_crossentropy(y,x).sum()
            gradvariables = [W1,W2,W3,W4,W5,b1,b2,b3,b4,b5]
        logp = logpxz + prior

        #Compute all the gradients
        derivatives = T.grad(logp,gradvariables)

        #Add the lowerbound so we can keep track of results
        derivatives.append(logp)
        
        self.get_z = th.function(gradvariables+[x,eps],z,on_unused_input='ignore')
        self.generate = th.function(gradvariables+[z1,x,eps],mu_dec,on_unused_input='ignore')
        self.predict = th.function(gradvariables+[x,eps],mu_decoder,on_unused_input='ignore')
        self.gradientfunction = th.function(gradvariables + [x,eps], derivatives, on_unused_input='ignore')
        self.lowerboundfunction = th.function(gradvariables + [x,eps], logp, on_unused_input='ignore')
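The encoder above draws the latent variable via the reparameterization trick, z = mu + exp(log_sigma) * eps, so gradients can flow into mu and log_sigma. A NumPy sketch with assumed toy shapes:

import numpy as np

rng = np.random.RandomState(0)
mu = np.zeros((2, 5))                  # (latent_dim, batch), matching the column layout above
log_sigma = np.full((2, 5), -1.0)
eps = rng.randn(2, 5)                  # externally sampled noise, as in the Theano graph
z = mu + np.exp(log_sigma) * eps       # deterministic given eps, hence differentiable
print(z.shape)                         # (2, 5)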
Example #8
    def test_ndim_mismatch(self):
        rng = numpy.random.RandomState(utt.fetch_seed())
        data = rng.rand(5).astype(self.dtype)
        x = self.shared(data)
        y = tensor.col('y', self.dtype)
        cond = theano.tensor.iscalar('cond')

        self.assertRaises(TypeError, ifelse, cond, x, y)
        self.assertRaises(TypeError, ifelse, cond, y, x)
Example #9
 def testDataSet(self, dataSet_, dataLabels_):
     dataSet = T.matrix("dataSet")
     labels = T.col("labels")
     svLabels = T.col("svLabels")
     gamma = T.dscalar("gamma")
     svs = T.matrix("supportVectors")
     svAlphas = T.matrix("svAlphas")
     b = T.dscalar("b")
           
     # we need to transpose the result because the results of the per-row actions are usually columns
     errorVec = theano.scan(lambda row, realLabel : self.testDataSet_inner_(svs, row, gamma, svLabels, svAlphas, b, realLabel), sequences=[dataSet, labels])[0]
     errors = T.sum(errorVec)
     
     inputs = [dataSet, labels, svs, svLabels, gamma, svAlphas, b]
     compErrors = theano.function(inputs=inputs, outputs=errors, on_unused_input='ignore')
     
     gamma_ = 1/(-1*self.Training.UsedKernel[1]**2)
     numErrors = compErrors(dataSet_, dataLabels_, self.Training.SupportVectors, self.Training.SVLabels, gamma_, self.Training.Alphas[self.Training.SVIndices], self.Training.B.item(0))
     return float(numErrors) / float(dataSet_.shape[0])
Example #10
 def __init__(self):
     self.dt=1
     self.xdim=1
     self.udim=1
     self.r=1
     self.delay=T.bscalar()
     self.delay2=T.bscalar()
     self.x=T.matrix()
     self.u=T.col()
     self.xu_flat=T.concatenate([T.flatten(self.x.T),T.flatten(self.u)])
Example #11
File: SMH.py Project: utunga/hashmapd
    def __init__(self, numpy_rng, theano_rng = None, first_layer_type = 'bernoulli', mean_doc_size = 1, n_ins = 784, mid_layer_sizes=[200], inner_code_length = 10):
        """This class is made to support a variable number of layers. 

        :type numpy_rng: numpy.random.RandomState
        :param numpy_rng: numpy random number generator used to draw initial 
                    weights

        :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
        :param theano_rng: Theano random generator; if None is given one is 
                           generated based on a seed drawn from `rng`

        :type n_ins: int
        :param n_ins: dimension of the input (and autoencoder output, y) of the SMH 

        :type inner_code_length: int
        :param inner_code_length: how many codes to squash down to in the middle layer
        """
        
        self.first_layer_type = first_layer_type
        self.mean_doc_size = mean_doc_size
        
        self.sigmoid_layers = []
        self.rbm_layers     = []
        self.params         = []
        
        self.n_ins = n_ins
        self.inner_code_length = inner_code_length
        self.mid_layer_sizes = list(mid_layer_sizes)
        
        self.numpy_rng = numpy_rng
        self.theano_rng = RandomStreams(numpy_rng.randint(2**30))
     
        # allocate symbolic variables for the data
        
        if (theano.config.floatX == "float32"):
            self.x  = T.matrix('x')  #
            self.x_sums = T.col('x_sums')
            self.y  = T.matrix('y') # the output (after finetuning) should look the same as the input
        else:
            if (theano.config.floatX == "float64"):
                self.x  = T.dmatrix('x')  #
                self.x_sums = T.dcol('x_sums')
                self.y  = T.dmatrix('y') # the output (after finetuning) should look the same as the input
            else:        
                raise Exception  # not sure what's up here..

        # The SMH is an MLP, for which all weights of intermediate layers are shared with a
        # different RBM.  We will first construct the SMH as a deep multilayer perceptron, and
        # when constructing each sigmoidal layer we also construct an RBM that shares weights
        # with that layer. During pretraining we will train these RBMs (which will lead
        # to changing the weights of the MLP as well). During finetuning we will finish
        # training the SMH by doing stochastic gradient descent on the MLP.

        self.init_layers()
Example #12
File: network.py Project: npow/deep_q_rl
    def __init__(self, input_width, input_height, output_dim, num_frames, batch_size):
        self.input_width = input_width
        self.input_height = input_height
        self.output_dim = output_dim
        self.num_frames = num_frames
        self.batch_size = batch_size
        self.gamma = 0.99 # discount factor
        self.rho = 0.99
        self.lr = 0.00025 # learning rate
        self.momentum = 0.95
        self.freeze_targets = True

        self.l_out = self.build_network(input_width, input_height, output_dim, num_frames, batch_size)
        if self.freeze_targets:
            self.next_l_out = self.build_network(input_width, input_height, output_dim, num_frames, batch_size)
            self.reset_q_hat()

        states = T.tensor4('states')
        next_states = T.tensor4('next_states')
        rewards = T.col('rewards')
        actions = T.icol('actions')
#        terminals = T.icol('terminals')

        self.states_shared = theano.shared(np.zeros((batch_size, num_frames, input_height, input_width), dtype=theano.config.floatX))
        self.next_states_shared = theano.shared(np.zeros((batch_size, num_frames, input_height, input_width), dtype=theano.config.floatX))
        self.rewards_shared = theano.shared(np.zeros((batch_size, 1), dtype=theano.config.floatX), broadcastable=(False,True))
        self.actions_shared = theano.shared(np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False,True))
#        self.terminals_shared = theano.shared(np.zeros((batch_size, 1), dtype='int32'), broadcastable=(False,True))

        q_vals = self.l_out.get_output(states / 255.0)
        if self.freeze_targets:
            next_q_vals = self.next_l_out.get_output(next_states / 255.0)
        else:
            next_q_vals = self.l_out.get_output(next_states / 255.0)
            next_q_vals = theano.gradient.disconnected_grad(next_q_vals)

        target = rewards + self.gamma * T.max(next_q_vals, axis=1, keepdims=True)
        diff = target - q_vals[T.arange(batch_size), actions.reshape((-1,))].reshape((-1,1))
        loss = T.mean(diff ** 2)

        params = lasagne.layers.helper.get_all_params(self.l_out)
        givens = {
            states: self.states_shared,
            next_states: self.next_states_shared,
            rewards: self.rewards_shared,
            actions: self.actions_shared,
#            terminals: self.terminals_shared
        }
        if self.momentum > 0:
            updates = rmsprop_nesterov(loss, params, self.lr, self.rho, self.momentum, 1e-2)
        else:
            updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho, 1e-6)
        self._train = theano.function([], [loss, q_vals], updates=updates, givens=givens)
        self._q_vals = theano.function([], q_vals, givens={ states: self.states_shared })
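The target in this example is the standard Q-learning bootstrap, rewards + gamma * max_a Q(s', a), with keepdims preserving the column shape. A tiny NumPy sketch with invented numbers:

import numpy as np

gamma = 0.99
rewards = np.array([[1.0], [0.0]], dtype=np.float32)
next_q = np.array([[0.5, 2.0, 1.0],
                   [0.1, 0.2, 0.3]], dtype=np.float32)
target = rewards + gamma * next_q.max(axis=1, keepdims=True)
print(target)   # [[2.98], [0.297]]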
Example #13
    def test_ndim_mismatch(self):
        rng = np.random.RandomState(utt.fetch_seed())
        data = rng.rand(5).astype(self.dtype)
        x = self.shared(data)
        y = tensor.col("y", self.dtype)
        cond = theano.tensor.iscalar("cond")

        with pytest.raises(TypeError):
            ifelse(cond, x, y)
        with pytest.raises(TypeError):
            ifelse(cond, y, x)
Example #14
def neural_tensor_network():
    # tensor params
    subj = T.col('e_1')
    targets = T.matrix('e_2')
    W = T.tensor3('W')

    # neural net params
    u = T.col('u')
    V = T.matrix('V')
    b = T.col('b')

    # tensor
    h = subj.T.dot(W).dot(targets)

    # neural net
    d = subj.shape[0]
    V_subj = V[:, :d].dot(subj)
    V_targ = V[:, d:].dot(targets)

    activations = T.tanh(h + V_subj + V_targ + b)
    score = u.T.dot(activations).reshape((-1, 1))

    margins = score[0] - score[1:]
    cost = T.min(T.concatenate((T.ones_like(margins), margins), axis=1), axis=1).mean()

    gsubj, gtargets, gW, gu, gV, gb = T.grad(cost, [subj, targets, W, u, V, b])

    print 'Compiling NTN score'
    score = theano.function([subj, W, targets, u, V, b], score, name='NTN Score',
                            mode='FAST_RUN')

    print 'Compiling NTN fprop'
    fprop = theano.function([subj, W, targets, u, V, b], cost, name='NTN fprop',
                            mode='FAST_RUN')

    print 'Compiling NTN bprop'
    bprop = theano.function([subj, W, targets, u, V, b],
                            outputs=[gsubj, gW, gtargets, gu, gV, gb],
                            name='NTN bprop', mode='FAST_RUN')

    return {'score': score, 'fprop': fprop, 'bprop': bprop}
Example #15
File: SMH.py Project: utunga/hashmapd
 def build_finetune_functions(self, batch_size, learning_rate):
     '''Generates a function `train` that implements one step of finetuning, a function
     `validate` that computes the error on a batch from the validation set, and a function
     `test` that computes the error on a batch from the testing set
     
     :type batch_size: int
     :param batch_size: size of a minibatch
     :type learning_rate: float
     :param learning_rate: learning rate used during finetune stage
     '''
     
     train_set_x = T.matrix('train_set_x')
     train_set_x_sums = T.col('train_set_x_sums')
     valid_set_x = T.matrix('valid_set_x')
     valid_set_x_sums = T.col('valid_set_x_sums')
     test_set_x = T.matrix('test_set_x')
     test_set_x_sums = T.col('test_set_x_sums')
     
     # compute the gradients with respect to the model parameters
     gparams = T.grad(self.finetune_cost, self.params)
     
     # compute list of fine-tuning updates
     updates = {}
     for param, gparam in zip(self.params, gparams):
         updates[param] = param - gparam*learning_rate
     
     train_fn = theano.function(inputs = [train_set_x, train_set_x_sums], 
           outputs =  self.finetune_cost, 
           updates = updates,
           givens  = { self.x : train_set_x,
                       self.x_sums : train_set_x_sums })
     
     valid_score_i = theano.function([valid_set_x, valid_set_x_sums], self.finetune_cost,
           givens  = { self.x : valid_set_x,
                       self.x_sums : valid_set_x_sums })
     
     test_score_i = theano.function([test_set_x, test_set_x_sums], self.finetune_cost,
           givens  = { self.x : test_set_x,
                       self.x_sums : test_set_x_sums })
     
     return train_fn, valid_score_i, test_score_i
Example #16
    def setup_theano(self):
        # for numpy optimization
        oneCol = T.col("oneCol")
        pi_t = T.col("pi_t")
        z_t = T.col("z_t")
        z_t1 = z_t.reshape((self.numKeypoints, 2))
        pts = T.concatenate((z_t1, oneCol), axis=1)
        A_t_ = T.matrix("A_t_")
        r_t_ = T.dot(A_t_, pts.transpose()).transpose()
        r_t1_ = r_t_[:, 0:2].reshape((2 * self.numKeypoints, 1))

        diff_ = pi_t * (r_t1_ - self.mu)
        difft_ = diff_.reshape((1, 2 * self.numKeypoints))

        cost_1 = T.dot(difft_, diff_)
        # cost_1 = theano.printing.Print('cost is:')(cost_1)
        cost_ = T.max(cost_1)

        A_t_grad_ = T.grad(cost=cost_, wrt=A_t_)
        A_t_grad_ = T.basic.set_subtensor(A_t_grad_[2, :], 0)
        self.cost = theano.function(inputs=[A_t_, pi_t, z_t, oneCol], outputs=[cost_, A_t_grad_])
Example #17
    def __init__(self, state_shape, num_actions, discount, lr, tau, l2_decay):

        self.state_shape = state_shape
        self.num_actions = num_actions
        self.discount = discount
        self.tau = tau

        # Initialize some symbolic variables to interface with graphs easier
        sym_s0 = get_symbolic_var(state_shape)('s0')
        sym_a0 = T.col('policy_actions')
        sym_s1 = get_symbolic_var(state_shape)('s1')
        sym_a1 = T.col('target_actions')
        sym_r = T.col('rewards')
        sym_t = T.col('terminal_state')
        sym_vars = [sym_s0, sym_a0, sym_s1, sym_a1, sym_r, sym_t]

        self.network = self._build_network(sym_s0, sym_a0)
        self.targets = self._build_network(sym_s1, sym_a1)
        self.update_target(tau=1.0)

        # Functions for sampling from current and target Q-functions
        q_pred = nn.get_output(self.network)
        q_target = nn.get_output(self.targets)

        self.predict_fn = theano.function([sym_s0, sym_a0], q_pred)
        self.target_fn = theano.function([sym_s1, sym_a1], q_target)

        # Calculate action gradients for updating actor / policy
        grads = T.grad(T.mean(q_pred), sym_a0)
        self.action_grads = theano.function([sym_s0, sym_a0], grads)

        # Build critic training function; loss is similar to DQN, where
        # it's the mean squared error between Q and target Q values
        yi = sym_r + (1. - sym_t) * self.discount * q_target
        loss = T.mean(T.sqr(yi - q_pred))
        loss += regularize_network_params(self.network, l2) * l2_decay

        params = nn.get_all_params(self.network, trainable=True)
        updates = lasagne.updates.adam(loss, params, lr)
        self.train_fn = theano.function(sym_vars, loss, updates=updates)
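The critic target yi above zeroes out the bootstrap term for terminal transitions via the (1 - terminal) factor. The same computation in NumPy, with made-up values:

import numpy as np

discount = 0.99
rewards   = np.array([[1.0], [2.0]], dtype=np.float32)
terminals = np.array([[0.0], [1.0]], dtype=np.float32)   # 1 drops the bootstrap term
q_target  = np.array([[3.0], [4.0]], dtype=np.float32)
yi = rewards + (1.0 - terminals) * discount * q_target
print(yi)   # [[3.97], [2.  ]]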
Example #18
   def setup_theano(self):
      #for numpy optimization
      oneCol = T.col('oneCol')
      pi_t = T.col('pi_t')
      z_t = T.col('z_t')
      z_t1 = z_t.reshape((self.numKeypoints, 2))
      pts = T.concatenate((z_t1, oneCol), axis=1)
      A_t_ = T.matrix('A_t_')
      r_t_ = T.dot(A_t_, pts.transpose()).transpose()
      r_t1_ = r_t_[:,0:2].reshape((2*self.numKeypoints,1))

      diff_ = pi_t * (r_t1_ - self.mu)
      difft_ = diff_.reshape((1, 2 * self.numKeypoints))
      
      cost_1 = T.dot(difft_,diff_)
      #cost_1 = theano.printing.Print('cost is:')(cost_1)
      cost_ = T.max(cost_1)
   
      A_t_grad_ = T.grad(cost=cost_, wrt=A_t_)
      A_t_grad_ = T.basic.set_subtensor(A_t_grad_[2,:],0)
      self.cost = theano.function(inputs=[A_t_, pi_t, z_t, oneCol],
                                  outputs=[cost_, A_t_grad_])
Example #19
File: SMH.py Project: utunga/hashmapd
    def pretraining_functions(self, batch_size, method, pretrain_lr, k):
        ''' Generates a list of functions, for performing one step of gradient descent at a
        given layer. The function will require as input a minibatch of data, and to train an
        RBM you just need to iterate, calling the corresponding function on all minibatches.
        
        :type batch_size: int
        :param batch_size: size of a [mini]batch
        :type method: string
        :param method: type of Gibbs sampling to perform: 'cd' (default) or 'pcd'
        :type k: int
        :param k: number of Gibbs steps to do in CD-k / PCD-k
        :type pretrain_lr: float
        :param pretrain_lr: the learning rate to use during the pretraining phase
        '''

        learning_rate = T.scalar('lr')    # learning rate to use
        #learning_rate.value = pretrain_lr

        # i *think* the following is equivalent to above.. doing this because i can't see where lr gets a value at all
        #learning_rate = theano.shared(pretrain_lr, 'learning_rate')
        train_set_x = T.matrix('train_set_x')
        train_set_x_sums = T.col('train_set_x_sums')

        pretrain_fns = []
        for rbm in self.rbm_layers:
            if method == 'pcd':
                # initialize storage for the persistent chain (state = hidden layer of chain)
                persistent_chain = theano.shared(numpy.zeros((batch_size,rbm.n_hidden),dtype=theano.config.floatX))
                # get the cost and the gradient corresponding to one step of PCD-k
                cost,updates = rbm.get_cost_updates(lr=learning_rate, persistent=persistent_chain, k=k)
            else:
                # default = use CD instead
                cost,updates = rbm.get_cost_updates(lr=learning_rate)
            
            # compile the theano function    
            fn = theano.function(inputs = [train_set_x,train_set_x_sums,
                        theano.Param(learning_rate, default = 0.1)],
                    outputs = cost,
                    updates = updates,
                    givens  = {self.x:train_set_x,
                               self.x_sums:train_set_x_sums}
                    # uncomment the following line to perform debugging:
                    #   ,mode=theano.compile.debugmode.DebugMode(stability_patience=5)
                    )
            
            # append `fn` to the list of functions
            pretrain_fns.append(fn)

        return pretrain_fns
Example #20
def transE_model():
    '''
        Note X_S is a column and X_T is a matrix so that broadcasting occurs
        across the columns of X_T (this allows batching X_T with negatives,
        for example).
    '''
    # construct theano expression graph
    X_s = T.col('X_s')
    W = T.matrix('W')
    X_t = T.matrix('X_t')

    rels = W[:, :, None].transpose(1, 0, 2)

    # Computes x_{r_1} + x_{r_{2}} + ... + x_{r_n} - X_{t}
    results, updates = theano.scan(fn=lambda rel, v: rel + v,
                                   outputs_info=-X_t, sequences=[rels])

    # score is always a column vector
    score = T.sum((X_s + results[-1]) ** 2, axis=0).reshape((-1, 1))

    margins = 1. + score[0] - score[1:]

    # zero out negative entries
    pos_parts = margins * (margins > 0)

    # we are using online Maximizer, so the objective is negated
    cost = -pos_parts.mean()

    gX_s, gW, gX_t = T.grad(cost, [X_s, W, X_t])

    print 'Compiling TransE score'
    # return negative score since this is a ranking
    score = theano.function([X_s, W, X_t], -score, name='transE Score',
                            mode='FAST_RUN')
    score.trust_input = True

    print 'Compiling TransE fprop'
    fprop = theano.function([X_s, W, X_t], cost, name='transE fprop',
                            mode='FAST_RUN')
    fprop.trust_input = True

    print 'Compiling TransE bprop'
    bprop = theano.function([X_s, W, X_t],
                            outputs=[gX_s, gW, gX_t],
                            name='transE bprop', mode='FAST_RUN')
    bprop.trust_input = True

    return {'score': score, 'fprop': fprop, 'bprop': bprop}
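As the docstring notes, keeping X_s as a column lets it broadcast across the columns of X_t, so one source embedding is scored against a whole batch of targets at once. A NumPy sketch with assumed dimensions:

import numpy as np

d, n = 3, 4
X_s = np.arange(d, dtype=np.float32).reshape(d, 1)       # one embedding as a column
X_t = np.ones((d, n), dtype=np.float32)                  # n target embeddings as columns
score = ((X_s - X_t) ** 2).sum(axis=0).reshape(-1, 1)    # one score per target, column shaped
print(score.shape)                                       # (4, 1)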
Example #21
    def test_wrong_broadcast(self):
        a = tt.col()
        increment = tt.vector()

        # These symbolic graphs are legitimate, as long as increment has exactly
        # one element. So it should fail at runtime, not at compile time.
        rng = numpy.random.RandomState(utt.fetch_seed())

        def rng_randX(*shape):
            return rng.rand(*shape).astype(theano.config.floatX)

        for op in (tt.set_subtensor, tt.inc_subtensor):
            for base in (a[:], a[0]):
                out = op(base, increment)
                f = theano.function([a, increment], out)
                # This one should work
                f(rng_randX(3, 1), rng_randX(1))
                # These ones should not
                self.assertRaises(ValueError, f, rng_randX(3, 1), rng_randX(2))
                self.assertRaises(ValueError, f, rng_randX(3, 1), rng_randX(3))
                self.assertRaises(ValueError, f, rng_randX(3, 1), rng_randX(0))
Example #23
    def __init__(self, env, args, rng, name = "DQNLasagne"):
        """ Initializes a network based on the Lasagne Theano framework.

        Args:
            env (AtariEnv): The environment in which the agent actuates.
            args (argparse.Namespace): All settings either with a default value or set via command line arguments.
            rng (mtrand.RandomState): Initialized Mersenne Twister pseudo-random number generator.
            name (str): The name of the network object.

        Note:
            This function should always call the base class first to initialize
            the common values for the networks.
        """
        _logger.info("Initialize object of type " + str(type(self).__name__))
        super(DQNLasagne, self).__init__(env, args, rng, name)
        self.input_shape = (self.batch_size, self.sequence_length, args.frame_width, args.frame_height)
        self.dummy_batch = np.zeros(self.input_shape, dtype=np.uint8)
        lasagne.random.set_rng(self.rng)

        self.network = self._create_layer()

        # TODO: Load weights from pretrained network?!
        if self.args.load_weights is not None:
            self.load_weights(self.args.load_weights)

        if self.target_update_frequency > 0:
            self.target_network = self._create_layer()
            self._copy_theta()

        states = T.tensor4('states')
        followup_states = T.tensor4('followup_states')
        rewards = T.col('rewards')
        actions = T.icol('actions')
        terminals = T.icol('terminals')

        self.states_shared = theano.shared(
                np.zeros(self.input_shape, dtype=theano.config.floatX)
        )
        self.followup_states_shared = theano.shared(
                np.zeros(self.input_shape, dtype=theano.config.floatX)
        )
        self.rewards_shared = theano.shared(
                np.zeros((self.batch_size, 1), dtype=theano.config.floatX),
                broadcastable=(False, True)
        )
        self.actions_shared = theano.shared(
                np.zeros((self.batch_size, 1), dtype='int32'),
                broadcastable=(False, True)
        )
        self.terminals_shared = theano.shared(
                np.zeros((self.batch_size, 1), dtype='int32'),
                broadcastable=(False, True)
        )

        qvalues = lasagne.layers.get_output(
                self.network,
                self._prepare_network_input(states)
        )

        if self.target_update_frequency > 0:
            qvalues_followup_states = lasagne.layers.get_output(
                    self.target_network,
                    self._prepare_network_input(followup_states)
            )
        else:
            qvalues_followup_states = lasagne.layers.get_output(
                    self.network,
                    self._prepare_network_input(followup_states)
            )
            qvalues_followup_states = theano.gradient.disconnected_grad(qvalues_followup_states)

        targets = (rewards +
                (T.ones_like(terminals) - terminals) *
                self.discount_rate *
                T.max(qvalues_followup_states, axis=1, keepdims=True)
        )
        errors = targets - qvalues[
                T.arange(self.batch_size),
                actions.reshape((-1,))].reshape((-1, 1))

        if self.clip_error > 0:
            quadratic_part = T.minimum(abs(errors), self.clip_error)
            linear_part = abs(errors) - quadratic_part
            cost_function = T.sum(0.5 * quadratic_part ** 2 + self.clip_error * linear_part)
        else:
            cost_function = T.sum(0.5 * errors ** 2)

        self.params = lasagne.layers.helper.get_all_params(self.network)
        self.observations = {
            states: self.states_shared,
            followup_states: self.followup_states_shared,
            rewards: self.rewards_shared,
            actions: self.actions_shared,
            terminals: self.terminals_shared
        }

        self._set_optimizer(cost_function)

        if self.momentum > 0:
            self.optimizer = lasagne.updates.apply_momentum(
                    self.optimizer,
                    None,
                    self.momentum
            )
        _logger.debug("Compiling _theano_train")
        self._theano_train = theano.function(
                [],
                [cost_function, qvalues],
                updates=self.optimizer,
                givens=self.observations)
        _logger.debug("Compiling _theano_get_Q")
        self._theano_get_Q = theano.function(
                [],
                qvalues,
                givens={states: self.states_shared})

        self.callback = None
        _logger.debug("%s" % self)
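When clip_error is set, the cost above splits each TD error into a quadratic part (up to the clip value) and a linear tail, i.e. a Huber-style loss. The same computation in NumPy with an assumed clip of 1.0:

import numpy as np

clip = 1.0
errors = np.array([0.5, 2.0, -3.0])
quadratic = np.minimum(np.abs(errors), clip)   # squared region, capped at clip
linear = np.abs(errors) - quadratic            # linear region beyond clip
cost = np.sum(0.5 * quadratic ** 2 + clip * linear)
print(cost)   # 0.125 + 1.5 + 2.5 = 4.125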
Example #24
    def __init__(self, n_in, n_out, state_bounds, action_bounds, reward_bound,
                 settings_):

        super(DeepNNDropoutCritic,
              self).__init__(n_in, n_out, state_bounds, action_bounds,
                             reward_bound, settings_)
        self._dropout_p = settings_['dropout_p']

        # data types for model
        self._State = T.matrix("State")
        self._State.tag.test_value = np.random.rand(self._batch_size,
                                                    self._state_length)
        self._ResultState = T.matrix("ResultState")
        self._ResultState.tag.test_value = np.random.rand(
            self._batch_size, self._state_length)
        self._Reward = T.col("Reward")
        self._Reward.tag.test_value = np.random.rand(self._batch_size, 1)
        self._Action = T.matrix("Action")
        self._Action.tag.test_value = np.random.rand(self._batch_size,
                                                     self._action_length)
        # create a small fully-connected (dense) network
        input = lasagne.layers.InputLayer((None, self._state_length),
                                          self._State)
        self._stateInputVar = input.input_var
        # network = lasagne.layers.DropoutLayer(network, p=self._dropout_p, rescale=True)
        """
        network = lasagne.layers.DenseLayer(
                network, num_units=256,
                nonlinearity=lasagne.nonlinearities.leaky_rectify)
        network = lasagne.layers.DropoutLayer(network, p=self._dropout_p, rescale=True)
        """
        network = lasagne.layers.DenseLayer(
            input,
            num_units=128,
            nonlinearity=lasagne.nonlinearities.leaky_rectify)
        network = lasagne.layers.DropoutLayer(network,
                                              p=self._dropout_p,
                                              rescale=True)

        network = lasagne.layers.DenseLayer(
            network,
            num_units=64,
            nonlinearity=lasagne.nonlinearities.leaky_rectify)
        network = lasagne.layers.DropoutLayer(network,
                                              p=self._dropout_p,
                                              rescale=True)

        network = lasagne.layers.DenseLayer(
            network,
            num_units=32,
            nonlinearity=lasagne.nonlinearities.leaky_rectify)
        network = lasagne.layers.DropoutLayer(network,
                                              p=self._dropout_p,
                                              rescale=True)

        network = lasagne.layers.DenseLayer(
            network,
            num_units=16,
            nonlinearity=lasagne.nonlinearities.leaky_rectify)
        # network = lasagne.layers.DropoutLayer(network, p=self._dropout_p, rescale=True)

        self._critic = lasagne.layers.DenseLayer(
            network, num_units=1, nonlinearity=lasagne.nonlinearities.linear)
        # self._b_o = init_b_weights((n_out,))
        # networkAct = lasagne.layers.InputLayer((None, self._state_length), self._State)
        # networkAct = lasagne.layers.DropoutLayer(networkAct, p=self._dropout_p, rescale=True)
        """
        networkAct = lasagne.layers.DenseLayer(
                networkAct, num_units=256,
                nonlinearity=lasagne.nonlinearities.leaky_rectify)
        network = lasagne.layers.DropoutLayer(network, p=self._dropout_p, rescale=True)
        """
        networkAct = lasagne.layers.DenseLayer(
            input,
            num_units=128,
            nonlinearity=lasagne.nonlinearities.leaky_rectify)
        # networkAct = lasagne.layers.DropoutLayer(networkAct, p=self._dropout_p, rescale=True)

        networkAct = lasagne.layers.DenseLayer(
            networkAct,
            num_units=64,
            nonlinearity=lasagne.nonlinearities.leaky_rectify)
        # networkAct = lasagne.layers.DropoutLayer(networkAct, p=self._dropout_p, rescale=True)

        networkAct = lasagne.layers.DenseLayer(
            networkAct,
            num_units=32,
            nonlinearity=lasagne.nonlinearities.leaky_rectify)
        # networkAct = lasagne.layers.DropoutLayer(networkAct, p=self._dropout_p, rescale=True)

        self._actor = lasagne.layers.DenseLayer(
            networkAct,
            num_units=self._action_length,
            nonlinearity=lasagne.nonlinearities.linear)
        # self._b_o = init_b_weights((n_out,))

        # print "Initial W " + str(self._w_o.get_value())

        self._states_shared = theano.shared(
            np.zeros((self._batch_size, self._state_length),
                     dtype=theano.config.floatX))

        self._next_states_shared = theano.shared(
            np.zeros((self._batch_size, self._state_length),
                     dtype=theano.config.floatX))

        self._rewards_shared = theano.shared(np.zeros(
            (self._batch_size, 1), dtype=theano.config.floatX),
                                             broadcastable=(False, True))

        self._actions_shared = theano.shared(
            np.zeros((self._batch_size, self._action_length),
                     dtype=theano.config.floatX), )
Example #25
    def __init__(self, n_in, n_out):

        batch_size = 32
        state_length = n_in
        action_length = n_out
        # data types for model
        State = T.dmatrix("State")
        State.tag.test_value = np.random.rand(batch_size, state_length)
        ResultState = T.dmatrix("ResultState")
        ResultState.tag.test_value = np.random.rand(batch_size, state_length)
        Reward = T.col("Reward")
        Reward.tag.test_value = np.random.rand(batch_size, 1)
        Action = T.dmatrix("Action")
        Action.tag.test_value = np.random.rand(batch_size, action_length)
        # create a small fully-connected (dense) network
        inputLayerA = lasagne.layers.InputLayer((None, state_length), State)

        l_hid1A = lasagne.layers.DenseLayer(
            inputLayerA,
            num_units=128,
            nonlinearity=lasagne.nonlinearities.rectify)

        l_hid2A = lasagne.layers.DenseLayer(
            l_hid1A, num_units=64, nonlinearity=lasagne.nonlinearities.rectify)

        l_hid3A = lasagne.layers.DenseLayer(
            l_hid2A, num_units=32, nonlinearity=lasagne.nonlinearities.rectify)

        self._l_outA = lasagne.layers.DenseLayer(
            l_hid3A, num_units=1, nonlinearity=lasagne.nonlinearities.linear)
        # self._b_o = init_b_weights((n_out,))
        inputLayerActA = lasagne.layers.InputLayer((None, state_length), State)

        l_hid1ActA = lasagne.layers.DenseLayer(
            inputLayerActA,
            num_units=128,
            nonlinearity=lasagne.nonlinearities.leaky_rectify)

        l_hid2ActA = lasagne.layers.DenseLayer(
            l_hid1ActA,
            num_units=64,
            nonlinearity=lasagne.nonlinearities.leaky_rectify)

        l_hid3ActA = lasagne.layers.DenseLayer(
            l_hid2ActA,
            num_units=32,
            nonlinearity=lasagne.nonlinearities.leaky_rectify)

        self._l_outActA = lasagne.layers.DenseLayer(
            l_hid3ActA,
            num_units=action_length,
            nonlinearity=lasagne.nonlinearities.linear)
        # self._b_o = init_b_weights((n_out,))

        # self.updateTargetModel()
        inputLayerB = lasagne.layers.InputLayer((None, state_length), State)

        l_hid1B = lasagne.layers.DenseLayer(
            inputLayerB,
            num_units=128,
            nonlinearity=lasagne.nonlinearities.rectify)

        l_hid2B = lasagne.layers.DenseLayer(
            l_hid1B, num_units=64, nonlinearity=lasagne.nonlinearities.rectify)

        l_hid3B = lasagne.layers.DenseLayer(
            l_hid2B, num_units=32, nonlinearity=lasagne.nonlinearities.rectify)

        self._l_outB = lasagne.layers.DenseLayer(
            l_hid3B, num_units=1, nonlinearity=lasagne.nonlinearities.linear)

        inputLayerActB = lasagne.layers.InputLayer((None, state_length), State)

        l_hid1ActB = lasagne.layers.DenseLayer(
            inputLayerActB,
            num_units=128,
            nonlinearity=lasagne.nonlinearities.leaky_rectify)

        l_hid2ActB = lasagne.layers.DenseLayer(
            l_hid1ActB,
            num_units=64,
            nonlinearity=lasagne.nonlinearities.leaky_rectify)

        l_hid3ActB = lasagne.layers.DenseLayer(
            l_hid2ActB,
            num_units=32,
            nonlinearity=lasagne.nonlinearities.leaky_rectify)

        self._l_outActB = lasagne.layers.DenseLayer(
            l_hid3ActB,
            num_units=n_out,
            nonlinearity=lasagne.nonlinearities.linear)

        # print ("Initial W " + str(self._w_o.get_value()) )

        self._learning_rate = 0.001
        self._discount_factor = 0.8
        self._rho = 0.95
        self._rms_epsilon = 0.001

        self._weight_update_steps = 5000
        self._updates = 0

        self._states_shared = theano.shared(
            np.zeros((batch_size, state_length), dtype=theano.config.floatX))

        self._next_states_shared = theano.shared(
            np.zeros((batch_size, state_length), dtype=theano.config.floatX))

        self._rewards_shared = theano.shared(np.zeros(
            (batch_size, 1), dtype=theano.config.floatX),
                                             broadcastable=(False, True))

        self._actions_shared = theano.shared(
            np.zeros((batch_size, n_out), dtype=theano.config.floatX), )

        self._q_valsA = lasagne.layers.get_output(self._l_outA, State)
        self._q_valsB = lasagne.layers.get_output(self._l_outB, ResultState)

        self._q_valsActA = lasagne.layers.get_output(self._l_outActA, State)
        self._q_valsActB = lasagne.layers.get_output(self._l_outActB, State)

        self._q_func = self._q_valsA
        self._q_funcAct = self._q_valsActA
        # self._q_funcAct = theano.function(inputs=[State], outputs=self._q_valsActA, allow_input_downcast=True)

        target = (Reward + self._discount_factor * self._q_valsB)
        diff = target - self._q_valsA
        loss = 0.5 * diff**2 + (
            1e-6 * lasagne.regularization.regularize_network_params(
                self._l_outA, lasagne.regularization.l2))
        loss = T.mean(loss)

        params = lasagne.layers.helper.get_all_params(self._l_outA)
        actionParams = lasagne.layers.helper.get_all_params(self._l_outActA)
        givens_ = {
            State: self._states_shared,
            ResultState: self._next_states_shared,
            Reward: self._rewards_shared,
            # Action: self._actions_shared,
        }
        actGivens = {
            State: self._states_shared,
            # ResultState: self._next_states_shared,
            # Reward: self._rewards_shared,
            Action: self._actions_shared,
        }

        # SGD update
        #updates_ = lasagne.updates.rmsprop(loss, params, self._learning_rate, self._rho,
        #                                    self._rms_epsilon)
        # TD update
        updates_ = lasagne.updates.rmsprop(
            T.mean(self._q_func) +
            (1e-6 * lasagne.regularization.regularize_network_params(
                self._l_outA, lasagne.regularization.l2)), params,
            self._learning_rate * -T.mean(diff), self._rho, self._rms_epsilon)

        # actDiff1 = (Action - self._q_valsActB) #TODO is this correct?
        # actDiff = (actDiff1 - (Action - self._q_valsActA))
        actDiff = ((Action - self._q_valsActA)
                   )  # Target network does not work well here?
        actLoss = 0.5 * actDiff**2 + (
            1e-4 * lasagne.regularization.regularize_network_params(
                self._l_outActA, lasagne.regularization.l2))
        actLoss = T.sum(actLoss) / float(batch_size)

        # actionUpdates = lasagne.updates.rmsprop(actLoss +
        #    (1e-4 * lasagne.regularization.regularize_network_params(
        #        self._l_outActA, lasagne.regularization.l2)), actionParams,
        #            self._learning_rate * 0.01 * (-actLoss), self._rho, self._rms_epsilon)

        actionUpdates = lasagne.updates.rmsprop(
            T.mean(self._q_funcAct) +
            (1e-4 * lasagne.regularization.regularize_network_params(
                self._l_outActA, lasagne.regularization.l2)), actionParams,
            self._learning_rate * 0.5 * (-T.sum(actDiff) / float(batch_size)),
            self._rho, self._rms_epsilon)

        self._train = theano.function([], [loss, self._q_valsA],
                                      updates=updates_,
                                      givens=givens_)
        self._trainActor = theano.function([], [actLoss, self._q_valsActA],
                                           updates=actionUpdates,
                                           givens=actGivens)
        self._q_val = theano.function([],
                                      self._q_valsA,
                                      givens={State: self._states_shared})
        self._q_action = theano.function([],
                                         self._q_valsActA,
                                         givens={State: self._states_shared})
        self._bellman_error = theano.function(
            inputs=[State, Reward, ResultState],
            outputs=diff,
            allow_input_downcast=True)
Example #26
    def __init__(self,
                 num_actions,
                 phi_length,
                 width,
                 height,
                 discount=.9,
                 learning_rate=.01,
                 batch_size=32,
                 approximator='none'):
        self._batch_size = batch_size
        self._num_input_features = phi_length
        self._phi_length = phi_length
        self._img_width = width
        self._img_height = height
        self._discount = discount
        self.num_actions = num_actions
        self.learning_rate = learning_rate
        self.scale_input_by = 255.0

        print "neural net initialization, lr is: ", self.learning_rate, approximator

        # CONSTRUCT THE LAYERS
        self.q_layers = []
        self.q_layers.append(
            layers.Input2DLayer(self._batch_size, self._num_input_features,
                                self._img_height, self._img_width,
                                self.scale_input_by))

        if approximator == 'cuda_conv':
            self.q_layers.append(
                cc_layers.ShuffleBC01ToC01BLayer(self.q_layers[-1]))
            self.q_layers.append(
                cc_layers.CudaConvnetConv2DLayer(self.q_layers[-1],
                                                 n_filters=16,
                                                 filter_size=8,
                                                 stride=4,
                                                 weights_std=.01,
                                                 init_bias_value=0.1))
            self.q_layers.append(
                cc_layers.CudaConvnetConv2DLayer(self.q_layers[-1],
                                                 n_filters=32,
                                                 filter_size=4,
                                                 stride=2,
                                                 weights_std=.01,
                                                 init_bias_value=0.1))
            self.q_layers.append(
                cc_layers.ShuffleC01BToBC01Layer(self.q_layers[-1]))

        elif approximator == 'conv':
            self.q_layers.append(
                layers.StridedConv2DLayer(self.q_layers[-1],
                                          n_filters=16,
                                          filter_width=8,
                                          filter_height=8,
                                          stride_x=4,
                                          stride_y=4,
                                          weights_std=.01,
                                          init_bias_value=0.01))

            self.q_layers.append(
                layers.StridedConv2DLayer(self.q_layers[-1],
                                          n_filters=32,
                                          filter_width=4,
                                          filter_height=4,
                                          stride_x=2,
                                          stride_y=2,
                                          weights_std=.01,
                                          init_bias_value=0.01))
        if approximator == 'cuda_conv' or approximator == 'conv':

            self.q_layers.append(
                layers.DenseLayer(self.q_layers[-1],
                                  n_outputs=256,
                                  weights_std=0.01,
                                  init_bias_value=0.1,
                                  dropout=0,
                                  nonlinearity=layers.rectify))

            self.q_layers.append(
                layers.DenseLayer(self.q_layers[-1],
                                  n_outputs=num_actions,
                                  weights_std=0.01,
                                  init_bias_value=0.1,
                                  dropout=0,
                                  nonlinearity=layers.identity))

        if approximator == 'none':
            self.q_layers.append(\
                layers.DenseLayerNoBias(self.q_layers[-1],
                                        n_outputs=num_actions,
                                        weights_std=0.00,
                                        dropout=0,
                                        nonlinearity=layers.identity))

        self.q_layers.append(layers.OutputLayer(self.q_layers[-1]))

        for i in range(len(self.q_layers) - 1):
            print self.q_layers[i].get_output_shape()

        # Now create a network (using the same weights)
        # for next state q values
        self.next_layers = copy_layers(self.q_layers)
        self.next_layers[0] = layers.Input2DLayer(self._batch_size,
                                                  self._num_input_features,
                                                  self._img_width,
                                                  self._img_height,
                                                  self.scale_input_by)
        self.next_layers[1].input_layer = self.next_layers[0]

        self.rewards = T.col()
        self.actions = T.icol()

        # Build the loss function ...
        print "building loss funtion"
        q_vals = self.q_layers[-1].predictions()
        next_q_vals = self.next_layers[-1].predictions()
        next_maxes = T.max(next_q_vals, axis=1, keepdims=True)
        target = self.rewards + discount * next_maxes
        target = theano.gradient.consider_constant(target)
        diff = target - q_vals
        # Zero out all entries for actions that were not chosen...
        mask = build_mask(T.zeros_like(diff), self.actions, 1.0)
        diff_masked = diff * mask
        error = T.mean(diff_masked**2)
        self._loss = error * diff_masked.shape[1]  #

        self._parameters = layers.all_parameters(self.q_layers[-1])

        self._idx = T.lscalar('idx')

        # CREATE VARIABLES FOR INPUT AND OUTPUT
        self.states_shared = theano.shared(
            np.zeros((1, 1, 1, 1), dtype=theano.config.floatX))
        self.states_shared_next = theano.shared(
            np.zeros((1, 1, 1, 1), dtype=theano.config.floatX))
        self.rewards_shared = theano.shared(np.zeros(
            (1, 1), dtype=theano.config.floatX),
                                            broadcastable=(False, True))
        self.actions_shared = theano.shared(np.zeros((1, 1), dtype='int32'),
                                            broadcastable=(False, True))

        self._givens = \
            {self.q_layers[0].input_var:
             self.states_shared[self._idx*self._batch_size:
                                (self._idx+1)*self._batch_size, :, :, :],
             self.next_layers[0].input_var:
             self.states_shared_next[self._idx*self._batch_size:
                                     (self._idx+1)*self._batch_size, :, :, :],

             self.rewards:
             self.rewards_shared[self._idx*self._batch_size:
                                 (self._idx+1)*self._batch_size, :],
             self.actions:
             self.actions_shared[self._idx*self._batch_size:
                                 (self._idx+1)*self._batch_size, :]
             }

        self._updates = layers.gen_updates_rmsprop_and_nesterov_momentum(\
            self._loss, self._parameters, learning_rate=self.learning_rate,
            rho=0.9, momentum=0.9, epsilon=1e-6)

        self._train = theano.function([self._idx],
                                      self._loss,
                                      givens=self._givens,
                                      updates=self._updates)
        self._compute_loss = theano.function([self._idx],
                                             self._loss,
                                             givens=self._givens)
        self._compute_q_vals = \
            theano.function([self.q_layers[0].input_var],
                            self.q_layers[-1].predictions(),
                            on_unused_input='ignore')
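
The `_givens` dictionary in the example above implements a common Theano pattern: the whole training set lives in shared variables (typically on the GPU) and a single integer index selects the minibatch at call time, so no array data is copied per call. A minimal self-contained sketch of that pattern with toy data (the names below are hypothetical and not part of the class above):

# Minimal sketch of index-sliced givens (toy data; not part of the class above).
import numpy as np
import theano
import theano.tensor as T

batch_size = 4
data = theano.shared(np.arange(20, dtype=theano.config.floatX).reshape(20, 1))

idx = T.lscalar('idx')
x = T.matrix('x')
mean_x = T.mean(x)

# 'givens' substitutes x with an index-selected slice of the shared data,
# so only the scalar index crosses the Python boundary on each call.
f = theano.function([idx], mean_x,
                    givens={x: data[idx * batch_size:(idx + 1) * batch_size]})

print(f(0))  # mean of rows 0..3
print(f(1))  # mean of rows 4..7
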
Example #27
File: network.py Project: hercky/a3c
    def __init__(self, num_actions):
        
        # remember parameters
        self.num_actions = num_actions
        self.batch_size = BATCH_SIZE
        self.discount_rate = DISCOUNT_RATE
        self.history_length = HISTORY_LENGTH
        self.screen_dim = DIMS
        self.img_height = SCREEN_HEIGHT
        self.img_width = SCREEN_WIDTH
        self.clip_error = CLIP_ERROR
        self.input_color_scale = COLOR_SCALE

        self.target_steps = TARGET_STEPS
        self.train_iterations = TRAIN_STEPS
        self.train_counter = 0
        self.momentum = MOMENTUM
        self.update_rule = UPDATE_RULE
        self.learning_rate = LEARNING_RATE
        self.rms_decay = RMS_DECAY
        self.rms_epsilon = RMS_EPSILON        
        
        self.rng = np.random.RandomState(RANDOM_SEED)

        # set seed
        lasagne.random.set_rng(self.rng)

        # prepare tensors once and reuse them
        states = T.tensor4('states')
        next_states = T.tensor4('next_states')
        rewards = T.col('rewards')
        actions = T.icol('actions')
        # terminals are bool for our case
        terminals = T.bcol('terminals')

        # create shared theano variables
        self.states_shared = theano.shared(
            np.zeros((self.batch_size, self.history_length, self.img_height, self.img_width),
                     dtype=theano.config.floatX))

        self.next_states_shared = theano.shared(
            np.zeros((self.batch_size, self.history_length, self.img_height, self.img_width),
                     dtype=theano.config.floatX))

        # broadcastable=(False, True) lets these (batch, 1) columns broadcast
        # against (batch, num_actions) tensors
        self.rewards_shared = theano.shared(
            np.zeros((self.batch_size, 1), dtype=theano.config.floatX),
            broadcastable=(False, True))

        self.actions_shared = theano.shared(
            np.zeros((self.batch_size, 1), dtype='int32'),
            broadcastable=(False, True))

        self.terminals_shared = theano.shared(
            #np.zeros((self.batch_size, 1), dtype='int32'),
            np.zeros((self.batch_size, 1), dtype='int8'),
            broadcastable=(False, True))

        # can add multiple nets here
        self.l_primary = self.build_network()

        if self.target_steps > 0:
            self.l_secondary = self.build_network()
            self.copy_to_secondary()

        
        """
        # input scale i.e. division can be applied to input directly also to normalize
        """

        # define output symbols
        q_vals = lasagne.layers.get_output(self.l_primary, states / self.input_color_scale)
        
        if self.target_steps > 0:
            q_vals_secondary = lasagne.layers.get_output(self.l_secondary, next_states / self.input_color_scale)
        else:
            # no separate target network: bootstrap from the primary network,
            # but block gradients through the next-state Q-values
            q_vals_secondary = lasagne.layers.get_output(self.l_primary, next_states / self.input_color_scale)
            q_vals_secondary = theano.gradient.disconnected_grad(q_vals_secondary)

        # target = r + (1 - terminal) * discount * max_a' Q(s', a')
        target = (rewards + (T.ones_like(terminals) - terminals) * self.discount_rate * T.max(q_vals_secondary, axis=1, keepdims=True))
        
        """
        # check what this does
        """
        diff = target - q_vals[T.arange(self.batch_size),
                               actions.reshape((-1,))].reshape((-1, 1))

        # print shape ? 

        if self.clip_error > 0:
            # If we simply take the squared clipped diff as our loss,
            # then the gradient will be zero whenever the diff exceeds
            # the clip bounds. To avoid this, we extend the loss
            # linearly past the clip point to keep the gradient constant
            # in that regime.
            # 
            # This is equivalent to declaring d loss/d q_vals to be
            # equal to the clipped diff, then backpropagating from
            # there, which is what the DeepMind implementation does.
            quadratic_part = T.minimum(abs(diff), self.clip_error)
            linear_part = abs(diff) - quadratic_part
            loss = 0.5 * quadratic_part ** 2 + self.clip_error * linear_part
        else:
            loss = 0.5 * diff ** 2

        loss = T.sum(loss)
        
        params = lasagne.layers.helper.get_all_params(self.l_primary)  
        
        givens = {
            states: self.states_shared,
            next_states: self.next_states_shared,
            rewards: self.rewards_shared,
            actions: self.actions_shared,
            terminals: self.terminals_shared
        }

        g_time = time.time()
        logger.info("graph compiling")


        if self.update_rule == 'deepmind_rmsprop':
            updates = deepmind_rmsprop(loss, params, self.learning_rate, self.rms_decay,
                                       self.rms_epsilon)
        elif self.update_rule == 'rmsprop':
            updates = lasagne.updates.rmsprop(loss, params, self.learning_rate, self.rms_decay,
                                              self.rms_epsilon)
        else:
            raise ValueError("Unrecognized update: {}".format(update_rule))

        if self.momentum > 0:
            updates = lasagne.updates.apply_momentum(updates, None,
                                                     self.momentum)

        self._train = theano.function([], [loss, q_vals], updates=updates,
                                      givens=givens)
        self._q_vals = theano.function([], q_vals,
                                       givens={states: self.states_shared})

        logger.info("Theano Graph Compiled !! %f", time.time() - g_time)
Example #28
    def __init__(self, num_actions, phi_length, width, height,
                 discount, learning_rate, decay, momentum=0,
                 batch_size=32,
                 approximator='none'):
        self._batch_size = batch_size
        self._num_input_features = phi_length
        self._phi_length = phi_length
        self._img_width = width
        self._img_height = height
        self._discount = discount
        self.num_actions = num_actions
        self.learning_rate = learning_rate
        self.decay = decay
        self.momentum = momentum
        self.scale_input_by = 255.0

        # CONSTRUCT THE LAYERS
        self.q_layers = []
        self.q_layers.append(layers.Input2DLayer(self._batch_size,
                                               self._num_input_features,
                                               self._img_height,
                                               self._img_width,
                                               self.scale_input_by))

        if approximator == 'cuda_conv':
            self.q_layers.append(cc_layers.ShuffleBC01ToC01BLayer(
                    self.q_layers[-1]))
            self.q_layers.append(
                cc_layers.CudaConvnetConv2DLayer(self.q_layers[-1],
                                                 n_filters=16,
                                                 filter_size=8,
                                                 stride=4,
                                                 weights_std=.01,
                                                 init_bias_value=0.1))
            self.q_layers.append(
                cc_layers.CudaConvnetConv2DLayer(self.q_layers[-1],
                                                 n_filters=32,
                                                 filter_size=4,
                                                 stride=2,
                                                 weights_std=.01,
                                                 init_bias_value=0.1))
            self.q_layers.append(cc_layers.ShuffleC01BToBC01Layer(
                    self.q_layers[-1]))

        elif approximator == 'conv':
            self.q_layers.append(layers.StridedConv2DLayer(self.q_layers[-1],
                                                         n_filters=16,
                                                         filter_width=8,
                                                         filter_height=8,
                                                         stride_x=4,
                                                         stride_y=4,
                                                         weights_std=.01,
                                                         init_bias_value=0.01))

            self.q_layers.append(layers.StridedConv2DLayer(self.q_layers[-1],
                                                         n_filters=32,
                                                         filter_width=4,
                                                         filter_height=4,
                                                         stride_x=2,
                                                         stride_y=2,
                                                         weights_std=.01,
                                                         init_bias_value=0.01))
        if approximator == 'cuda_conv' or approximator == 'conv':

            self.q_layers.append(layers.DenseLayer(self.q_layers[-1],
                                                   n_outputs=256,
                                                   weights_std=0.01,
                                                   init_bias_value=0.1,
                                                   dropout=0,
                                                   nonlinearity=layers.rectify))

            self.q_layers.append(
                layers.DenseLayer(self.q_layers[-1],
                                  n_outputs=num_actions,
                                  weights_std=0.01,
                                  init_bias_value=0.1,
                                  dropout=0,
                                  nonlinearity=layers.identity))


        if approximator == 'none':
            self.q_layers.append(\
                layers.DenseLayerNoBias(self.q_layers[-1],
                                        n_outputs=num_actions,
                                        weights_std=0.00,
                                        dropout=0,
                                        nonlinearity=layers.identity))


        self.q_layers.append(layers.OutputLayer(self.q_layers[-1]))

        for i in range(len(self.q_layers)-1):
            print self.q_layers[i].get_output_shape()


        # Now create a network (using the same weights)
        # for next state q values
        self.next_layers = copy_layers(self.q_layers)
        self.next_layers[0] = layers.Input2DLayer(self._batch_size,
                                                  self._num_input_features,
                                                  self._img_width,
                                                  self._img_height,
                                                  self.scale_input_by)
        self.next_layers[1].input_layer = self.next_layers[0]

        self.rewards = T.col()
        self.actions = T.icol()

        # Build the loss function ...
        q_vals = self.q_layers[-1].predictions()
        next_q_vals = self.next_layers[-1].predictions()
        next_maxes = T.max(next_q_vals, axis=1, keepdims=True)
        target = self.rewards + discount * next_maxes
        target = theano.gradient.consider_constant(target)
        diff = target - q_vals
        # Zero out all entries for actions that were not chosen...
        mask = build_mask(T.zeros_like(diff), self.actions, 1.0)
        diff_masked = diff * mask
        error = T.mean(diff_masked ** 2)
        self._loss = error * diff_masked.shape[1] #

        self._parameters = layers.all_parameters(self.q_layers[-1])

        self._idx = T.lscalar('idx')

        # CREATE VARIABLES FOR INPUT AND OUTPUT
        self.states_shared = theano.shared(
            np.zeros((1, 1, 1, 1), dtype=theano.config.floatX))
        self.states_shared_next = theano.shared(
            np.zeros((1, 1, 1, 1), dtype=theano.config.floatX))
        self.rewards_shared = theano.shared(
            np.zeros((1, 1), dtype=theano.config.floatX),
            broadcastable=(False, True))
        self.actions_shared = theano.shared(
            np.zeros((1, 1), dtype='int32'), broadcastable=(False, True))

        self._givens = \
            {self.q_layers[0].input_var:
             self.states_shared[self._idx*self._batch_size:
                                (self._idx+1)*self._batch_size, :, :, :],
             self.next_layers[0].input_var:
             self.states_shared_next[self._idx*self._batch_size:
                                     (self._idx+1)*self._batch_size, :, :, :],

             self.rewards:
             self.rewards_shared[self._idx*self._batch_size:
                                 (self._idx+1)*self._batch_size, :],
             self.actions:
             self.actions_shared[self._idx*self._batch_size:
                                 (self._idx+1)*self._batch_size, :]
             }

        if self.momentum != 0:
            self._updates = layers.gen_updates_rmsprop_and_nesterov_momentum(\
                self._loss, self._parameters, learning_rate=self.learning_rate,
                rho=self.decay, momentum=self.momentum, epsilon=1e-6)
        else:
            self._updates = layers.gen_updates_rmsprop(self._loss,
                self._parameters, learning_rate=self.learning_rate,
                rho=self.decay, epsilon=1e-6)

        self._train = theano.function([self._idx], self._loss,
                                      givens=self._givens,
                                      updates=self._updates)
        self._compute_loss = theano.function([self._idx],
                                             self._loss,
                                             givens=self._givens)
        self._compute_q_vals = \
            theano.function([self.q_layers[0].input_var],
                            self.q_layers[-1].predictions(),
                            on_unused_input='ignore')
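
Example #28 picks between `gen_updates_rmsprop_and_nesterov_momentum` and plain `gen_updates_rmsprop` depending on whether momentum is zero. Roughly the same switch, expressed with stock Lasagne update helpers rather than the project's own `layers` module, might look like the sketch below (an assumed equivalent, not the project's code):

# Sketch of the "RMSProp, optionally wrapped in Nesterov momentum" switch above,
# using standard Lasagne update helpers instead of the project-specific module.
import lasagne

def build_updates(loss, params, learning_rate, rho, momentum):
    updates = lasagne.updates.rmsprop(loss, params,
                                      learning_rate=learning_rate,
                                      rho=rho, epsilon=1e-6)
    if momentum != 0:
        # Wrap the RMSProp steps with Nesterov momentum on the same parameters.
        updates = lasagne.updates.apply_nesterov_momentum(updates,
                                                          momentum=momentum)
    return updates
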
Example #29
    def __init__(self,
                 batchSize,
                 numFrames,
                 inputHeight,
                 inputWidth,
                 numActions,
                 discountRate,
                 learningRate,
                 rho,
                 rms_epsilon,
                 momentum,
                 networkUpdateDelay,
                 useSARSAUpdate,
                 kReturnLength,
                 networkType="conv",
                 updateRule="deepmind_rmsprop",
                 batchAccumulator="sum",
                 clipDelta=1.0,
                 inputScale=255.0):

        self.batchSize = batchSize
        self.numFrames = numFrames
        self.inputWidth = inputWidth
        self.inputHeight = inputHeight
        self.inputScale = inputScale
        self.numActions = numActions
        self.discountRate = discountRate
        self.learningRate = learningRate
        self.rho = rho
        self.rms_epsilon = rms_epsilon
        self.momentum = momentum
        self.networkUpdateDelay = networkUpdateDelay
        self.useSARSAUpdate = useSARSAUpdate
        self.kReturnLength = kReturnLength
        self.networkType = networkType
        self.updateRule = updateRule
        self.batchAccumulator = batchAccumulator
        self.clipDelta = clipDelta
        self.updateCounter = 0

        states = T.tensor4("states")
        nextStates = T.tensor4("nextStates")
        rewards = T.col("rewards")
        actions = T.icol("actions")
        nextActions = T.icol("nextActions")
        terminals = T.icol("terminals")

        self.statesShared = theano.shared(
            np.zeros((self.batchSize, self.numFrames, self.inputHeight,
                      self.inputWidth),
                     dtype=theano.config.floatX))
        self.nextStatesShared = theano.shared(
            np.zeros((self.batchSize, self.numFrames, self.inputHeight,
                      self.inputWidth),
                     dtype=theano.config.floatX))
        self.rewardsShared = theano.shared(np.zeros(
            (self.batchSize, 1), dtype=theano.config.floatX),
                                           broadcastable=(False, True))
        self.actionsShared = theano.shared(np.zeros((self.batchSize, 1),
                                                    dtype='int32'),
                                           broadcastable=(False, True))
        self.nextActionsShared = theano.shared(np.zeros((self.batchSize, 1),
                                                        dtype='int32'),
                                               broadcastable=(False, True))
        self.terminalsShared = theano.shared(np.zeros((self.batchSize, 1),
                                                      dtype='int32'),
                                             broadcastable=(False, True))

        self.qValueNetwork = DeepNetworks.buildDeepQNetwork(
            self.batchSize, self.numFrames, self.inputHeight, self.inputWidth,
            self.numActions, self.networkType)

        qValues = lasagne.layers.get_output(self.qValueNetwork,
                                            states / self.inputScale)

        if self.networkUpdateDelay > 0:
            self.nextQValueNetwork = DeepNetworks.buildDeepQNetwork(
                self.batchSize, self.numFrames, self.inputHeight,
                self.inputWidth, self.numActions, self.networkType)
            self.resetNextQValueNetwork()
            nextQValues = lasagne.layers.get_output(
                self.nextQValueNetwork, nextStates / self.inputScale)

        else:
            nextQValues = lasagne.layers.get_output(
                self.qValueNetwork, nextStates / self.inputScale)
            nextQValues = theano.gradient.disconnected_grad(nextQValues)

        if self.useSARSAUpdate:
            target = rewards + terminals * (
                self.discountRate**
                self.kReturnLength) * nextQValues[T.arange(self.batchSize),
                                                  nextActions.reshape(
                                                      (-1, ))].reshape((-1, 1))
        else:
            target = rewards + terminals * (
                self.discountRate**self.kReturnLength) * T.max(
                    nextQValues, axis=1, keepdims=True)

        targetDifference = target - qValues[T.arange(self.batchSize),
                                            actions.reshape((-1, ))].reshape(
                                                (-1, 1))

        quadraticPart = T.minimum(abs(targetDifference), self.clipDelta)
        linearPart = abs(targetDifference) - quadraticPart

        # if self.clipDelta > 0:
        #     targetDifference = targetDifference.clip(-1.0 * self.clipDelta, self.clipDelta)

        if self.batchAccumulator == "sum":
            # loss = T.sum(targetDifference ** 2)
            loss = T.sum(0.5 * quadraticPart**2 + self.clipDelta * linearPart)
        elif self.batchAccumulator == "mean":
            # loss = T.mean(targetDifference ** 2)
            loss = T.mean(0.5 * quadraticPart**2 + self.clipDelta * linearPart)
        else:
            raise ValueError("Bad Network Accumulator. {sum, mean} expected")

        networkParameters = lasagne.layers.helper.get_all_params(
            self.qValueNetwork)

        if self.updateRule == "deepmind_rmsprop":
            updates = DeepNetworks.deepmind_rmsprop(loss, networkParameters,
                                                    self.learningRate,
                                                    self.rho, self.rms_epsilon)
        elif self.updateRule == "rmsprop":
            updates = lasagne.updates.rmsprop(loss, networkParameters,
                                              self.learningRate, self.rho,
                                              self.rms_epsilon)
        elif self.updateRule == "sgd":
            updates = lasagne.updates.sgd(loss, networkParameters,
                                          self.learningRate)
        else:
            raise ValueError(
                "Bad update rule. {deepmind_rmsprop, rmsprop, sgd} expected")

        if self.momentum > 0:
            updates = lasagne.updates.apply_momentum(updates, None,
                                                     self.momentum)

        lossGivens = {
            states: self.statesShared,
            nextStates: self.nextStatesShared,
            rewards: self.rewardsShared,
            actions: self.actionsShared,
            nextActions: self.nextActionsShared,
            terminals: self.terminalsShared
        }

        self.__trainNetwork = theano.function([], [loss, qValues],
                                              updates=updates,
                                              givens=lossGivens,
                                              on_unused_input='warn')
        self.__computeQValues = theano.function(
            [], qValues, givens={states: self.statesShared})
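
Example #29 chooses between a SARSA target (bootstrapping on the Q-value of the action actually taken at the next state) and a Q-learning target (bootstrapping on the max over actions), both discounted by `discountRate ** kReturnLength`. A small NumPy sketch of the two bootstrap terms with hypothetical numbers (terminal masking omitted, since its encoding is project-specific):

# Sketch contrasting the SARSA and Q-learning bootstrap terms used above
# (hypothetical values; terminal masking omitted for brevity).
import numpy as np

discount, k = 0.99, 1
rewards = np.array([[0.0], [1.0]])
next_q = np.array([[0.2, 0.8, 0.1],        # Q(s', .) for two transitions
                   [0.5, 0.3, 0.9]])
next_actions = np.array([0, 2])            # actions actually taken at s'

sarsa_bootstrap = next_q[np.arange(2), next_actions].reshape(-1, 1)
qlearning_bootstrap = next_q.max(axis=1, keepdims=True)

sarsa_target = rewards + (discount ** k) * sarsa_bootstrap
qlearning_target = rewards + (discount ** k) * qlearning_bootstrap
print(sarsa_target.ravel())      # [0.198 1.891]
print(qlearning_target.ravel())  # [0.792 1.891]
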
Example #30
    def initialize_network(self):
        """
        :description: this method initializes the network, updates, and theano functions for training and 
            retrieving q values. Here's an outline: 

            1. build the q network and target q network
            2. initialize theano symbolic variables used for compiling functions
            3. initialize the theano numeric variables used as input to functions
            4. formulate the symbolic loss 
            5. formulate the symbolic updates 
            6. compile theano functions for training and for getting q_values
        """
        batch_size, input_shape = self.batch_size, self.input_shape
        lasagne.random.set_rng(self.rng)

        # 1. build the q network and target q network
        self.l_out = self.build_network(input_shape, self.num_actions, batch_size)
        self.next_l_out = self.build_network(input_shape, self.num_actions, batch_size)
        self.reset_target_network()

        # 2. initialize theano symbolic variables used for compiling functions
        states = T.tensor4('states')
        actions = T.icol('actions')
        rewards = T.col('rewards')
        next_states = T.tensor4('next_states')
        # terminals are used to indicate a terminal state in the episode and hence a mask over the future
        # q values i.e., Q(s',a')
        terminals = T.icol('terminals')

        # 3. initialize the theano numeric variables used as input to functions
        self.states_shape = (batch_size,) + (1,) + input_shape
        self.states_shared = theano.shared(np.zeros(self.states_shape, dtype=theano.config.floatX))
        self.next_states_shared = theano.shared(np.zeros(self.states_shape, dtype=theano.config.floatX))
        self.rewards_shared = theano.shared(np.zeros((batch_size, 1), dtype=theano.config.floatX), 
            broadcastable=(False, True))
        self.actions_shared = theano.shared(np.zeros((batch_size, 1), dtype='int32'),
            broadcastable=(False, True))
        self.terminals_shared = theano.shared(np.zeros((batch_size, 1), dtype='int32'),
            broadcastable=(False, True))

        # 4. formulate the symbolic loss 
        q_vals = lasagne.layers.get_output(self.l_out, states)
        next_q_vals = lasagne.layers.get_output(self.next_l_out, next_states)
        target = (rewards +
                 (T.ones_like(terminals) - terminals) *
                  self.discount * T.max(next_q_vals, axis=1, keepdims=True))
        # reshape((-1,)) == 'make a row vector', reshape((-1, 1) == 'make a column vector'
        diff = target - q_vals[T.arange(batch_size), actions.reshape((-1,))].reshape((-1, 1))


        # Much of the DeepMind work clips the TD error at 1, so we do that here.
        # The problem is that the gradient backpropagating through this minimum
        # node will be zero whenever |diff| is larger than 1.0 (because changing
        # params before the minimum does not impact the minimum's output). To
        # account for this we take the part of the TD error (magnitude) greater
        # than 1.0 and simply add it to the loss, which allows the gradient to
        # backprop, but only linearly rather than quadratically in the TD error.
        quadratic_part = T.minimum(abs(diff), 1.0)
        linear_part = abs(diff) - quadratic_part
        loss = 0.5 * quadratic_part ** 2 + linear_part
        loss = T.mean(loss) + self.regularization * regularize_network_params(self.l_out, l2)

        # 5. formulate the symbolic updates 
        params = lasagne.layers.helper.get_all_params(self.l_out)  
        updates = self.initialize_updates(self.update_rule, loss, params, self.learning_rate)

        # 6. compile theano functions for training and for getting q_values
        givens = {
            states: self.states_shared,
            next_states: self.next_states_shared,
            rewards: self.rewards_shared,
            actions: self.actions_shared,
            terminals: self.terminals_shared
        }
        self._train = theano.function([], [loss, q_vals], updates=updates, givens=givens)
        self._get_q_values = theano.function([], q_vals, givens={states: self.states_shared})
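
The indexing used in step 4 above, `q_vals[T.arange(batch_size), actions.reshape((-1,))].reshape((-1, 1))`, selects each row's chosen-action Q-value and turns the result back into a column. NumPy uses the same advanced-indexing semantics, so the trick is easiest to see there (toy values below):

# NumPy view of q_vals[arange(batch), actions.reshape((-1,))].reshape((-1, 1)).
import numpy as np

q_vals = np.array([[0.1, 0.9, 0.3],
                   [0.7, 0.2, 0.4]])
actions = np.array([[1],                 # chosen action indices, one per row,
                    [2]])                # stored as a column vector

chosen = q_vals[np.arange(2), actions.reshape(-1)]   # row vector: [0.9, 0.4]
chosen_col = chosen.reshape(-1, 1)                   # back to a column vector
print(chosen_col)                                    # [[0.9], [0.4]]
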
Example #31
    def __init__(self, model, n_in, n_out, state_bounds, action_bounds,
                 reward_bound, settings_):

        super(QPropKeras,
              self).__init__(model, n_in, n_out, state_bounds, action_bounds,
                             reward_bound, settings_)

        ## primary network
        self._model = model

        ## Target network for DPG
        self._modelTarget = copy.deepcopy(model)
        ## Target network for PPO
        self._modelTarget2 = copy.deepcopy(model)
        # self._modelTarget = model
        self._learning_rate = self.getSettings()['learning_rate']
        self._discount_factor = self.getSettings()['discount_factor']
        self._rho = self.getSettings()['rho']
        self._rms_epsilon = self.getSettings()['rms_epsilon']

        self._q_valsActA = self._model.getActorNetwork()(
            self._model._stateInput)[:, :self._action_length]
        self._q_valsActASTD = self._model.getActorNetwork()(
            self._model._stateInput)[:, self._action_length:]
        self._q_valsActTarget_State = self._modelTarget2.getActorNetwork()(
            self._model._stateInput)[:, :self._action_length]

        # self._q_valsActTarget_State = self._modelTarget.getActorNetwork()(self._model._stateInput)[:,:self._action_length]
        # self._q_valsActTargetSTD = self._modelTarget.getActorNetwork()(self._model._stateInput)[:,self._action_length:]

        self._q_valsActASTD = (T.ones_like(
            self._q_valsActA)) * self.getSettings()['exploration_rate']
        self._q_valsActTargetSTD = (T.ones_like(self._q_valsActTarget_State)
                                    ) * self.getSettings()['exploration_rate']

        self._Advantage = T.col("Advantage")
        self._Advantage.tag.test_value = np.zeros(
            (self._batch_size, 1),
            dtype=np.dtype(self.getSettings()['float_type']))
        self._advantage_shared = theano.shared(np.zeros(
            (self._batch_size, 1), dtype=self.getSettings()['float_type']),
                                               broadcastable=(False, True))

        self._LEARNING_PHASE = T.scalar(
            dtype='uint8', name='keras_learning_phase')  # 0 = test, 1 = train

        self._QProp_N = T.col("QProp_N")
        self._QProp_N.tag.test_value = np.zeros(
            (self._batch_size, 1),
            dtype=np.dtype(self.getSettings()['float_type']))
        self._QProp_N_shared = theano.shared(np.zeros(
            (self._batch_size, 1), dtype=self.getSettings()['float_type']),
                                             broadcastable=(False, True))

        self._q_function = self._model.getCriticNetwork()(
            [self._model._stateInput, self._q_valsActA])
        self._q_function_Target = self._model.getCriticNetwork()(
            [self._model._stateInput, self._model._actionInput])
        # self._value = self._model.getCriticNetwork()([self._model._stateInput, K.learning_phase()])
        self._value_Target = self._modelTarget2.getValueFunction()(
            [self._model._stateInput])
        self._value = self._model.getValueFunction()([self._model._stateInput])
        # self._value = self._model.getCriticNetwork()([self._model._stateInput])

        self._actor_entropy = 0.5 * T.mean((2 * np.pi * self._q_valsActASTD))

        ## Compute on-policy policy gradient
        self._prob = likelihood(self._model._actionInput, self._q_valsActA,
                                self._q_valsActASTD, self._action_length)
        ### How should this work if the target network is not just a slightly outdated copy of the current one?
        self._prob_target = likelihood(self._model._actionInput,
                                       self._q_valsActTarget_State,
                                       self._q_valsActTargetSTD,
                                       self._action_length)
        ## This does the sum already
        self._r = (self._prob / self._prob_target)
        self._actLoss_ = theano.tensor.elemwise.Elemwise(theano.scalar.mul)(
            (self._r), self._Advantage)
        ppo_epsilon = self.getSettings()['kl_divergence_threshold']
        self._actLoss_2 = theano.tensor.elemwise.Elemwise(theano.scalar.mul)(
            theano.tensor.clip(self._r, 1.0 - ppo_epsilon, 1.0 + ppo_epsilon),
            self._Advantage)
        self._actLoss_ = theano.tensor.minimum((self._actLoss_),
                                               (self._actLoss_2))
        # self._actLoss = ((T.mean(self._actLoss_) )) + -self._actor_regularization
        # self._actLoss = (-1.0 * (T.mean(self._actLoss_) + (self.getSettings()['std_entropy_weight'] * self._actor_entropy )))
        self._actLoss = -1.0 * (T.mean(self._actLoss_) +
                                T.mean(self._QProp_N * self._q_function))
        self._actLoss_PPO = -1.0 * (T.mean(self._actLoss_))

        # self._policy_grad = T.grad(self._actLoss ,  self._actionParams)

        QPropKeras.compile(self)
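
The surrogate assembled in Example #31 is PPO's clipped-ratio objective, min(r·A, clip(r, 1-ε, 1+ε)·A), combined here with a Q-Prop control-variate term. A NumPy sketch of just the clipped-ratio part, with hypothetical ratios and advantages:

# Sketch of the PPO clipped surrogate built above: min(r * A, clip(r) * A).
import numpy as np

eps = 0.2
ratio = np.array([0.7, 1.0, 1.5])          # pi_new / pi_old (hypothetical)
advantage = np.array([1.0, -0.5, 2.0])

unclipped = ratio * advantage
clipped = np.clip(ratio, 1.0 - eps, 1.0 + eps) * advantage
surrogate = np.minimum(unclipped, clipped)

# The quantity minimized is the negative mean of the surrogate.
loss = -surrogate.mean()
print(surrogate)  # [ 0.7 -0.5  2.4]
print(loss)       # approximately -0.867
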
Example #32
    def __init__(self,  mean_doc_size, input, input_sums,
        n_visible=784, n_hidden=500, W = None, hbias = None, vbias = None, 
        numpy_rng = None, theano_rng = None):
        """ 
        RBM constructor. Defines the parameters of the model along with
        basic operations for inferring hidden from visible (and vice-versa), 
        as well as for performing CD updates.

        :param input: None for standalone RBMs or symbolic variable if RBM is
        part of a larger graph.

        :param n_visible: number of visible units

        :param n_hidden: number of hidden units

        :param W: None for standalone RBMs or symbolic variable pointing to a
        shared weight matrix in case RBM is part of a DBN network; in a DBN,
        the weights are shared between RBMs and layers of a MLP

        :param hbias: None for standalone RBMs or symbolic variable pointing 
        to a shared hidden units bias vector in case RBM is part of a 
        different network

        :param vbias: None for standalone RBMs or a symbolic variable 
        pointing to a shared visible units bias
        """

        self.mean_doc_size = mean_doc_size

        self.n_visible = n_visible
        self.n_hidden  = n_hidden


        if numpy_rng is None:    
            # create a number generator 
            numpy_rng = numpy.random.RandomState(1234)

        if theano_rng is None : 
            theano_rng = RandomStreams(numpy_rng.randint(2**30))

        if W is None : 
           # W is initialized with `initial_W`, which is uniformly sampled
           # from -4*sqrt(6./(n_visible+n_hidden)) to 4*sqrt(6./(n_hidden+n_visible));
           # the output of uniform is converted using asarray to dtype
           # theano.config.floatX so that the code is runnable on GPU
           initial_W = numpy.asarray( numpy_rng.uniform( 
                     low = -4.*numpy.sqrt(6./(n_hidden+n_visible)), 
                     high = 4.*numpy.sqrt(6./(n_hidden+n_visible)), 
                     size = (n_visible, n_hidden)), 
                     dtype = theano.config.floatX)
           initial_W *= 1/self.mean_doc_size
           # theano shared variables for weights and biases
           W = theano.shared(value = initial_W, name = 'W')

        if hbias is None :
           # create shared variable for hidden units bias
           hbias = theano.shared(value = numpy.zeros(n_hidden, 
                               dtype = theano.config.floatX), name='hbias')

        if vbias is None :
            # create shared variable for visible units bias
            vbias = theano.shared(value =numpy.zeros(n_visible, 
                                dtype = theano.config.floatX),name='vbias')


        # initialize input layer for standalone RBM or layer0 of DBN
        self.input = input 
        self.input_sums = input_sums
        if not input:
            self.input = T.matrix('input')
            self.input_sums = T.col('input_sums')

        self.binomial_approx_val = theano.shared(value = float(100000), name = 'binomial_approx_val')

        self.W          = W
        self.hbias      = hbias
        self.vbias      = vbias
        self.theano_rng = theano_rng
        # **** WARNING: It is not a good idea to put things in this list 
        # other than shared variables created in this function.
        self.params     = [self.W, self.hbias, self.vbias]
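
The RBM above initializes W uniformly in ±4·sqrt(6/(n_hidden+n_visible)) and then scales it by 1/mean_doc_size. A standalone sketch of that initialization (the helper name and the example mean_doc_size value are hypothetical):

# Standalone sketch of the RBM weight initialization used above.
import numpy as np

def init_rbm_weights(n_visible, n_hidden, mean_doc_size,
                     rng=None, dtype='float32'):
    rng = rng or np.random.RandomState(1234)
    bound = 4.0 * np.sqrt(6.0 / (n_hidden + n_visible))
    W = rng.uniform(low=-bound, high=bound,
                    size=(n_visible, n_hidden)).astype(dtype)
    # The extra 1/mean_doc_size factor presumably keeps the initial hidden
    # activations small when the visible units hold large word counts.
    return W / mean_doc_size

W0 = init_rbm_weights(n_visible=784, n_hidden=500, mean_doc_size=120.0)
print(W0.shape, float(np.abs(W0).max()))
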
Example #33
import theano.tensor as T

def col(name):
    return T.col(name)
Example #34
from __future__ import print_function
__author__ = 'frankhe'
import theano.tensor as T
from theano.compile import function
import numpy as np

num_actions = 3
batch_size = None

q_s = T.matrix()
a = T.col()

# batch_size = q_s.shape[0]
# out = q_s[range(batch_size), a.reshape(batch_size)]

mask = T.eq(T.arange(num_actions).reshape((1, -1)), a.reshape((-1, 1)))
out_t = q_s * mask
out = T.sum(out_t, axis=1, keepdims=True)

f = function([q_s, a], out)

q_s_ = np.random.rand(5, num_actions)
a_ = np.array([1, 0, 2, 1, 2]).reshape((5, 1))
print(f(q_s_, a_))
Example #35
    def __init__(self, state_length, action_length, state_bounds, action_bounds, settings_):

        super(ForwardDynamicsDenseNetworkDropoutTesting,self).__init__(state_length, action_length, state_bounds, action_bounds, 0, settings_)
        
        batch_size=32
        # data types for model
        self._State = T.matrix("State")
        self._State.tag.test_value = np.random.rand(batch_size,self._state_length)
        self._ResultState = T.matrix("ResultState")
        self._ResultState.tag.test_value = np.random.rand(batch_size,self._state_length)
        self._Reward = T.col("Reward")
        self._Reward.tag.test_value = np.random.rand(self._batch_size,1)
        self._Action = T.matrix("Action")
        self._Action.tag.test_value = np.random.rand(batch_size, self._action_length)
        # create a small convolutional neural network
        input = lasagne.layers.InputLayer((None, self._state_length), self._State)
        self._stateInputVar = input.input_var
        actionInput = lasagne.layers.InputLayer((None, self._action_length), self._Action)
        self._actionInputVar = actionInput.input_var
        
        insert_action_later = True
        double_insert_action = False
        add_layers_after_action = False
        if (not insert_action_later or (double_insert_action)):
            input = lasagne.layers.ConcatLayer([input, actionInput])
        ## Activation types
        # activation_type = elu_mine
        # activation_type=lasagne.nonlinearities.tanh
        activation_type=lasagne.nonlinearities.leaky_rectify
        # activation_type=lasagne.nonlinearities.rectify
        # network = lasagne.layers.DropoutLayer(input, p=self._dropout_p, rescale=True)
        """
        network = lasagne.layers.DenseLayer(
                input, num_units=128,
                nonlinearity=activation_type)
        network = weight_norm(network)
        network = lasagne.layers.DropoutLayer(network, p=self._dropout_p, rescale=True)
        layersAct = [network]
        
        network = lasagne.layers.DenseLayer(
                network, num_units=64,
                nonlinearity=activation_type)
        network = weight_norm(network)
        network = lasagne.layers.DropoutLayer(network, p=self._dropout_p, rescale=True)
        layersAct.append(network)
        network = lasagne.layers.ConcatLayer([layersAct[1], layersAct[0]])
        
        network = lasagne.layers.DenseLayer(
                network, num_units=32,
                nonlinearity=activation_type)
        network = weight_norm(network)
        network = lasagne.layers.DropoutLayer(network, p=self._dropout_p, rescale=True)
        layersAct.append(network)
        network = lasagne.layers.ConcatLayer([layersAct[2], layersAct[1], layersAct[0]])
        ## This can be used to model the reward function
        self._reward_net = lasagne.layers.DenseLayer(
                network, num_units=1,
                nonlinearity=lasagne.nonlinearities.linear)
                # print ("Initial W " + str(self._w_o.get_value()) )
        """
        
        network = lasagne.layers.DenseLayer(
                input, num_units=128,
                nonlinearity=activation_type)
        # network = weight_norm(network)
        network = lasagne.layers.DropoutLayer(network, p=self._dropout_p, rescale=True)
        # layersAct = [network]
        
        if ( insert_action_later ):
            ### Let's try adding the action input later in the network
            if ( add_layers_after_action ):
                networkA = lasagne.layers.DenseLayer(
                        actionInput, num_units=32,
                        nonlinearity=activation_type)
                network = lasagne.layers.ConcatLayer([network, networkA])
            else:
                network = lasagne.layers.ConcatLayer([network, actionInput])
        
        network = lasagne.layers.DenseLayer(
                network, num_units=64,
                nonlinearity=activation_type)
        # network = weight_norm(network)
        network = lasagne.layers.DropoutLayer(network, p=self._dropout_p, rescale=True)
        
        # layersAct.append(network)
        # network = lasagne.layers.ConcatLayer([layersAct[1], layersAct[0]])
        
        network = lasagne.layers.DenseLayer(
                network, num_units=32,
                nonlinearity=activation_type)
        # network = weight_norm(network)
        network = lasagne.layers.DropoutLayer(network, p=self._dropout_p, rescale=True)
        
        
        # layersAct.append(network)
        # network = lasagne.layers.ConcatLayer([layersAct[2], layersAct[1], layersAct[0]])
        # network = lasagne.layers.DropoutLayer(network, p=self._dropout_p, rescale=True)
        network = lasagne.layers.DenseLayer(
                network, num_units=8,
                nonlinearity=activation_type)
        network = lasagne.layers.DropoutLayer(network, p=self._dropout_p, rescale=True)
        """
        network = lasagne.layers.DenseLayer(
                network, num_units=8,
                nonlinearity=activation_type)
        """
        ## This can be used to model the reward function
        self._reward_net = lasagne.layers.DenseLayer(
                network, num_units=1,
                nonlinearity=lasagne.nonlinearities.linear)
                # print ("Initial W " + str(self._w_o.get_value()) )
                  
        # networkAct = lasagne.layers.DropoutLayer(input, p=self._dropout_p, rescale=True)
        networkAct = lasagne.layers.DenseLayer(
                input, num_units=256,
                nonlinearity=activation_type)
        networkAct = weight_norm(networkAct)
        networkAct = lasagne.layers.DropoutLayer(networkAct, p=self._dropout_p, rescale=True)
        layersAct = [networkAct]
        
        networkAct = lasagne.layers.DenseLayer(
                networkAct, num_units=128,
                nonlinearity=activation_type)
        networkAct = weight_norm(networkAct)
        networkAct = lasagne.layers.DropoutLayer(networkAct, p=self._dropout_p, rescale=True)
        
        if ( insert_action_later ):
            ### Let's try adding the action input later in the network
            if ( add_layers_after_action ):
                networkActA = lasagne.layers.DenseLayer(
                    actionInput, num_units=64,
                    nonlinearity=activation_type)
                networkAct = lasagne.layers.ConcatLayer([networkAct, networkActA])
            else:
                networkAct = lasagne.layers.ConcatLayer([networkAct, actionInput])
            
        
        layersAct.append(networkAct)
        networkAct = lasagne.layers.ConcatLayer([layersAct[1], layersAct[0]])
        
        networkAct = lasagne.layers.DenseLayer(
                networkAct, num_units=128,
                nonlinearity=activation_type)
        networkAct = weight_norm(networkAct)
        networkAct = lasagne.layers.DropoutLayer(networkAct, p=self._dropout_p, rescale=True)
        layersAct.append(networkAct)
        networkAct = lasagne.layers.ConcatLayer([layersAct[2], layersAct[1], layersAct[0]])
    
        self._forward_dynamics_net = lasagne.layers.DenseLayer(
                networkAct, num_units=self._state_length,
                nonlinearity=lasagne.nonlinearities.linear)
                # print ("Initial W " + str(self._w_o.get_value()) )
                
        if (('use_stochastic_forward_dynamics' in self._settings) and 
            self._settings['use_stochastic_forward_dynamics']):
            with_std = lasagne.layers.DenseLayer(
                    networkAct, num_units=self._state_length,
                    nonlinearity=theano.tensor.nnet.softplus)
            self._forward_dynamics_net = lasagne.layers.ConcatLayer([self._forward_dynamics_net, with_std], axis=1)
                
        self._states_shared = theano.shared(
            np.zeros((batch_size, self._state_length),
                     dtype=theano.config.floatX))

        self._next_states_shared = theano.shared(
            np.zeros((batch_size, self._state_length),
                     dtype=theano.config.floatX))

        self._actions_shared = theano.shared(
            np.zeros((batch_size, self._action_length), dtype=theano.config.floatX),
            )
        
        self._rewards_shared = theano.shared(
            np.zeros((self._batch_size, 1), dtype=theano.config.floatX),
            broadcastable=(False, True))
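
When `use_stochastic_forward_dynamics` is enabled above, the forward-dynamics output concatenates a linear mean head with a softplus standard-deviation head along axis 1. A sketch of how such a concatenated prediction could be split and sampled downstream (the helper below is hypothetical and not part of the class above):

# Hypothetical helper: split a [mean, softplus-std] concatenated prediction and
# draw a next-state sample from the implied diagonal Gaussian.
import numpy as np

def sample_next_state(pred, state_length, rng=None):
    rng = rng or np.random.RandomState(0)
    mean = pred[:, :state_length]
    std = pred[:, state_length:]       # already positive thanks to softplus
    return mean + std * rng.standard_normal(mean.shape)

pred = np.array([[0.1, -0.2, 0.05, 0.03]])   # state_length = 2: two means, two stds
print(sample_next_state(pred, state_length=2))
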
Example #36
    def __init__(self, environment, rho, rms_epsilon, momentum, clip_delta, freeze_interval, batchSize, network_type, 
                 update_rule, batch_accumulator, randomState, frame_scale=255.0):
        """ Initialize environment

        Arguments:
            environment - the environment (class Env) 
            num_elements_in_batch - list of k integers for the number of each element kept as belief state
            num_actions - int
            discount - float
            learning_rate - float
            rho, rms_epsilon, momentum - float, float, float
            ...
            network_type - string 
            ...           
        """

        self._environment = environment
        
        self._batchSize = batchSize
        self._inputDimensions = self._environment.inputDimensions()
        self._nActions = self._environment.nActions()
        self._df = 0
        self.rho = rho
        self._lr = 0
        self.rms_epsilon = rms_epsilon
        self.momentum = momentum
        self.clip_delta = clip_delta
        self.freeze_interval = freeze_interval
        self._randomState = randomState
        
        lasagne.random.set_rng(self._randomState)

        self.update_counter = 0
        
        states = []   # list of symbolic variables, one per element of the belief state
                      # --> [T.tensor4 if the observation is a matrix, T.tensor3 if a vector, T.matrix if a scalar]
        next_states = []  # same as states, but at t+1
        self.states_shared = []  # list of shared variables, one per element of the belief state
        self.next_states_shared = []  # same as self.states_shared, but at t+1

        for i, dim in enumerate(self._inputDimensions):
            if len(dim) == 3:
                states.append(T.tensor4("%s_%s" % ("state", i)))
                next_states.append(T.tensor4("%s_%s" % ("next_state", i)))

            elif len(dim) == 2:
                states.append(T.tensor3("%s_%s" % ("state", i)))
                next_states.append(T.tensor3("%s_%s" % ("next_state", i)))
                
            elif len(dim) == 1:            
                states.append( T.matrix("%s_%s" % ("state", i)) )
                next_states.append( T.matrix("%s_%s" % ("next_state", i)) )
                
            self.states_shared.append(theano.shared(np.zeros((batchSize,) + dim, dtype=theano.config.floatX) , borrow=False))
            self.next_states_shared.append(theano.shared(np.zeros((batchSize,) + dim, dtype=theano.config.floatX) , borrow=False))
        
        print("Number of observations per state: {}".format(len(self.states_shared)))
        print("For each observation, historySize + ponctualObs_i.shape: {}".format(self._inputDimensions))
                
        rewards = T.col('rewards')
        actions = T.icol('actions')
        terminals = T.icol('terminals')
        thediscount = T.scalar(name='thediscount', dtype=theano.config.floatX)
        thelr = T.scalar(name='thelr', dtype=theano.config.floatX)
        
        self.l_out, self.l_outs_conv, shape_after_conv = self._build(network_type, states)
        
        print("Number of neurons after spatial and temporal convolution layers: {}".format(shape_after_conv))

        self.next_l_out, self.next_l_outs_conv, shape_after_conv = self._build(network_type, next_states)
        self._resetQHat()

        self.rewards_shared = theano.shared(
            np.zeros((batchSize, 1), dtype=theano.config.floatX),
            broadcastable=(False, True))

        self.actions_shared = theano.shared(
            np.zeros((batchSize, 1), dtype='int32'),
            broadcastable=(False, True))

        self.terminals_shared = theano.shared(
            np.zeros((batchSize, 1), dtype='int32'),
            broadcastable=(False, True))


        q_vals = lasagne.layers.get_output(self.l_out)        
        
        next_q_vals = lasagne.layers.get_output(self.next_l_out)
        
        max_next_q_vals=T.max(next_q_vals, axis=1, keepdims=True)
        
        not_terminals = T.ones_like(terminals) - terminals

        target = rewards + not_terminals * thediscount * max_next_q_vals

        q_val=q_vals[T.arange(batchSize), actions.reshape((-1,))].reshape((-1, 1))

        diff = target - q_val

        if self.clip_delta > 0:
            # If we simply take the squared clipped diff as our loss,
            # then the gradient will be zero whenever the diff exceeds
            # the clip bounds. To avoid this, we extend the loss
            # linearly past the clip point to keep the gradient constant
            # in that regime.
            # 
            # This is equivalent to declaring d loss/d q_vals to be
            # equal to the clipped diff, then backpropagating from
            # there, which is what the DeepMind implementation does.
            quadratic_part = T.minimum(abs(diff), self.clip_delta)
            linear_part = abs(diff) - quadratic_part
            loss = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part
        else:
            loss = 0.5 * diff ** 2

        if batch_accumulator == 'sum':
            loss = T.sum(loss)
        elif batch_accumulator == 'mean':
            loss = T.mean(loss)
        else:
            raise ValueError("Bad accumulator: {}".format(batch_accumulator))

        params = lasagne.layers.helper.get_all_params(self.l_out)

        for conv_param in self.l_outs_conv:
            for p in lasagne.layers.helper.get_all_params(conv_param):
                params.append(p)
        
            
        givens = {
            rewards: self.rewards_shared,
            actions: self.actions_shared,
            terminals: self.terminals_shared
        }
        
        for i, x in enumerate(self.states_shared):
            givens[ states[i] ] = x 
        for i, x in enumerate(self.next_states_shared):
            givens[ next_states[i] ] = x
                
        if update_rule == 'deepmind_rmsprop':
            grads = get_or_compute_grads(loss, params)
            updates = deepmind_rmsprop(loss, params, grads, thelr, self.rho,
                                       self.rms_epsilon)
        elif update_rule == 'rmsprop':
            updates = lasagne.updates.rmsprop(loss, params, thelr, self.rho,
                                              self.rms_epsilon)
        elif update_rule == 'sgd':
            updates = lasagne.updates.sgd(loss, params, thelr)
        else:
            raise ValueError("Unrecognized update: {}".format(update_rule))

        if self.momentum > 0:
            updates = lasagne.updates.apply_momentum(updates, None,
                                                     self.momentum)

        self._train = theano.function([thediscount, thelr], [loss, q_vals], updates=updates,
                                      givens=givens,
                                      on_unused_input='warn')
        givens2={}
        for i, x in enumerate(self.states_shared):
            givens2[ states[i] ] = x 

        self._q_vals = theano.function([], q_vals,
                                      givens=givens2,
                                      on_unused_input='warn')
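
Example #36 creates one symbolic input per element of the belief state, choosing the tensor type from that element's dimensionality, and then fills the `givens` dictionary in a loop. A condensed sketch of that dispatch with toy dimensions (same idea, not the class above):

# Condensed sketch of the per-observation tensor-type dispatch used above.
import numpy as np
import theano
import theano.tensor as T

batch_size = 32
input_dimensions = [(4, 84, 84), (6, 3), (2,)]   # toy belief-state shapes

states, states_shared = [], []
for i, dim in enumerate(input_dimensions):
    if len(dim) == 3:
        states.append(T.tensor4('state_%d' % i))   # history of image observations
    elif len(dim) == 2:
        states.append(T.tensor3('state_%d' % i))   # history of vector observations
    else:
        states.append(T.matrix('state_%d' % i))    # history of scalar observations
    states_shared.append(theano.shared(
        np.zeros((batch_size,) + dim, dtype=theano.config.floatX)))

givens = {s: sh for s, sh in zip(states, states_shared)}
print(len(givens))  # 3
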
Example #37
File: q_network.py Project: torgeha/dqn
    def __init__(self, input_width, input_height, num_actions,
                 num_frames, discount, learning_rate, rho,
                 rms_epsilon, momentum, clip_delta, freeze_interval,
                 batch_size, network_type, update_rule,
                 batch_accumulator, rng, input_scale=255.0):

        self.input_width = input_width
        self.input_height = input_height
        self.num_actions = num_actions
        self.num_frames = num_frames
        self.batch_size = batch_size
        self.discount = discount
        self.rho = rho
        self.lr = learning_rate
        self.rms_epsilon = rms_epsilon
        self.momentum = momentum
        self.clip_delta = clip_delta
        self.freeze_interval = freeze_interval
        self.rng = rng

        # print "NETWORK---------------------------"
        # print "input width ", self.input_width
        # print "input height", self.input_height
        # print "num actiuons", self.num_actions
        # print "num frames", self.num_frames
        # print "batch size", self.batch_size
        # print "discount", self.discount
        # print "rho", self.rho
        # print "lr", self.lr
        # print "rms_epsilon", self.rms_epsilon
        # print "momentum", self.momentum
        # print "clip_delta", self.clip_delta
        # print "freeze_ intercal", self.freeze_interval
        # print "rng", self.rng

        lasagne.random.set_rng(self.rng)

        self.update_counter = 0

        self.l_out = self.build_network(network_type, input_width, input_height,
                                        num_actions, num_frames, batch_size)
        if self.freeze_interval > 0:
            self.next_l_out = self.build_network(network_type, input_width,
                                                 input_height, num_actions,
                                                 num_frames, batch_size)
            self.reset_q_hat()

        states = T.tensor4('states')
        next_states = T.tensor4('next_states')
        rewards = T.col('rewards')
        actions = T.icol('actions')
        terminals = T.icol('terminals')

        # Shared variables for training from a minibatch of replayed state transitions,
        # each consisting of num_frames + 1 (due to overlap) images, along with
        # the chosen action and resulting reward and terminal status.
        self.imgs_shared = theano.shared(
            np.zeros((batch_size, num_frames + 1, input_height, input_width),
                     dtype=theano.config.floatX))

        self.rewards_shared = theano.shared(
            np.zeros((batch_size, 1), dtype=theano.config.floatX),
            broadcastable=(False, True))

        self.actions_shared = theano.shared(
            np.zeros((batch_size, 1), dtype='int32'),
            broadcastable=(False, True))

        self.terminals_shared = theano.shared(
            np.zeros((batch_size, 1), dtype='int32'),
            broadcastable=(False, True))

        # Shared variable for a single state, to calculate q_vals
        self.state_shared = theano.shared(
            np.zeros((num_frames, input_height, input_width),
                     dtype=theano.config.floatX))

        q_vals = lasagne.layers.get_output(self.l_out, states / input_scale)
        
        if self.freeze_interval > 0:
            next_q_vals = lasagne.layers.get_output(self.next_l_out,
                                                    next_states / input_scale)
        else:
            next_q_vals = lasagne.layers.get_output(self.l_out,
                                                    next_states / input_scale)
            next_q_vals = theano.gradient.disconnected_grad(next_q_vals)

        terminalsX = terminals.astype(theano.config.floatX)
        actionmask = T.eq(T.arange(num_actions).reshape((1, -1)),
                          actions.reshape((-1, 1))).astype(theano.config.floatX)

        target = (rewards +
                  (T.ones_like(terminalsX) - terminalsX) *
                  self.discount * T.max(next_q_vals, axis=1, keepdims=True))
        output = (q_vals * actionmask).sum(axis=1).reshape((-1, 1))
        diff = target - output

        if self.clip_delta > 0:
            # If we simply take the squared clipped diff as our loss,
            # then the gradient will be zero whenever the diff exceeds
            # the clip bounds. To avoid this, we extend the loss
            # linearly past the clip point to keep the gradient constant
            # in that regime.
            # 
            # This is equivalent to declaring d loss/d q_vals to be
            # equal to the clipped diff, then backpropagating from
            # there, which is what the DeepMind implementation does.
            quadratic_part = T.minimum(abs(diff), self.clip_delta)
            linear_part = abs(diff) - quadratic_part
            loss = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part
        else:
            loss = 0.5 * diff ** 2

        if batch_accumulator == 'sum':
            loss = T.sum(loss)
        elif batch_accumulator == 'mean':
            loss = T.mean(loss)
        else:
            raise ValueError("Bad accumulator: {}".format(batch_accumulator))

        params = lasagne.layers.helper.get_all_params(self.l_out)
        train_givens = {
            states: self.imgs_shared[:, :-1],
            next_states: self.imgs_shared[:, 1:],
            rewards: self.rewards_shared,
            actions: self.actions_shared,
            terminals: self.terminals_shared
        }
        if update_rule == 'deepmind_rmsprop':
            updates = deepmind_rmsprop(loss, params, self.lr, self.rho,
                                       self.rms_epsilon)
        elif update_rule == 'rmsprop':
            updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho,
                                              self.rms_epsilon)
        elif update_rule == 'sgd':
            updates = lasagne.updates.sgd(loss, params, self.lr)
        else:
            raise ValueError("Unrecognized update: {}".format(update_rule))

        if self.momentum > 0:
            updates = lasagne.updates.apply_momentum(updates, None,
                                                     self.momentum)

        self._train = theano.function([], [loss], updates=updates,
                                      givens=train_givens)

        q_givens = {
            states: self.state_shared.reshape((1,
                                               self.num_frames,
                                               self.input_height,
                                               self.input_width))
        }
        self._q_vals = theano.function([], q_vals[0], givens=q_givens)
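The train_givens above rely on a single image buffer of num_frames + 1 frames per transition, sliced into overlapping state and next-state stacks. A minimal NumPy sketch of that slicing (array shapes here are illustrative, not taken from the example):

import numpy as np

batch_size, num_frames, H, W = 2, 4, 3, 3
imgs = np.arange(batch_size * (num_frames + 1) * H * W,
                 dtype=np.float32).reshape(batch_size, num_frames + 1, H, W)

states = imgs[:, :-1]       # frames 0 .. num_frames - 1
next_states = imgs[:, 1:]   # frames 1 .. num_frames

# Consecutive stacks share num_frames - 1 frames.
assert np.array_equal(states[:, 1:], next_states[:, :-1])
print(states.shape, next_states.shape)   # (2, 4, 3, 3) (2, 4, 3, 3)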
Example #38
0
    def __init__(self, input_width, input_height, num_channels, num_actions,
                 num_frames, discount, learning_rate, rho,
                 rms_epsilon, momentum, clip_delta, freeze_interval,
                 batch_size, network_type, update_rule,
                 batch_accumulator, rng, network_params, input_scale=255.0):

        self.input_width = input_width
        self.input_height = input_height
        self.num_channels = num_channels
        self.num_actions = num_actions
        self.num_frames = num_frames
        self.batch_size = batch_size
        self.discount = discount
        self.rho = rho
        self.lr = learning_rate
        self.rms_epsilon = rms_epsilon
        self.momentum = momentum
        self.clip_delta = clip_delta
        self.freeze_interval = freeze_interval
        self.rng = rng

        self.lstm = None
        self.next_lstm = None

        logging.debug('network parameters: %s', network_params)
        self.network_params = network_params

        lasagne.random.set_rng(self.rng)

        self.update_counter = 0

        networks = self.build_network(network_type, num_channels, input_width, input_height,
                                        num_actions, num_frames, None)
        if isinstance(networks, tuple):
            self.l_out = networks[0]
            self.lstm = networks[1]
        else:
            self.l_out = networks

        # theano.compile.function_dump('network.dump', self.l_out)
        if self.freeze_interval > 0:
            next_networks = self.build_network(network_type, num_channels, input_width,
                                                 input_height, num_actions,
                                                 num_frames, None)

            if isinstance(next_networks, tuple):
                self.next_l_out = next_networks[0]
                self.next_lstm = next_networks[1]
            else:
                self.next_l_out = next_networks

            self.reset_q_hat()

        # These 5-D state tensors need to be floatX, since they are fed
        # directly into floating-point computations downstream.
        btensor5 = T.TensorType(theano.config.floatX, (False,) * 5)
        states = btensor5('states')
        next_states = btensor5('next_states')
        rewards = T.col('rewards')
        actions = T.icol('actions')
        terminals = T.icol('terminals')

        # A mask input is apparently needed for some recurrent layers with a
        # variable input size. Other layers simply allow a None batch size,
        # but keep the option here to be safe. For now, it should always have
        # the same leading shape as states: (n_batch, n_time_steps).
        # mask = T.imatrix('mask')

        self.states_shared = theano.shared(
            np.zeros((batch_size, num_frames, num_channels, input_height, input_width),
                     dtype=theano.config.floatX), name='states')

        self.next_states_shared = theano.shared(
            np.zeros((batch_size, num_frames, num_channels, input_height, input_width),
                     dtype=theano.config.floatX), name='next_states')

        self.rewards_shared = theano.shared(
            np.zeros((batch_size, 1), dtype=theano.config.floatX),
            broadcastable=(False, True), name='rewards')

        self.actions_shared = theano.shared(
            np.zeros((batch_size, 1), dtype='int32'),
            broadcastable=(False, True), name='actions')

        self.terminals_shared = theano.shared(
            np.zeros((batch_size, 1), dtype='int32'),
            broadcastable=(False, True))

        # self.mask_shared = theano.shared(np.ones((batch_size, num_frames),
        #     dtype='int32'))

        # lstmout = lasagne.layers.get_output(self.lstm, states / input_scale)

        q_vals = lasagne.layers.get_output(self.l_out, states / input_scale)
                # mask_input=mask)

        if self.freeze_interval > 0:
            next_q_vals = lasagne.layers.get_output(self.next_l_out,
                                                    next_states / input_scale
                                                    )
                                                    # mask_input=mask)
        else:
            next_q_vals = lasagne.layers.get_output(self.l_out,
                                                    next_states / input_scale
                                                    )
                                                    # mask_input=mask)
            next_q_vals = theano.gradient.disconnected_grad(next_q_vals)

        target = (rewards +
                  (T.ones_like(terminals) - terminals) *
                  self.discount * T.max(next_q_vals, axis=1, keepdims=True))
        diff = target - q_vals[T.arange(target.shape[0]),
                               actions.reshape((-1,))].reshape((-1, 1))

        if self.clip_delta > 0:
            # If we simply take the squared clipped diff as our loss,
            # then the gradient will be zero whenever the diff exceeds
            # the clip bounds. To avoid this, we extend the loss
            # linearly past the clip point to keep the gradient constant
            # in that regime.
            #
            # This is equivalent to declaring d loss/d q_vals to be
            # equal to the clipped diff, then backpropagating from
            # there, which is what the DeepMind implementation does.
            quadratic_part = T.minimum(abs(diff), self.clip_delta)
            linear_part = abs(diff) - quadratic_part
            loss = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part
        else:
            loss = 0.5 * diff ** 2

        if batch_accumulator == 'sum':
            loss = T.sum(loss)
        elif batch_accumulator == 'mean':
            loss = T.mean(loss)
        else:
            raise ValueError("Bad accumulator: {}".format(batch_accumulator))

        params = lasagne.layers.helper.get_all_params(self.l_out)
        # print params
        givens = {
            states: self.states_shared,
            next_states: self.next_states_shared,
            rewards: self.rewards_shared,
            actions: self.actions_shared,
            terminals: self.terminals_shared
        }
        if update_rule == 'deepmind_rmsprop':
            update_for = lambda params: deepmind_rmsprop(loss, params, self.lr, self.rho,
                                       self.rms_epsilon)
        elif update_rule == 'rmsprop':
            update_for = lambda params: lasagne.updates.rmsprop(loss, params, self.lr, self.rho,
                                              self.rms_epsilon)
        elif update_rule == 'sgd':
            update_for = lambda params: lasagne.updates.sgd(loss, params, self.lr)
        else:
            raise ValueError("Unrecognized update: {}".format(update_rule))

        updates = update_for(params)

        if self.momentum > 0:
            updates = lasagne.updates.apply_momentum(updates, None,
                                                     self.momentum)

        # Workaround: updates somehow sneak in for the LSTM's 'cell' and 'hid'
        # state variables; remove them so the optimizer does not modify them.
        if self.lstm:
            delete_keys = [k for k, v in updates.items() if k.name in ['cell', 'hid']]
            # print delete_keys
            for key in delete_keys:
                del updates[key]

        self._train = theano.function([], [loss, q_vals], updates=updates,
                                      givens=givens)
        self._q_vals = theano.function([], q_vals,
                                       givens={states: self.states_shared})
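The clipped-loss comment in the example above describes a Huber-style loss: quadratic inside +/- clip_delta, linear outside, so the gradient magnitude never exceeds clip_delta. A minimal NumPy sketch (values are illustrative):

import numpy as np

def clipped_loss(diff, clip_delta):
    quadratic_part = np.minimum(np.abs(diff), clip_delta)
    linear_part = np.abs(diff) - quadratic_part
    return 0.5 * quadratic_part ** 2 + clip_delta * linear_part

diff = np.array([-3.0, -0.5, 0.0, 0.5, 3.0])
print(clipped_loss(diff, clip_delta=1.0))
# [2.5   0.125 0.    0.125 2.5  ] -- the slope w.r.t. diff stays within +/- clip_delta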
Example #39
0
    def __init__(self, batchSize, numFrames, inputHeight, inputWidth, numActions, 
        discountRate, learningRate, rho, rms_epsilon, momentum, networkUpdateDelay, useSARSAUpdate, kReturnLength,
        networkType = "conv", updateRule = "deepmind_rmsprop", batchAccumulator = "sum", clipDelta = 1.0, inputScale = 255.0):
        
        self.batchSize          = batchSize
        self.numFrames          = numFrames
        self.inputWidth         = inputWidth
        self.inputHeight        = inputHeight
        self.inputScale         = inputScale
        self.numActions         = numActions
        self.discountRate       = discountRate
        self.learningRate       = learningRate
        self.rho                = rho
        self.rms_epsilon        = rms_epsilon
        self.momentum           = momentum
        self.networkUpdateDelay = networkUpdateDelay
        self.useSARSAUpdate     = useSARSAUpdate
        self.kReturnLength      = kReturnLength
        self.networkType        = networkType
        self.updateRule         = updateRule
        self.batchAccumulator   = batchAccumulator
        self.clipDelta          = clipDelta
        self.updateCounter      = 0

        states     = T.tensor4("states")
        nextStates = T.tensor4("nextStates")
        rewards    = T.col("rewards")
        actions    = T.icol("actions")
        nextActions= T.icol("nextActions")
        terminals  = T.icol("terminals")

        self.statesShared      = theano.shared(np.zeros((self.batchSize, self.numFrames, self.inputHeight, self.inputWidth), dtype=theano.config.floatX))
        self.nextStatesShared  = theano.shared(np.zeros((self.batchSize, self.numFrames, self.inputHeight, self.inputWidth), dtype=theano.config.floatX))
        self.rewardsShared     = theano.shared(np.zeros((self.batchSize, 1), dtype=theano.config.floatX), broadcastable=(False, True))
        self.actionsShared     = theano.shared(np.zeros((self.batchSize, 1), dtype='int32'), broadcastable=(False, True))
        self.nextActionsShared = theano.shared(np.zeros((self.batchSize, 1), dtype='int32'), broadcastable=(False, True))
        self.terminalsShared   = theano.shared(np.zeros((self.batchSize, 1), dtype='int32'), broadcastable=(False, True))

        self.qValueNetwork  = DeepNetworks.buildDeepQNetwork(
            self.batchSize, self.numFrames, self.inputHeight, self.inputWidth, self.numActions, self.networkType)

        qValues = lasagne.layers.get_output(self.qValueNetwork, states / self.inputScale)

        if self.networkUpdateDelay > 0:
            self.nextQValueNetwork = DeepNetworks.buildDeepQNetwork(
                self.batchSize, self.numFrames, self.inputHeight, self.inputWidth, self.numActions, self.networkType)
            self.resetNextQValueNetwork()
            nextQValues = lasagne.layers.get_output(self.nextQValueNetwork, nextStates / self.inputScale)

        else:
            nextQValues = lasagne.layers.get_output(self.qValueNetwork, nextStates / self.inputScale)
            nextQValues = theano.gradient.disconnected_grad(nextQValues)


        if self.useSARSAUpdate:
            target = rewards + terminals * (self.discountRate ** self.kReturnLength) * nextQValues[T.arange(self.batchSize), nextActions.reshape((-1,))].reshape((-1, 1))
        else:
            target = rewards + terminals * (self.discountRate ** self.kReturnLength) * T.max(nextQValues, axis = 1, keepdims = True)

        targetDifference = target - qValues[T.arange(self.batchSize), actions.reshape((-1,))].reshape((-1, 1))


        quadraticPart = T.minimum(abs(targetDifference), self.clipDelta)
        linearPart = abs(targetDifference) - quadraticPart

        # if self.clipDelta > 0:
        #     targetDifference = targetDifference.clip(-1.0 * self.clipDelta, self.clipDelta)

        if self.batchAccumulator == "sum":
            # loss = T.sum(targetDifference ** 2)
            loss = T.sum(0.5 * quadraticPart ** 2 + self.clipDelta * linearPart)
        elif self.batchAccumulator == "mean":
            # loss = T.mean(targetDifference ** 2)
            loss = T.mean(0.5 * quadraticPart ** 2 + self.clipDelta * linearPart)
        else:
            raise ValueError("Bad Network Accumulator. {sum, mean} expected")


        networkParameters = lasagne.layers.helper.get_all_params(self.qValueNetwork)

        if self.updateRule == "deepmind_rmsprop":
            updates = DeepNetworks.deepmind_rmsprop(loss, networkParameters, self.learningRate, self.rho, self.rms_epsilon)
        elif self.updateRule == "rmsprop":
            updates = lasagne.updates.rmsprop(loss, networkParameters, self.learningRate, self.rho, self.rms_epsilon)
        elif self.updateRule == "sgd":
            updates = lasagne.updates.sgd(loss, networkParameters, self.learningRate)
        else:
            raise ValueError("Bad update rule. {deepmind_rmsprop, rmsprop, sgd} expected")

        if self.momentum > 0:
            updates = lasagne.updates.apply_momentum(updates, None, self.momentum)

        lossGivens = {
            states: self.statesShared,
            nextStates: self.nextStatesShared,
            rewards:self.rewardsShared,
            actions: self.actionsShared,
            nextActions: self.nextActionsShared,
            terminals: self.terminalsShared
        }

        self.__trainNetwork = theano.function([], [loss, qValues], updates=updates, givens=lossGivens, on_unused_input='warn')
        self.__computeQValues = theano.function([], qValues, givens={states: self.statesShared})
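The useSARSAUpdate flag above switches between two bootstrap targets. A minimal NumPy sketch of the difference, ignoring the terminal masking and the k-step discount bookkeeping (all values are illustrative):

import numpy as np

rewards = np.array([[1.0], [0.0]])
next_q = np.array([[0.2, 0.9],          # Q(s', a) for two actions
                   [0.5, 0.1]])
next_actions = np.array([0, 1])         # actions actually taken at s' (SARSA only)
discount = 0.99

# SARSA: evaluate the action actually taken in the next state
sarsa_target = rewards + discount * next_q[np.arange(2), next_actions].reshape(-1, 1)
# Q-learning: take the max over next-state Q-values
qlearn_target = rewards + discount * next_q.max(axis=1, keepdims=True)

print(sarsa_target.ravel())    # [1.198 0.099]
print(qlearn_target.ravel())   # [1.891 0.495]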
Example #40
0
    def __init__(self, input, n_in, n_out):

        hidden_size = 36
        batch_size = 32
        self._w_h = init_weights((n_in, hidden_size))
        self._b_h = init_b_weights((1, hidden_size))
        # self._b_h = init_b_weights((hidden_size,))
        self._w_h2 = init_weights((hidden_size, hidden_size))
        self._b_h2 = init_b_weights((1, hidden_size))
        # self._b_h2 = init_b_weights((hidden_size,))
        # self._w_o = init_tanh(hidden_size, n_out)
        self._w_o = init_weights((hidden_size, n_out))
        self._b_o = init_b_weights((1, n_out))
        # self._b_o = init_b_weights((n_out,))

        self.updateTargetModel()
        self._w_h_old = init_weights((n_in, hidden_size))
        self._w_h2_old = init_weights((hidden_size, hidden_size))
        self._w_o_old = init_tanh(hidden_size, n_out)

        # print ("Initial W " + str(self._w_o.get_value()) )

        self._learning_rate = 0.00025
        self._discount_factor = 0.99

        self._weight_update_steps = 5000
        self._updates = 0

        # data types for model
        State = T.dmatrix("State")
        State.tag.test_value = np.random.rand(batch_size, 2)
        ResultState = T.dmatrix("ResultState")
        ResultState.tag.test_value = np.random.rand(batch_size, 2)
        Reward = T.col("Reward")
        Reward.tag.test_value = np.random.rand(batch_size, 1)
        Action = T.icol("Action")
        Action.tag.test_value = np.zeros((batch_size, 1),
                                         dtype=np.dtype('int32'))
        # Q_val = T.fmatrix()

        # model = T.nnet.sigmoid(T.dot(State, self._w) + self._b.reshape((1, -1)))
        # self._model = theano.function(inputs=[State], outputs=model, allow_input_downcast=True)
        _py_xA = self.model(State, self._w_h, self._b_h, self._w_h2,
                            self._b_h2, self._w_o, self._b_o, 0.0, 0.0)
        _py_xB = self.model(State, self._w_h_old, self._b_h_old,
                            self._w_h2_old, self._b_h2_old, self._w_o_old,
                            self._b_o_old, 0.0, 0.0)
        self._y_predA = T.argmax(_py_xA, axis=1)
        self._y_predB = T.argmax(_py_xB, axis=1)
        self._q_funcA = T.mean(
            (self.model(State, self._w_h, self._b_h, self._w_h2, self._b_h2,
                        self._w_o, self._b_o, 0.0,
                        0.0))[T.arange(batch_size),
                              Action.reshape((-1, ))].reshape((-1, 1)))
        self._q_funcB = T.mean(
            (self.model(State, self._w_h_old, self._b_h_old, self._w_h2_old,
                        self._b_h2_old, self._w_o_old, self._b_o_old, 0.0,
                        0.0))[T.arange(batch_size),
                              Action.reshape((-1, ))].reshape((-1, 1)))
        # q_val = py_x
        # noisey_q_val = self.model(ResultState, self._w_h, self._b_h, self._w_h2, self._b_h2, self._w_o, self._b_o, 0.2, 0.5)

        # L1 norm ; one regularization option is to enforce L1 norm to
        # be small
        self._L1_A = (abs(self._w_h).sum() + abs(self._w_h2).sum() +
                      abs(self._w_o).sum())
        self._L1_B = (abs(self._w_h_old).sum() + abs(self._w_h2_old).sum() +
                      abs(self._w_o_old).sum())
        self._L1_reg = 0.0
        self._L2_reg = 0.001
        # L2 norm ; one regularization option is to enforce
        # L2 norm to be small
        self._L2_A = ((self._w_h**2).sum() + (self._w_h2**2).sum() +
                      (self._w_o**2).sum())
        self._L2_B = ((self._w_h_old**2).sum() + (self._w_h2_old**2).sum() +
                      (self._w_o_old**2).sum())

        # cost = T.mean(T.nnet.categorical_crossentropy(py_x, Y))
        # delta = ((Reward.reshape((-1, 1)) + (self._discount_factor * T.max(self.model(ResultState), axis=1, keepdims=True)) ) - self.model(State))
        deltaA = ((Reward + (self._discount_factor * T.max(self.model(
            ResultState, self._w_h_old, self._b_h_old, self._w_h2_old,
            self._b_h2_old, self._w_o_old, self._b_o_old, 0.2, 0.5),
                                                           axis=1,
                                                           keepdims=True))) -
                  (self.model(State, self._w_h, self._b_h, self._w_h2,
                              self._b_h2, self._w_o, self._b_o, 0.2,
                              0.5))[T.arange(Action.shape[0]),
                                    Action.reshape((-1, ))].reshape((-1, 1)))
        deltaB = (
            (Reward +
             (self._discount_factor *
              T.max(self.model(ResultState, self._w_h, self._b_h, self._w_h2,
                               self._b_h2, self._w_o, self._b_o, 0.2, 0.5),
                    axis=1,
                    keepdims=True))) -
            (self.model(State, self._w_h_old, self._b_h_old, self._w_h2_old,
                        self._b_h2_old, self._w_o_old, self._b_o_old, 0.2,
                        0.5))[T.arange(Action.shape[0]),
                              Action.reshape((-1, ))].reshape((-1, 1)))
        # bellman_cost = T.mean( 0.5 * ((delta) ** 2 ))
        bellman_costA = T.mean(0.5 * ((deltaA)**2)) + (
            self._L2_reg * self._L2_A) + (self._L1_reg * self._L1_A)
        bellman_costB = T.mean(0.5 * ((deltaB)**2)) + (
            self._L2_reg * self._L2_B) + (self._L1_reg * self._L1_B)

        paramsA = [
            self._w_h, self._b_h, self._w_h2, self._b_h2, self._w_o, self._b_o
        ]
        paramsB = [
            self._w_h_old, self._b_h_old, self._w_h2_old, self._b_h2_old,
            self._w_o_old, self._b_o_old
        ]
        # updates = sgd(bellman_cost, params, lr=self._learning_rate)
        updatesA = rlTDSGD(self._q_funcA,
                           T.mean(deltaA),
                           paramsA,
                           lr=self._learning_rate)
        updatesB = rlTDSGD(self._q_funcB,
                           T.mean(deltaB),
                           paramsB,
                           lr=self._learning_rate)
        # updates = RMSprop(bellman_cost, params, lr=self._learning_rate)
        # updates = RMSpropRL(q_func, T.mean(delta), params, lr=self._learning_rate)
        # updates = lasagne.updates.rmsprop(bellman_cost, params, self._learning_rate, 0.95, 0.01)
        # updatesA = lasagne.updates.rmsprop(self._q_funcA, paramsA, self._learning_rate * -T.mean(deltaA), 0.95, 0.01)
        # updatesB = lasagne.updates.rmsprop(self._q_funcB, paramsB, self._learning_rate * -T.mean(deltaB), 0.95, 0.01)

        self._trainA = theano.function(
            inputs=[State, Action, Reward, ResultState],
            outputs=bellman_costA,
            updates=updatesA,
            allow_input_downcast=True)
        self._trainB = theano.function(
            inputs=[State, Action, Reward, ResultState],
            outputs=bellman_costB,
            updates=updatesB,
            allow_input_downcast=True)
        self._bellman_errorA = theano.function(
            inputs=[State, Action, Reward, ResultState],
            outputs=deltaA,
            allow_input_downcast=True)
        self._bellman_errorB = theano.function(
            inputs=[State, Action, Reward, ResultState],
            outputs=deltaB,
            allow_input_downcast=True)
        self._q_valuesA = theano.function(inputs=[State],
                                          outputs=_py_xA,
                                          allow_input_downcast=True)
        self._q_valuesB = theano.function(inputs=[State],
                                          outputs=_py_xB,
                                          allow_input_downcast=True)
        self._py_xA = theano.function(inputs=[State],
                                      outputs=_py_xA,
                                      allow_input_downcast=True)
        self._py_xB = theano.function(inputs=[State],
                                      outputs=_py_xB,
                                      allow_input_downcast=True)

        x, y = T.matrices('x', 'y')
        z_lazy = ifelse(T.gt(T.max(x, axis=1)[0],
                             T.max(y, axis=1)[0]), T.argmax(x, axis=1),
                        T.argmax(y, axis=1))
        self._f_lazyifelse = theano.function([x, y],
                                             z_lazy,
                                             mode=theano.Mode(linker='vm'))
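These examples pick out Q(s, a) for the chosen action either with a one-hot action mask (T.eq plus a sum, as in the first example) or with arange-based fancy indexing (as in the models above). A minimal NumPy sketch showing the two are equivalent (array values are illustrative):

import numpy as np

q_vals = np.array([[0.1, 0.7, 0.2],
                   [0.4, 0.3, 0.9]])
actions = np.array([[1], [2]], dtype=np.int32)      # column of chosen actions
num_actions = q_vals.shape[1]

mask = (np.arange(num_actions).reshape(1, -1) == actions.reshape(-1, 1)).astype(q_vals.dtype)
picked_via_mask = (q_vals * mask).sum(axis=1).reshape(-1, 1)
picked_via_index = q_vals[np.arange(q_vals.shape[0]), actions.reshape(-1)].reshape(-1, 1)

assert np.allclose(picked_via_mask, picked_via_index)
print(picked_via_mask.ravel())   # [0.7 0.9]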
Example #41
0
    def _update_classifier(self, data, labels, w, classes):
        """Update the classifier parameters theta and bias

        Parameters
        ----------

        data : list of 2D arrays, element i has shape=[voxels_i, samples_i]
            Each element in the list contains the fMRI data of one subject for
            the classification task.

        labels : list of arrays of int, element i has shape=[samples_i]
            Each element in the list contains the labels for the data samples
            in data_sup.

        w : list of 2D array, element i has shape=[voxels_i, features]
            The orthogonal transforms (mappings) :math:`W_i` for each subject.

        classes : int
            The number of classes in the classifier.


        Returns
        -------

        theta : array, shape=[features, classes]
            The MLR parameter for the class planes.

        bias : array shape=[classes,]
            The MLR parameter for class biases.
        """

        # Stack the data and labels for training the classifier
        data_stacked, labels_stacked, weights = \
            SSSRM._stack_list(data, labels, w)

        features = w[0].shape[1]
        total_samples = weights.size

        data_th = S.shared(data_stacked.astype(theano.config.floatX))
        val_ = S.shared(labels_stacked)
        total_samples_S = S.shared(total_samples)
        theta_th = T.matrix(name='theta', dtype=theano.config.floatX)
        bias_th = T.col(name='bias', dtype=theano.config.floatX)
        constf2 = S.shared(self.alpha / self.gamma, allow_downcast=True)
        weights_th = S.shared(weights)

        log_p_y_given_x = \
            T.log(T.nnet.softmax((theta_th.T.dot(data_th.T)).T + bias_th.T))
        f = -constf2 * T.sum(
            (log_p_y_given_x[T.arange(total_samples_S), val_]) /
            weights_th) + 0.5 * T.sum(theta_th**2)

        manifold = Product((Euclidean(features,
                                      classes), Euclidean(classes, 1)))
        problem = Problem(manifold=manifold,
                          cost=f,
                          arg=[theta_th, bias_th],
                          verbosity=0)
        solver = ConjugateGradient(mingradnorm=1e-6)
        solution = solver.solve(problem)
        theta = solution[0]
        bias = solution[1]

        del constf2
        del theta_th
        del bias_th
        del data_th
        del val_
        del solver
        del solution

        return theta, bias
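The cost built above is a sample-weighted multinomial logistic regression negative log-likelihood plus an L2 penalty on theta, optimized on a product manifold. A minimal NumPy sketch of the same cost (all array values and the constf2 factor are illustrative):

import numpy as np

def mlr_cost(theta, bias, data, labels, weights, constf2):
    # data: (samples, features); theta: (features, classes); bias: (classes, 1)
    logits = data.dot(theta) + bias.T
    logits -= logits.max(axis=1, keepdims=True)            # numerically stable softmax
    log_softmax = logits - np.log(np.exp(logits).sum(axis=1, keepdims=True))
    log_p = log_softmax[np.arange(data.shape[0]), labels]
    return -constf2 * np.sum(log_p / weights) + 0.5 * np.sum(theta ** 2)

rng = np.random.RandomState(0)
data, labels, weights = rng.rand(5, 3), np.array([0, 1, 1, 0, 1]), np.ones(5)
print(mlr_cost(rng.rand(3, 2), rng.rand(2, 1), data, labels, weights, constf2=0.5))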
theano_dot = theano.function([theano_matrix1, theano_matrix2],
                             T.dot(theano_matrix1, theano_matrix2),
                             name='theano_dot')

theano_scalar = T.fscalar(name='theano_scalar')
theano_scale = theano.function([theano_matrix1, theano_scalar],
                               theano_matrix1 * theano_scalar,
                               name='scale')

# elementwise product
theano_multiply = theano.function([theano_matrix1, theano_matrix2],
                                  theano_matrix1 * theano_matrix2,
                                  name='theano_multiply')

theano_row_vector = T.row(name='theano_row_vector')
theano_col_vector = T.col(name='theano_col_vector')

theano_subtract_row = theano.function([theano_matrix1, theano_row_vector],
                                      theano_matrix1 - theano_row_vector,
                                      name='theano_subtract_row')
theano_divide_row = theano.function([theano_matrix1, theano_row_vector],
                                    theano_matrix1 / theano_row_vector,
                                    name='theano_divide_row')
theano_subtract_col = theano.function([theano_matrix1, theano_col_vector],
                                      theano_matrix1 - theano_col_vector,
                                      name='theano_subtract_col')
theano_divide_col = theano.function([theano_matrix1, theano_col_vector],
                                    theano_matrix1 / theano_col_vector,
                                    name='theano_divide_col')

theano_var1 = theano.function([theano_matrix1],
Example #43
0
    def __init__(self, input_width, input_height, avail_actions, num_actions,
                 num_frames, discount, learning_rate, rho,
                 rms_epsilon, momentum, clip_delta, freeze_interval,
                 batch_size, network_type, update_rule,
                 batch_accumulator, rng, train_all, input_scale=255.0):

        self.input_width = input_width
        self.input_height = input_height
        self.avail_actions = avail_actions
        self.num_actions = num_actions
        self.num_frames = num_frames
        self.batch_size = batch_size
        self.discount = discount
        self.rho = rho
        self.lr = learning_rate
        self.rms_epsilon = rms_epsilon
        self.momentum = momentum
        self.clip_delta = clip_delta
        self.freeze_interval = freeze_interval
        self.rng = rng
        self.train_all = train_all

        lasagne.random.set_rng(self.rng)

        self.update_counter = 0

        print "num_actions: " + str(num_actions)
        self.l_out = self.build_network(network_type, input_width, input_height,
                                        num_actions, num_frames, batch_size)
        if self.freeze_interval > 0:
            self.next_l_out = self.build_network(network_type, input_width,
                                                 input_height, num_actions,
                                                 num_frames, batch_size)
            self.reset_q_hat()

        states = T.tensor4('states')
        next_states = T.tensor4('next_states')
        rewards = T.col('rewards')
        actions = T.icol('actions')
        terminals = T.icol('terminals')

        self.states_shared = theano.shared(
            np.zeros((batch_size, num_frames, input_height, input_width),
                     dtype=theano.config.floatX))

        self.next_states_shared = theano.shared(
            np.zeros((batch_size, num_frames, input_height, input_width),
                     dtype=theano.config.floatX))

        self.rewards_shared = theano.shared(
            np.zeros((batch_size, 1), dtype=theano.config.floatX),
            broadcastable=(False, True))

        self.actions_shared = theano.shared(
            np.zeros((batch_size, 1), dtype='int32'),
            broadcastable=(False, True))

        self.terminals_shared = theano.shared(
            np.zeros((batch_size, 1), dtype='int32'),
            broadcastable=(False, True))

        q_vals = lasagne.layers.get_output(self.l_out, states / input_scale)

        if self.freeze_interval > 0:
            next_q_vals = lasagne.layers.get_output(self.next_l_out,
                                                    next_states / input_scale)
        else:
            next_q_vals = lasagne.layers.get_output(self.l_out,
                                                    next_states / input_scale)
            next_q_vals = theano.gradient.disconnected_grad(next_q_vals)

        target = (rewards +
                  (T.ones_like(terminals) - terminals) *
                  self.discount * T.max(next_q_vals, axis=1, keepdims=True))
        diff = target - q_vals[T.arange(batch_size),
                               actions.reshape((-1,))].reshape((-1, 1))

        if self.clip_delta > 0:
            # If we simply take the squared clipped diff as our loss,
            # then the gradient will be zero whenever the diff exceeds
            # the clip bounds. To avoid this, we extend the loss
            # linearly past the clip point to keep the gradient constant
            # in that regime.
            #
            # This is equivalent to declaring d loss/d q_vals to be
            # equal to the clipped diff, then backpropagating from
            # there, which is what the DeepMind implementation does.
            quadratic_part = T.minimum(abs(diff), self.clip_delta)
            linear_part = abs(diff) - quadratic_part
            loss = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part
        else:
            loss = 0.5 * diff ** 2

        if batch_accumulator == 'sum':
            loss = T.sum(loss)
        elif batch_accumulator == 'mean':
            loss = T.mean(loss)
        else:
            raise ValueError("Bad accumulator: {}".format(batch_accumulator))

        params = lasagne.layers.helper.get_all_params(self.l_out)
        givens = {
            states: self.states_shared,
            next_states: self.next_states_shared,
            rewards: self.rewards_shared,
            actions: self.actions_shared,
            terminals: self.terminals_shared
        }
        if update_rule == 'deepmind_rmsprop':
            updates = deepmind_rmsprop(loss, params, self.lr, self.rho,
                                       self.rms_epsilon)
        elif update_rule == 'rmsprop':
            updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho,
                                              self.rms_epsilon)
        elif update_rule == 'sgd':
            updates = lasagne.updates.sgd(loss, params, self.lr)
        else:
            raise ValueError("Unrecognized update: {}".format(update_rule))

        if self.momentum > 0:
            updates = lasagne.updates.apply_momentum(updates, None,
                                                     self.momentum)

        self._train = theano.function([], [loss, q_vals], updates=updates,
                                      givens=givens)
        self._q_vals = theano.function([], q_vals,
                                       givens={states: self.states_shared})
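Most of these examples declare their (batch, 1) shared columns with broadcastable=(False, True) so they can stand in for T.col variables in givens. A minimal Theano sketch of that pattern in isolation (names and sizes are illustrative):

import numpy as np
import theano
import theano.tensor as T

batch_size = 4
rewards = T.col('rewards')                       # symbolic (n, 1) column, pattern (False, True)
rewards_shared = theano.shared(
    np.zeros((batch_size, 1), dtype=theano.config.floatX),
    broadcastable=(False, True))                 # same broadcast pattern, so givens accepts it

doubled = theano.function([], rewards * 2.0, givens={rewards: rewards_shared})
rewards_shared.set_value(
    np.arange(batch_size, dtype=theano.config.floatX).reshape(-1, 1))
print(doubled())                                 # [[0.] [2.] [4.] [6.]]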
Example #44
0
    def __init__(self, datasetPaths=None, keyPoints=None):

        self.xMax = 1024.0
        self.yMax = 576.0
        self.numKeyPoints = 68
        self.loadPicasaTubePickle()

        loadPrev = 1

        if loadPrev == 1:
            pkl_file = open('faceAlign2.pkl', 'rb')
            self.pose, self.landmarks, self.poseDict, self.images, self.poseCat = pickle.load(
                pkl_file)
            pkl_file.close()
        else:
            self.loadData()
            output = open('faceAlign2.pkl', 'wb')
            data = (self.pose, self.landmarks, self.poseDict, self.images,
                    self.poseCat)
            pickle.dump(data, output)
            output.close()
            return

        self.eeta = 0.0000001
        self.mu = theano.shared(10 * numpy.random.random(
            (2 * self.numKeyPoints, 1)))
        self.S = theano.shared(numpy.eye(2 * self.numKeyPoints))
        self.alpha = theano.shared(0.1 * numpy.ones(
            (2 * self.numKeyPoints, 1)))
        theano.config.compute_test_value = 'warn'
        oneCol = T.col('oneCol')
        oneCol.tag.test_value = numpy.ones((self.numKeyPoints, 1))
        pi_t = T.col('pi_t')
        pi_t.tag.test_value = numpy.random.random((2 * self.numKeyPoints, 1))
        temp = numpy.random.random((3, 3))
        #temp = numpy.zeros((3,3))
        temp[2, :] = [0, 0, 1]
        self.A_t = theano.shared(temp, name='A_t')
        #print_A_t = theano.printing.Print('r_t1')(A_t)
        z_t = T.col('z_t')
        z_t.tag.test_value = numpy.random.random((2 * self.numKeyPoints, 1))
        z_t1 = z_t.reshape((self.numKeyPoints, 2))

        pts = T.concatenate((z_t1, oneCol), axis=1)
        #      pts = theano.printing.Print('pts')(pts)
        r_t = T.dot(self.A_t, pts.transpose()).transpose()
        r_t1 = r_t[:, 0:2].reshape((2 * self.numKeyPoints, 1))
        #pi_tt = theano.printing.Print('pi_t before')(pi_t)
        diff = pi_t * (r_t1 - self.mu)
        difft = diff.reshape((1, 2 * self.numKeyPoints))
        #diff = theano.printing.Print('diff:')(diff)
        cost = T.max(T.dot(T.dot(difft, self.S), diff))
        #cost = theano.printing.Print('cost:')(cost)
        A_t_grad = T.grad(cost=cost, wrt=self.A_t)
        A_t_grad = T.basic.set_subtensor(A_t_grad[2, :], 0)
        #A_t_grad = theano.printing.Print('r_t1')(A_t_grad)
        update = (self.A_t, self.A_t - self.eeta * A_t_grad)
        self.align = theano.function(inputs=[pi_t, z_t, oneCol],
                                     outputs=[self.A_t, cost],
                                     updates=[update],
                                     on_unused_input='warn',
                                     allow_input_downcast=True)

        #for numpy optimization
        A_t_ = T.matrix('A_t_')
        #A_t_.tag.test_value = temp
        #A_t_ = A_t_.reshape((3,3))
        A_t_.tag.test_value = temp
        #print_A_t = theano.printing.Print('r_t1')(A_t)
        r_t_ = T.dot(A_t_, pts.transpose()).transpose()
        r_t1_ = r_t_[:, 0:2].reshape((2 * self.numKeyPoints, 1))
        #pi_tt = theano.printing.Print('pi_t before')(pi_t)
        diff_ = pi_t * (r_t1_ - self.mu)
        difft_ = diff_.reshape((1, 2 * self.numKeyPoints))

        #diff = theano.printing.Print('diff:')(diff)
        cost_1 = T.dot(T.dot(difft_, self.S), diff_)
        #cost_1 = theano.printing.Print('cost is:')(cost_1)
        cost_ = T.max(cost_1)

        A_t_grad_ = T.grad(cost=cost_, wrt=A_t_)
        A_t_grad_ = T.basic.set_subtensor(A_t_grad_[2, :], 0)
        #A_t_grad_ = A_t_grad_.reshape((9,1))

        self.cost = theano.function(inputs=[A_t_, pi_t, z_t, oneCol],
                                    outputs=[cost_, A_t_grad_])
        i = T.iscalar('index')
        i.tag.test_value = 0
        subS = self.S[2 * i:2 * i + 2, 2 * i:2 * i + 2]
        #subS = theano.printing.Print('subS:')(self.S[2*i:2*i+2, 2*i:2*i+2])
        det = T.abs_(subS[0, 0] * subS[1, 1] - subS[0, 1] * subS[1, 0])
        subDiff = diff[(2 * i):((2 * i) + 2)]
        subDifft = difft[0][(2 * i):(2 * i + 2)]
        #intermed = theano.printing.Print('dotProd1:')(T.dot(subDifft,subS))
        intermed = T.dot(subDifft, subS)
        #intermed2 = theano.printing.Print('dotProd2:')(T.dot(intermed,subDiff))
        intermed2 = T.dot(intermed, subDiff)
        numrtr = T.exp(-0.5 * intermed2)
        k = 2
        dnmntr = T.sqrt((2**k) * det)
        q = numrtr / dnmntr
        temp = ((1 - self.alpha[2 * i:2 * i + 2]) *
                q) / (self.alpha[2 * i:2 * i + 2] +
                      (1 - self.alpha[2 * i:2 * i + 2]) * q)
        pi_t_out = T.basic.set_subtensor(pi_t[2 * i:2 * i + 2], temp)
        self.q_pi_update = theano.function(inputs=[i, oneCol, pi_t, z_t],
                                           outputs=[q, pi_t_out, r_t1],
                                           allow_input_downcast=True)

        self.train('12')
Example #45
0
    def __init__(self, input, n_in, n_out):

        hidden_size = 36
        batch_size = 32
        self._w_h = init_weights((n_in, hidden_size))
        self._b_h = init_b_weights((1, hidden_size))
        # self._b_h = init_b_weights((hidden_size,))
        self._w_h2 = init_weights((hidden_size, hidden_size))
        self._b_h2 = init_b_weights((1, hidden_size))
        # self._b_h2 = init_b_weights((hidden_size,))
        # self._w_o = init_tanh(hidden_size, n_out)
        self._w_o = init_weights((hidden_size, n_out))
        self._b_o = init_b_weights((1, n_out))
        # self._b_o = init_b_weights((n_out,))

        self.updateTargetModel()
        self._w_h_old = init_weights((n_in, hidden_size))
        self._w_h2_old = init_weights((hidden_size, hidden_size))
        self._w_o_old = init_tanh(hidden_size, n_out)

        # print ("Initial W " + str(self._w_o.get_value()) )

        self._learning_rate = 0.00025
        self._discount_factor = 0.99

        self._weight_update_steps = 5000
        self._updates = 0

        # data types for model
        State = T.dmatrix("State")
        State.tag.test_value = np.random.rand(batch_size, 2)
        ResultState = T.dmatrix("ResultState")
        ResultState.tag.test_value = np.random.rand(batch_size, 2)
        Reward = T.col("Reward")
        Reward.tag.test_value = np.random.rand(batch_size, 1)
        Action = T.icol("Action")
        Action.tag.test_value = np.zeros((batch_size, 1),
                                         dtype=np.dtype('int32'))
        # Q_val = T.fmatrix()

        # model = T.nnet.sigmoid(T.dot(State, self._w) + self._b.reshape((1, -1)))
        # self._model = theano.function(inputs=[State], outputs=model, allow_input_downcast=True)
        py_x = self.model(State, self._w_h, self._b_h, self._w_h2, self._b_h2,
                          self._w_o, self._b_o, 0.0, 0.0)
        y_pred = T.argmax(py_x, axis=1)
        q_func = T.mean((self.model(State, self._w_h, self._b_h, self._w_h2,
                                    self._b_h2, self._w_o, self._b_o, 0.0,
                                    0.0))[T.arange(batch_size),
                                          Action.reshape((-1, ))].reshape(
                                              (-1, 1)))
        # q_val = py_x
        # noisey_q_val = self.model(ResultState, self._w_h, self._b_h, self._w_h2, self._b_h2, self._w_o, self._b_o, 0.2, 0.5)

        # L1 norm ; one regularization option is to enforce L1 norm to
        # be small
        self._L1 = (abs(self._w_h).sum() + abs(self._w_h2).sum() +
                    abs(self._w_o).sum())
        self._L1_reg = 0.0
        self._L2_reg = 0.001
        # L2 norm ; one regularization option is to enforce
        # L2 norm to be small
        self._L2 = ((self._w_h**2).sum() + (self._w_h2**2).sum() +
                    (self._w_o**2).sum())

        # cost = T.mean(T.nnet.categorical_crossentropy(py_x, Y))
        # delta = ((Reward.reshape((-1, 1)) + (self._discount_factor * T.max(self.model(ResultState), axis=1, keepdims=True)) ) - self.model(State))
        delta = ((Reward + (self._discount_factor * T.max(self.model(
            ResultState, self._w_h_old, self._b_h_old, self._w_h2_old,
            self._b_h2_old, self._w_o_old, self._b_o_old, 0.2, 0.5),
                                                          axis=1,
                                                          keepdims=True))) -
                 (self.model(State, self._w_h, self._b_h, self._w_h2,
                             self._b_h2, self._w_o, self._b_o, 0.2,
                             0.5))[T.arange(Action.shape[0]),
                                   Action.reshape((-1, ))].reshape((-1, 1)))
        # bellman_cost = T.mean( 0.5 * ((delta) ** 2 ))
        bellman_cost = T.mean(0.5 * ((delta)**2)) + (
            self._L2_reg * self._L2) + (self._L1_reg * self._L1)

        params = [
            self._w_h, self._b_h, self._w_h2, self._b_h2, self._w_o, self._b_o
        ]
        # updates = sgd(bellman_cost, params, lr=self._learning_rate)
        # updates = rlTDSGD(q_func, T.mean(delta), params, lr=self._learning_rate)
        # updates = RMSprop(bellman_cost, params, lr=self._learning_rate)
        # updates = RMSpropRL(q_func, T.mean(delta), params, lr=self._learning_rate)
        # updates = lasagne.updates.rmsprop(bellman_cost, params, self._learning_rate, 0.95, 0.01)
        updates = lasagne.updates.rmsprop(q_func, params,
                                          self._learning_rate * -T.mean(delta),
                                          0.95, 0.01)

        self._train = theano.function(
            inputs=[State, Action, Reward, ResultState],
            outputs=bellman_cost,
            updates=updates,
            allow_input_downcast=True)
        self._predict = theano.function(inputs=[State],
                                        outputs=y_pred,
                                        allow_input_downcast=True)
        self._q_values = theano.function(inputs=[State],
                                         outputs=py_x,
                                         allow_input_downcast=True)
        self._bellman_error = theano.function(
            inputs=[State, Action, Reward, ResultState],
            outputs=delta,
            allow_input_downcast=True)
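The rmsprop call above follows q_func (the mean Q-value of the chosen actions) with a learning rate scaled by -T.mean(delta), which amounts to a TD semi-gradient step. A minimal NumPy sketch of the same idea for plain SGD and a linear Q, ignoring RMSProp's gradient scaling (all values are illustrative):

import numpy as np

w = np.array([0.5, -0.2])                  # weights of a linear Q(s, a) = w . phi(s, a)
phi = np.array([[1.0, 0.0],                # features of the sampled (state, action) pairs
                [0.0, 1.0]])
targets = np.array([1.0, 0.0])             # r + gamma * max_a' Q(s', a')

delta = targets - phi.dot(w)               # TD errors
grad_mean_q = phi.mean(axis=0)             # d mean(Q) / d w
lr = 0.1

# "Descending" mean(Q) with learning rate lr * -mean(delta) ...
w_new = w - (lr * -delta.mean()) * grad_mean_q
# ... is the same as ascending it in proportion to the mean TD error:
assert np.allclose(w_new, w + lr * delta.mean() * grad_mean_q)
print(w_new)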
Example #46
0
	def _compile_train_function(self):
		state = T.tensor4(dtype = theano.config.floatX)
		action = T.col(dtype = 'uint8')
		reward = T.col(dtype = theano.config.floatX)
		terminal = T.col(dtype = 'int8')
		next_state = T.tensor4(dtype = theano.config.floatX)

		current_values_matrix = lasagne.layers.get_output(self.net, state)
		action_mask = T.eq(T.arange(self.num_action).reshape((1, -1))
						, action.reshape((-1, 1))).astype(theano.config.floatX)
		current_values = T.sum(current_values_matrix * action_mask
								, axis = 1).reshape((-1, 1))

		if self.algorithm == 'q_learning':
			if self.tnet is not None:
				target_values = lasagne.layers.get_output(self.tnet, next_state)
			else:
				target_values = lasagne.layers.get_output(self.net, next_state)
			bootstrap_values = T.max(target_values, axis = 1, keepdims = True)

		elif self.algorithm == 'double_q_learning':
			if self.network_type == 'duel':
				# Get argmax actions from advantage values
				select_actions = self._get_action_var(self.adv_net, next_state)
			else:
				# Get argmax actions from Q values
				select_actions = self._get_action_var(self.net, next_state)
			select_mask = T.eq(T.arange(self.num_action).reshape((1, -1))
								, select_actions.astype(theano.config.floatX))

			if self.tnet is not None:
				# Evaluate argmax actions on target network
				eval_values = lasagne.layers.get_output(self.tnet, next_state)
			else:
				# Evaluate argmax actions on online network
				# (the same as q_learning but slower)
				eval_values = lasagne.layers.get_output(self.net, next_state)

			bootstrap_values = T.sum(eval_values * select_mask
									, axis = 1, keepdims = True)

		terminal_floatX = terminal.astype(theano.config.floatX)
		target_values = reward + self.discount * \
			(T.ones_like(terminal_floatX) - terminal_floatX) * bootstrap_values

		if self.tnet is None:
			target_values = theano.gradient.disconnected_grad(target_values)

		error = target_values - current_values
		if self.max_error > 0:
			# From https://github.com/spragunr/deep_q_rl/issues/46
			quadratic_term = T.minimum(abs(error), self.max_error)
			linear_term = abs(error) - quadratic_term
			loss = T.sum(0.5 * quadratic_term ** 2 + linear_term * self.max_error)
		else:
			loss = T.sum(0.5 * error ** 2)

		net_params = lasagne.layers.get_all_params(self.net)
		updates = self._get_rmsprop_updates(loss, net_params
			, lr = Network.LEARNING_RATE, grad_momentum = Network.GRAD_MOMENTUM
			, sqr_momentum = Network.SGRAD_MOMENTUM
			, min_grad = Network.MIN_SGRAD)

		train_givens = {
			state : self.shared_states[:, :-1, :, :] / Network.INPUT_SCALE,
			action : self.shared_action,
			reward : self.shared_reward,
			terminal : self.shared_terminal,
			next_state : self.shared_states[:, 1:, :, :] / Network.INPUT_SCALE,
		}
		return theano.function([], loss, updates = updates, givens = train_givens)
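In the double_q_learning branch above, the online network selects the argmax action while the target network evaluates it. A minimal NumPy sketch of that select/evaluate split (values are illustrative):

import numpy as np

online_next_q = np.array([[0.2, 0.9],
                          [0.8, 0.1]])
target_next_q = np.array([[0.3, 0.4],
                          [0.6, 0.2]])

select_actions = online_next_q.argmax(axis=1)               # choose with the online net
select_mask = (np.arange(online_next_q.shape[1]).reshape(1, -1) ==
               select_actions.reshape(-1, 1)).astype(np.float32)
bootstrap = (target_next_q * select_mask).sum(axis=1, keepdims=True)  # evaluate with the target net

print(select_actions)      # [1 0]
print(bootstrap.ravel())   # [0.4 0.6]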
Example #47
0
            
            if param_name in exp_params.keys():
                param_value = exp_params[param_name]
                break

    return param_value
########################################################################
if __name__ == "__main__":
    
    ### this block is for testing functions
    
    W = T.matrix('W')
    
    x = T.col('x')
    
    b = T.col('b')
    
    quadratic_form = energy_function(W, b, x)
    
    compute_quad_form = theano.function([W,x,b],quadratic_form)
    
    print(compute_quad_form([[1,2],[3,4]], [[1],[1]], [[1],[1]])[0][0] == 12)
    
    grad_W, grad_b = T.grad(quadratic_form[0][0], [W,b])
    
    comp_grad_W = theano.function([W,b,x], grad_W)
    
    comp_grad_b = theano.function([W,b,x], grad_b)
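energy_function itself is not shown in this fragment; assuming, consistently with the asserted value of 12, that it computes the quadratic form x^T W x + b^T x, a NumPy check of that value and the matching gradients looks like this:

import numpy as np

W = np.array([[1.0, 2.0], [3.0, 4.0]])
x = np.array([[1.0], [1.0]])
b = np.array([[1.0], [1.0]])

energy = x.T.dot(W).dot(x) + b.T.dot(x)
print(energy[0][0] == 12)          # True

grad_W = x.dot(x.T)                # d(x^T W x) / dW
grad_b = x                         # d(b^T x)  / db
print(grad_W)
print(grad_b)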
    
Example #48
0
    def __init__(self, n_in, n_out, state_bounds, action_bounds, reward_bound,
                 settings_):

        super(DeepCNNDropoutCritic,
              self).__init__(n_in, n_out, state_bounds, action_bounds,
                             reward_bound, settings_)

        # data types for model
        self._dropout_p = settings_['dropout_p']

        # data types for model
        self._State = T.matrix("State")
        self._State.tag.test_value = np.random.rand(self._batch_size,
                                                    self._state_length)
        self._ResultState = T.matrix("ResultState")
        self._ResultState.tag.test_value = np.random.rand(
            self._batch_size, self._state_length)
        self._Reward = T.col("Reward")
        self._Reward.tag.test_value = np.random.rand(self._batch_size, 1)
        self._Target = T.col("Target")
        self._Target.tag.test_value = np.random.rand(self._batch_size, 1)
        self._Action = T.matrix("Action")
        self._Action.tag.test_value = np.random.rand(self._batch_size,
                                                     self._action_length)

        # create a small convolutional neural network
        input = lasagne.layers.InputLayer((None, self._state_length),
                                          self._State)
        self._stateInputVar = input.input_var
        inputAction = lasagne.layers.InputLayer((None, self._action_length),
                                                self._Action)
        self._actionInputVar = inputAction.input_var

        taskFeatures = lasagne.layers.SliceLayer(
            input,
            indices=slice(0, self._settings['num_terrain_features']),
            axis=1)
        characterFeatures = lasagne.layers.SliceLayer(
            input,
            indices=slice(self._settings['num_terrain_features'],
                          self._state_length),
            axis=1)
        print("taskFeatures Shape:",
              lasagne.layers.get_output_shape(taskFeatures))
        print("characterFeatures Shape:",
              lasagne.layers.get_output_shape(characterFeatures))
        print("State length: ", self._state_length)

        networkAct = lasagne.layers.InputLayer((None, self._state_length),
                                               self._State)

        # taskFeaturesAct = lasagne.layers.SliceLayer(networkAct, indices=slice(0, self._settings['num_terrain_features']), axis=1)
        # characterFeaturesAct = lasagne.layers.SliceLayer(networkAct, indices=slice(self._settings['num_terrain_features'],self._state_length), axis=1)

        # taskFeaturesAct = lasagne.layers.DropoutLayer(taskFeaturesAct, p=self._dropout_p, rescale=True)
        networkAct = lasagne.layers.ReshapeLayer(
            taskFeatures, (-1, 1, self._settings['num_terrain_features']))

        networkAct = lasagne.layers.Conv1DLayer(
            networkAct,
            num_filters=16,
            filter_size=8,
            nonlinearity=lasagne.nonlinearities.leaky_rectify,
            W=lasagne.init.GlorotUniform())
        # networkAct = lasagne.layers.DropoutLayer(networkAct, p=self._dropout_p, rescale=True)

        # network = lasagne.layers.MaxPool1DLayer(network, pool_size=3)
        """
        networkAct = lasagne.layers.Conv1DLayer(
            networkAct, num_filters=32, filter_size=4,
            nonlinearity=lasagne.nonlinearities.leaky_rectify,
            W=lasagne.init.GlorotUniform())
        networkAct = lasagne.layers.DropoutLayer(networkAct, p=self._dropout_p, rescale=True)
        """
        networkAct = lasagne.layers.Conv1DLayer(
            networkAct,
            num_filters=8,
            filter_size=4,
            nonlinearity=lasagne.nonlinearities.leaky_rectify,
            W=lasagne.init.GlorotUniform())

        # network = lasagne.layers.MaxPool1DLayer(network, pool_size=3)

        self._actor_task_part = networkAct
        """ 
        networkAct = lasagne.layers.Conv1DLayer(
            networkAct, num_filters=32, filter_size=4,
            nonlinearity=lasagne.nonlinearities.leaky_rectify,
            W=lasagne.init.GlorotUniform())
        
        
        networkAct = lasagne.layers.DenseLayer(
                networkAct, num_units=128,
                nonlinearity=lasagne.nonlinearities.leaky_rectify)
        """
        networkAct = lasagne.layers.FlattenLayer(networkAct, outdim=2)
        networkAct = lasagne.layers.ConcatLayer(
            [networkAct, characterFeatures], axis=1)
        # networkAct = lasagne.layers.DropoutLayer(networkAct, p=self._dropout_p, rescale=True)

        networkAct = lasagne.layers.DenseLayer(
            networkAct,
            num_units=64,
            nonlinearity=lasagne.nonlinearities.leaky_rectify)
        # networkAct = lasagne.layers.DropoutLayer(networkAct, p=self._dropout_p, rescale=True)

        networkAct = lasagne.layers.DenseLayer(
            networkAct,
            num_units=32,
            nonlinearity=lasagne.nonlinearities.leaky_rectify)
        # networkAct = lasagne.layers.DropoutLayer(networkAct, p=self._dropout_p, rescale=True)

        self._actor = lasagne.layers.DenseLayer(
            networkAct,
            num_units=self._action_length,
            nonlinearity=lasagne.nonlinearities.linear)

        if (self._settings['use_stocastic_policy']):
            with_std = lasagne.layers.DenseLayer(
                networkAct,
                num_units=self._action_length,
                nonlinearity=theano.tensor.nnet.softplus)
            self._actor = lasagne.layers.ConcatLayer([self._actor, with_std],
                                                     axis=1)

        if (settings_['agent_name'] == 'algorithm.DPG.DPG'):
            characterFeatures = lasagne.layers.ConcatLayer(
                [characterFeatures, inputAction])

        # taskFeatures = lasagne.layers.DropoutLayer(taskFeatures, p=self._dropout_p, rescale=True)
        network = lasagne.layers.ReshapeLayer(
            taskFeatures, (-1, 1, self._settings['num_terrain_features']))

        network = lasagne.layers.Conv1DLayer(
            network,
            num_filters=16,
            filter_size=8,
            nonlinearity=lasagne.nonlinearities.leaky_rectify,
            W=lasagne.init.GlorotUniform())
        network = lasagne.layers.DropoutLayer(network,
                                              p=self._dropout_p,
                                              rescale=True)

        # network = lasagne.layers.MaxPool1DLayer(network, pool_size=3)
        """
        network = lasagne.layers.Conv1DLayer(
            network, num_filters=32, filter_size=4,
            nonlinearity=lasagne.nonlinearities.leaky_rectify,
            W=lasagne.init.GlorotUniform())
        network = lasagne.layers.DropoutLayer(network, p=self._dropout_p, rescale=True)
        """
        network = lasagne.layers.Conv1DLayer(
            network,
            num_filters=8,
            filter_size=4,
            nonlinearity=lasagne.nonlinearities.leaky_rectify,
            W=lasagne.init.GlorotUniform())

        self._critic_task_part = network
        """
        # network = lasagne.layers.MaxPool1DLayer(network, pool_size=3)
        
        network = lasagne.layers.Conv1DLayer(
            network, num_filters=32, filter_size=4,
            nonlinearity=lasagne.nonlinearities.leaky_rectify,
            W=lasagne.init.GlorotUniform())
        
        network = lasagne.layers.DenseLayer(
                network, num_units=128,
                nonlinearity=lasagne.nonlinearities.leaky_rectify)
        """
        network = lasagne.layers.FlattenLayer(network, outdim=2)
        network = lasagne.layers.ConcatLayer([network, characterFeatures],
                                             axis=1)
        # network = lasagne.layers.DropoutLayer(network, p=self._dropout_p, rescale=True)

        network = lasagne.layers.DenseLayer(
            network,
            num_units=64,
            nonlinearity=lasagne.nonlinearities.leaky_rectify)
        network = lasagne.layers.DropoutLayer(network,
                                              p=self._dropout_p,
                                              rescale=True)

        network = lasagne.layers.DenseLayer(
            network,
            num_units=32,
            nonlinearity=lasagne.nonlinearities.leaky_rectify)
        network = lasagne.layers.DropoutLayer(network,
                                              p=self._dropout_p,
                                              rescale=True)

        network = lasagne.layers.DenseLayer(
            network,
            num_units=16,
            nonlinearity=lasagne.nonlinearities.leaky_rectify)
        # network = lasagne.layers.DropoutLayer(network, p=self._dropout_p, rescale=True)

        self._critic = lasagne.layers.DenseLayer(
            network, num_units=1, nonlinearity=lasagne.nonlinearities.linear)

        self._states_shared = theano.shared(
            np.zeros((self._batch_size, self._state_length),
                     dtype=theano.config.floatX))

        self._next_states_shared = theano.shared(
            np.zeros((self._batch_size, self._state_length),
                     dtype=theano.config.floatX))

        self._rewards_shared = theano.shared(np.zeros(
            (self._batch_size, 1), dtype=theano.config.floatX),
                                             broadcastable=(False, True))

        self._target_shared = theano.shared(np.zeros(
            (self._batch_size, 1), dtype=theano.config.floatX),
                                            broadcastable=(False, True))

        self._actions_shared = theano.shared(
            np.zeros((self._batch_size, self._action_length),
                     dtype=theano.config.floatX), )
Example #49
0
    def __init__(self, input_width, input_height, num_actions,
                 num_frames, discount, learning_rate, rho,
                 rms_epsilon, momentum, clip_delta, freeze_interval,
                 batch_size, network_type, update_rule,
                 batch_accumulator, input_scale=255.0, reward_bias=0.):

        self.input_width = input_width
        self.input_height = input_height
        self.num_actions = num_actions
        self.num_frames = num_frames
        self.batch_size = batch_size
        self.discount = discount
        self.rho = rho
        self.lr = learning_rate
        self.rms_epsilon = rms_epsilon
        self.momentum = momentum
        self.clip_delta = clip_delta
        self.freeze_interval = freeze_interval

        self.update_counter = 0

        self.l_out = self.build_network(network_type, input_width, input_height,
                                        num_actions, num_frames, batch_size)
        if self.freeze_interval > 0:
            self.next_l_out = self.build_network(network_type, input_width,
                                                 input_height, num_actions,
                                                 num_frames, batch_size)
            self.reset_q_hat()

        states = T.tensor4('states')
        next_states = T.tensor4('next_states')
        rewards = T.col('rewards')
        actions = T.icol('actions')
        terminals = T.icol('terminals')

        self.states_shared = theano.shared(
            np.zeros((batch_size, num_frames, input_height, input_width),
                     dtype=theano.config.floatX))

        self.next_states_shared = theano.shared(
            np.zeros((batch_size, num_frames, input_height, input_width),
                     dtype=theano.config.floatX))

        self.rewards_shared = theano.shared(
            np.zeros((batch_size, 1), dtype=theano.config.floatX),
            broadcastable=(False, True))

        self.actions_shared = theano.shared(
            np.zeros((batch_size, 1), dtype='int32'),
            broadcastable=(False, True))

        self.terminals_shared = theano.shared(
            np.zeros((batch_size, 1), dtype='int32'),
            broadcastable=(False, True))

        q_vals = lasagne.layers.get_output(self.l_out, states / input_scale)
        if self.freeze_interval > 0:
            next_q_vals = lasagne.layers.get_output(self.next_l_out,
                                                    next_states / input_scale)
        else:
            next_q_vals = lasagne.layers.get_output(self.l_out,
                                                    next_states / input_scale)
            next_q_vals = theano.gradient.disconnected_grad(next_q_vals)

        target = (rewards + reward_bias +
                  (T.ones_like(terminals) - terminals) *
                  self.discount * T.max(next_q_vals, axis=1, keepdims=True))
        diff = target - q_vals[T.arange(batch_size),
                               actions.reshape((-1,))].reshape((-1, 1))

        if self.clip_delta > 0:
            diff = diff.clip(-self.clip_delta, self.clip_delta)

        if batch_accumulator == 'sum':
            loss = T.sum(diff ** 2)
        elif batch_accumulator == 'mean':
            loss = T.mean(diff ** 2)
        else:
            raise ValueError("Bad accumulator: {}".format(batch_accumulator))

        params = lasagne.layers.helper.get_all_params(self.l_out)
        givens = {
            states: self.states_shared,
            next_states: self.next_states_shared,
            rewards: self.rewards_shared,
            actions: self.actions_shared,
            terminals: self.terminals_shared
        }
        if update_rule == 'deepmind_rmsprop':
            updates = deepmind_rmsprop(loss, params, self.lr, self.rho,
                                       self.rms_epsilon)
        elif update_rule == 'rmsprop':
            updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho,
                                              self.rms_epsilon)
        elif update_rule == 'sgd':
            updates = lasagne.updates.sgd(loss, params, self.lr)
        else:
            raise ValueError("Unrecognized update: {}".format(update_rule))

        if self.momentum > 0:
            updates = lasagne.updates.apply_momentum(updates, None,
                                                     self.momentum)

        self._train = theano.function([], [loss, q_vals], updates=updates,
                                      givens=givens)
        self._q_vals = theano.function([], q_vals,
                                       givens={states: self.states_shared})
Example #50
0
File: DPG.py  Project: skylbc/SMBAE
    def __init__(self, model, n_in, n_out, state_bounds, action_bounds,
                 reward_bound, settings_):
        """
            In order to get this to work we need to be careful not to update the actor parameters
            when updating the critic. This can be an issue when the Concatenating networks together.
            The first first network becomes a part of the second. However you can still access the first
            network by itself but an updates on the second network will effect the first network.
            Care needs to be taken to make sure only the parameters of the second network are updated.
        """

        super(DPG, self).__init__(model, n_in, n_out, state_bounds,
                                  action_bounds, reward_bound, settings_)

        self._Fallen = T.bcol("Fallen")
        ## because float64 <= float32 * int32, need to use int16 or int8
        self._Fallen.tag.test_value = np.zeros((self._batch_size, 1),
                                               dtype=np.dtype('int8'))

        self._fallen_shared = theano.shared(np.zeros((self._batch_size, 1),
                                                     dtype='int8'),
                                            broadcastable=(False, True))

        self._Action = T.matrix("Action2")
        self._Action.tag.test_value = np.random.rand(self._batch_size,
                                                     self._action_length)

        self._Tmp_Target = T.col("Tmp_Target")
        self._Tmp_Target.tag.test_value = np.zeros(
            (self._batch_size, 1),
            dtype=np.dtype(self.getSettings()['float_type']))

        self._tmp_target_shared = theano.shared(np.zeros(
            (self._batch_size, 1), dtype=self.getSettings()['float_type']),
                                                broadcastable=(False, True))

        self._modelTarget = copy.deepcopy(model)

        # print ("Initial W " + str(self._w_o.get_value()) )

        self._learning_rate = self.getSettings()['learning_rate']
        self._discount_factor = self.getSettings()['discount_factor']
        self._rho = self.getSettings()['rho']
        self._rms_epsilon = self.getSettings()['rms_epsilon']

        self._weight_update_steps = self.getSettings(
        )['steps_until_target_network_update']
        self._updates = 0
        self._decay_weight = self.getSettings()['regularization_weight']
        self._critic_regularization_weight = self.getSettings(
        )["critic_regularization_weight"]
        self._critic_learning_rate = self.getSettings()["critic_learning_rate"]

        # self._q_valsA = lasagne.layers.get_output(self._model.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=True)
        # self._q_valsA_drop = lasagne.layers.get_output(self._model.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=False)
        # self._q_valsNextState = lasagne.layers.get_output(self._model.getCriticNetwork(), self._model.getResultStateSymbolicVariable(), deterministic=True)
        # self._q_valsTargetNextState = lasagne.layers.get_output(self._modelTarget.getCriticNetwork(), self._model.getResultStateSymbolicVariable(), deterministic=True)
        # self._q_valsTarget = lasagne.layers.get_output(self._modelTarget.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=True)
        # self._q_valsTarget_drop = lasagne.layers.get_output(self._modelTarget.getCriticNetwork(), self._model.getStateSymbolicVariable(), deterministic=False)

        self._q_valsActA = lasagne.layers.get_output(
            self._model.getActorNetwork(),
            self._model.getStateSymbolicVariable(),
            deterministic=True)
        self._q_valsActTarget = lasagne.layers.get_output(
            self._modelTarget.getActorNetwork(),
            self._model.getResultStateSymbolicVariable(),
            deterministic=True)
        # self._q_valsActA_drop = lasagne.layers.get_output(self._model.getActorNetwork(), self._model.getStateSymbolicVariable(), deterministic=False)

        inputs_1 = {
            self._model.getStateSymbolicVariable(): self._model.getStates(),
            self._model.getActionSymbolicVariable(): self._model.getActions()
        }
        self._q_valsA = lasagne.layers.get_output(
            self._model.getCriticNetwork(), inputs_1)
        inputs_1_policy = {
            self._model.getStateSymbolicVariable(): self._model.getStates(),
            self._model.getActionSymbolicVariable(): self._q_valsActA
        }
        self._q_vals_train_policy = lasagne.layers.get_output(
            self._model.getCriticNetwork(), inputs_1_policy)
        inputs_2 = {
            self._modelTarget.getStateSymbolicVariable():
            self._model.getResultStates(),
            self._modelTarget.getActionSymbolicVariable():
            self._model.getActions()
        }
        self._q_valsB_ = lasagne.layers.get_output(
            self._modelTarget.getCriticNetwork(), inputs_2, deterministic=True)

        self._q_func = self._q_valsA
        self._q_funcB = self._q_valsB_
        # self._q_funcTarget = self._q_valsTarget
        # self._q_func_drop = self._q_valsA_drop
        # self._q_funcTarget_drop = self._q_valsTarget_drop
        self._q_funcAct = self._q_valsActA
        # self._q_funcAct_drop = self._q_valsActA_drop

        # self._q_funcAct = theano.function(inputs=[State], outputs=self._q_valsActA, allow_input_downcast=True)

        # self._target = T.mul(T.add(self._model.getRewardSymbolicVariable(), T.mul(self._discount_factor, self._q_valsB )), self._Fallen)
        self._diff = self._Tmp_Target - self._q_func
        # self._diff_drop = self._target - self._q_func_drop
        # loss = 0.5 * self._diff ** 2
        loss = T.pow(self._diff, 2)
        self._loss = T.mean(loss)
        # self._loss_drop = T.mean(0.5 * self._diff_drop ** 2)

        # assert len(lasagne.layers.helper.get_all_params(self._l_outA)) == 16
        # Need to remove the action layers from these params
        self._params = lasagne.layers.helper.get_all_params(
            self._model.getCriticNetwork())
        print("******Number of Layers is: " + str(
            len(
                lasagne.layers.helper.get_all_params(
                    self._model.getCriticNetwork()))))
        print("******Number of Action Layers is: " + str(
            len(
                lasagne.layers.helper.get_all_params(
                    self._model.getActorNetwork()))))
        self._actionParams = lasagne.layers.helper.get_all_params(
            self._model.getActorNetwork())
        self._givens_ = {
            self._model.getStateSymbolicVariable():
            self._model.getStates(),
            self._model.getActionSymbolicVariable():
            self._model.getActions(),
            # self._Action:  self._q_valsActTarget,
            # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(),
            # self._model.getRewardSymbolicVariable(): self._model.getRewards(),
            # self._Fallen: self._fallen_shared
            self._Tmp_Target:
            self._tmp_target_shared
        }
        self._actGivens = {
            self._model.getStateSymbolicVariable(): self._model.getStates(),
            # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(),
            # self._model.getRewardSymbolicVariable(): self._model.getRewards(),
            # self._model.getActionSymbolicVariable(): self._model.getActions(),
            # self._Fallen: self._fallen_shared
            # self._tmp_diff: self._tmp_diff_shared
        }

        self._critic_regularization = (
            self._critic_regularization_weight *
            lasagne.regularization.regularize_network_params(
                self._model.getCriticNetwork(), lasagne.regularization.l2))

        ## MSE update
        self._value_grad = T.grad(self._loss + self._critic_regularization,
                                  self._params)
        print("Optimizing Value Function with ",
              self.getSettings()['optimizer'], " method")
        self._updates_ = lasagne.updates.adam(self._value_grad,
                                              self._params,
                                              self._critic_learning_rate,
                                              beta1=0.9,
                                              beta2=0.9,
                                              epsilon=self._rms_epsilon)

        self._givens_grad = {
            self._model.getStateSymbolicVariable():
            self._model.getStates(),
            # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(),
            # self._model.getRewardSymbolicVariable(): self._model.getRewards(),
            self._model.getActionSymbolicVariable():
            self._model.getActions(),
        }

        ## Some cool stuff to backprop action gradients

        self._action_grad = T.matrix("Action_Grad")
        self._action_grad.tag.test_value = np.zeros(
            (self._batch_size, self._action_length),
            dtype=np.dtype(self.getSettings()['float_type']))

        self._action_grad_shared = theano.shared(
            np.zeros((self._batch_size, self._action_length),
                     dtype=self.getSettings()['float_type']))

        ### Maximize wrt q function

        self._action_mean_grads = T.grad(
            cost=None,
            wrt=self._actionParams,
            known_grads={self._q_valsActA: self._action_grad_shared})
        print("Action grads: ", self._action_mean_grads)
        ## T.grad already returns a proper list of gradient expressions when wrt is a list,
        ## so no extra unwrapping is needed here.
        # print ("isinstance(self._action_mean_grads, list): ", isinstance(self._action_mean_grads, list))
        # print ("Action grads: ", self._action_mean_grads)
        self._actionGRADUpdates = lasagne.updates.adam(
            self._action_mean_grads,
            self._actionParams,
            self._learning_rate,
            beta1=0.9,
            beta2=0.9,
            epsilon=self._rms_epsilon)

        self._actGradGivens = {
            self._model.getStateSymbolicVariable(): self._model.getStates(),
            # self._model.getResultStateSymbolicVariable(): self._model.getResultStates(),
            # self._model.getRewardSymbolicVariable(): self._model.getRewards(),
            # self._model.getActionSymbolicVariable(): self._model.getActions(),
            # self._Fallen: self._fallen_shared,
            # self._advantage: self._advantage_shared,
            # self._KL_Weight: self._kl_weight_shared
        }

        # theano.gradient.grad_clip(x, lower_bound, upper_bound) # // TODO
        # self._actionUpdates = lasagne.updates.adam(-T.mean(self._q_vals_train_policy) +
        #   (self._decay_weight * lasagne.regularization.regularize_network_params(
        #       self._model.getActorNetwork(), lasagne.regularization.l2)), self._actionParams,
        #           self._learning_rate, beta1=0.9, beta2=0.9, epsilon=self._rms_epsilon)

        if ('train_extra_value_function' in self.getSettings() and
            (self.getSettings()['train_extra_value_function'] == True)):
            self._valsA = lasagne.layers.get_output(
                self._model._value_function,
                self._model.getStateSymbolicVariable(),
                deterministic=True)
            self._valsA_drop = lasagne.layers.get_output(
                self._model._value_function,
                self._model.getStateSymbolicVariable(),
                deterministic=False)
            self._valsNextState = lasagne.layers.get_output(
                self._model._value_function,
                self._model.getResultStateSymbolicVariable(),
                deterministic=True)
            self._valsTargetNextState = lasagne.layers.get_output(
                self._modelTarget._value_function,
                self._model.getResultStateSymbolicVariable(),
                deterministic=True)
            self._valsTarget = lasagne.layers.get_output(
                self._modelTarget._value_function,
                self._model.getStateSymbolicVariable(),
                deterministic=True)

            # self._target = T.mul(T.add(self._model.getRewardSymbolicVariable(), T.mul(self._discount_factor, self._q_valsB )), self._Fallen)
            # self._target = self._model.getRewardSymbolicVariable() + ((self._discount_factor * self._q_valsTargetNextState ) * self._NotFallen) + (self._NotFallen - 1)
            self._v_target = self._model.getRewardSymbolicVariable() + (
                self._discount_factor * self._valsTargetNextState)
            self._v_diff = self._v_target - self._valsA
            # loss = 0.5 * self._diff ** 2
            loss_v = T.pow(self._v_diff, 2)
            self._v_loss = T.mean(loss_v)

            self._params_value = lasagne.layers.helper.get_all_params(
                self._model._value_function)
            self._givens_value = {
                self._model.getStateSymbolicVariable():
                self._model.getStates(),
                self._model.getResultStateSymbolicVariable():
                self._model.getResultStates(),
                self._model.getRewardSymbolicVariable():
                self._model.getRewards(),
                # self._NotFallen: self._NotFallen_shared
                # self._model.getActionSymbolicVariable(): self._actions_shared,
            }
            self._value_regularization = (
                self._critic_regularization_weight *
                lasagne.regularization.regularize_network_params(
                    self._model._value_function, lasagne.regularization.l2))

            self._value_grad = T.grad(
                self._v_loss + self._value_regularization, self._params_value)
            print("Optimizing Value Function with ",
                  self.getSettings()['optimizer'], " method")
            self._updates_value = lasagne.updates.adam(
                self._value_grad,
                self._params_value,
                self._critic_learning_rate,
                beta1=0.9,
                beta2=0.9,
                epsilon=self._rms_epsilon)
            ## TD update
        DPG.compile(self)
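
The docstring at the top of this example warns that, once the actor's output is wired into the critic, get_all_params on the critic also returns the actor's weights. The sketch below is a minimal, self-contained illustration of one way to keep the two updates apart; it is not code from the skylbc/SMBAE project, and the layer sizes, learning rates, and variable names are arbitrary assumptions. It filters the actor's parameters out of the critic update and then reuses the known_grads trick shown above for the actor step.

import theano
import theano.tensor as T
import lasagne
import lasagne.layers as nn

states = T.matrix('states')
actions = T.matrix('actions')
targets = T.col('targets')
action_grads = T.matrix('action_grads')  # dQ/da, supplied externally

l_state = nn.InputLayer((None, 4), input_var=states)
actor_out = nn.DenseLayer(l_state, num_units=2,
                          nonlinearity=lasagne.nonlinearities.tanh)

# The actor's output layer is concatenated straight into the critic, so the
# actor now lives inside the critic's layer graph.
critic_out = nn.DenseLayer(nn.ConcatLayer([l_state, actor_out], axis=1),
                           num_units=1,
                           nonlinearity=lasagne.nonlinearities.linear)

actor_params = nn.get_all_params(actor_out, trainable=True)
critic_params = [p for p in nn.get_all_params(critic_out, trainable=True)
                 if p not in set(actor_params)]  # drop the actor's weights

# Critic step: substitute real actions for the actor's output layer, so this
# update never touches the actor's parameters.
q_vals = nn.get_output(critic_out, {actor_out: actions})
critic_loss = T.mean((targets - q_vals) ** 2)
critic_updates = lasagne.updates.adam(critic_loss, critic_params, 1e-3)
train_critic = theano.function([states, actions, targets], critic_loss,
                               updates=critic_updates)

# Actor step: push externally computed dQ/da through the actor only.
a_pred = nn.get_output(actor_out)
actor_grad_list = T.grad(cost=None, wrt=actor_params,
                         known_grads={a_pred: action_grads})
actor_updates = lasagne.updates.adam(actor_grad_list, actor_params, 1e-4)
train_actor = theano.function([states, action_grads],
                              updates=actor_updates)

In practice the action_grads array would come from differentiating the critic's output with respect to its action input on the current batch, which is the role the action_grad_shared variable plays above.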
Example #51
0
d_rewards_var = TT.vector('d_rewards')
# policy.dist_info_sym returns a dictionary, whose values are symbolic expressions for quantities related to the
# distribution of the actions. For a Gaussian policy, it contains the mean and (log) standard deviation.
dist_info_vars = policy.dist_info_sym(observations_var)

surr = TT.sum(
    -dist.log_likelihood_sym_1traj_GPOMDP(actions_var, dist_info_vars) *
    d_rewards_var)

params = policy.get_params(trainable=True)

grad = theano.grad(surr, params)

eval_grad1 = TT.matrix('eval_grad0', dtype=grad[0].dtype)
eval_grad2 = TT.vector('eval_grad1', dtype=grad[1].dtype)
eval_grad3 = TT.col('eval_grad3', dtype=grad[2].dtype)
eval_grad4 = TT.vector('eval_grad4', dtype=grad[3].dtype)
eval_grad5 = TT.vector('eval_grad5', dtype=grad[4].dtype)

f_train = theano.function(
    inputs=[observations_var, actions_var, d_rewards_var], outputs=grad)
f_update = theano.function(
    inputs=[eval_grad1, eval_grad2, eval_grad3, eval_grad4, eval_grad5],
    outputs=None,
    updates=sgd([eval_grad1, eval_grad2, eval_grad3, eval_grad4, eval_grad5],
                params,
                learning_rate=learning_rate))

alla = []
for i in range(10):
    if (load_policy):
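
The two compiled functions above split policy-gradient training in half: f_train only returns the per-trajectory GPOMDP gradients, and f_update applies gradient arrays supplied from outside (for instance after averaging them over several trajectories), relying on the fact that lasagne.updates.sgd accepts a list of gradient expressions in place of a scalar loss. The sketch below reproduces that pattern on a toy least-squares model; the model, shapes, and learning rate are made up purely for illustration.

import numpy as np
import theano
import theano.tensor as TT
import lasagne

x = TT.matrix('x')
y = TT.vector('y')
W = theano.shared(np.zeros((3, 1), dtype=theano.config.floatX), name='W')
b = theano.shared(np.zeros(1, dtype=theano.config.floatX), name='b')
params = [W, b]

pred = TT.dot(x, W).flatten() + b[0]
loss = TT.mean((pred - y) ** 2)
grads = theano.grad(loss, params)

# Placeholders with the same dtypes as the gradients, mirroring eval_grad1..5.
g_W = TT.matrix('g_W', dtype=grads[0].dtype)
g_b = TT.vector('g_b', dtype=grads[1].dtype)

f_grad = theano.function([x, y], grads)
f_update = theano.function([g_W, g_b],
                           updates=lasagne.updates.sgd([g_W, g_b], params,
                                                       learning_rate=0.01))

# Typical use: compute gradients per trajectory, average them, apply once.
gW, gb = f_grad(np.ones((4, 3), dtype=theano.config.floatX),
                np.ones(4, dtype=theano.config.floatX))
f_update(gW, gb)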
Example #52
0
    def __init__(self, environment, rho, rms_epsilon, momentum, clip_delta, freeze_interval, batch_size, network_type, 
                 update_rule, batch_accumulator, randomState, DoubleQ=False, TheQNet=NN):
        """ Initialize environment
        
        """
        QNetwork.__init__(self,environment, batch_size)

        
        self.rho = rho
        self.rms_epsilon = rms_epsilon
        self.momentum = momentum
        self.clip_delta = clip_delta
        self.freeze_interval = freeze_interval
        self._DoubleQ = DoubleQ
        self._randomState = randomState
        

        self.update_counter = 0
        
        states=[]   # list of symbolic variables, one per element of the belief state
                    # --> [ T.tensor4 if the element's observation is a matrix, T.tensor3 if it is a vector, T.matrix if it is a scalar ]
        next_states=[] # same as states, but at t+1
        self.states_shared=[] # list of shared variables, one per element of the belief state
        self.next_states_shared=[] # same as self.states_shared, but at t+1

        for i, dim in enumerate(self._input_dimensions):
            if len(dim) == 3:
                states.append(T.tensor4("%s_%s" % ("state", i)))
                next_states.append(T.tensor4("%s_%s" % ("next_state", i)))

            elif len(dim) == 2:
                states.append(T.tensor3("%s_%s" % ("state", i)))
                next_states.append(T.tensor3("%s_%s" % ("next_state", i)))
                
            elif len(dim) == 1:            
                states.append( T.matrix("%s_%s" % ("state", i)) )
                next_states.append( T.matrix("%s_%s" % ("next_state", i)) )
                
            self.states_shared.append(theano.shared(np.zeros((batch_size,) + dim, dtype=theano.config.floatX) , borrow=False))
            self.next_states_shared.append(theano.shared(np.zeros((batch_size,) + dim, dtype=theano.config.floatX) , borrow=False))
        
        print("Number of observations per state: {}".format(len(self.states_shared)))
        print("For each observation, historySize + ponctualObs_i.shape: {}".format(self._input_dimensions))
                
        rewards = T.col('rewards')
        actions = T.icol('actions')
        terminals = T.icol('terminals')
        thediscount = T.scalar(name='thediscount', dtype=theano.config.floatX)
        thelr = T.scalar(name='thelr', dtype=theano.config.floatX)
        
        QNet=TheQNet(self._batch_size, self._input_dimensions, self._n_actions, self._randomState)
        self.q_vals, self.params, shape_after_conv = QNet._buildDQN(states)
        
        print("Number of neurons after spatial and temporal convolution layers: {}".format(shape_after_conv))

        self.next_q_vals, self.next_params, shape_after_conv = QNet._buildDQN(next_states)
        self._resetQHat()

        self.rewards_shared = theano.shared(
            np.zeros((batch_size, 1), dtype=theano.config.floatX),
            broadcastable=(False, True))

        self.actions_shared = theano.shared(
            np.zeros((batch_size, 1), dtype='int32'),
            broadcastable=(False, True))

        self.terminals_shared = theano.shared(
            np.zeros((batch_size, 1), dtype='int32'),
            broadcastable=(False, True))
        
        
        if(self._DoubleQ==True):
            givens_next={}
            for i, x in enumerate(self.next_states_shared):
                givens_next[ states[i] ] = x

            self.next_q_vals_current_qnet=theano.function([], self.q_vals,
                                          givens=givens_next)

            next_q_curr_qnet = theano.clone(self.next_q_vals)

            argmax_next_q_vals=T.argmax(next_q_curr_qnet, axis=1, keepdims=True)

            max_next_q_vals=self.next_q_vals[T.arange(batch_size),argmax_next_q_vals.reshape((-1,))].reshape((-1, 1))


        else:
            max_next_q_vals=T.max(self.next_q_vals, axis=1, keepdims=True)


        not_terminals = T.ones_like(terminals) - terminals

        target = rewards + not_terminals * thediscount * max_next_q_vals

        q_val=self.q_vals[T.arange(batch_size), actions.reshape((-1,))].reshape((-1, 1))
        # Note: strangely, (target - q_val) led to problems with python 3.5, theano 0.8.0rc and floatX=float32...
        diff = - q_val + target

        if self.clip_delta > 0:
            # This loss function implementation is taken from
            # https://github.com/spragunr/deep_q_rl
            # If we simply take the squared clipped diff as our loss,
            # then the gradient will be zero whenever the diff exceeds
            # the clip bounds. To avoid this, we extend the loss
            # linearly past the clip point to keep the gradient constant
            # in that regime.
            # 
            # This is equivalent to declaring d loss/d q_vals to be
            # equal to the clipped diff, then backpropagating from
            # there, which is what the DeepMind implementation does.
            quadratic_part = T.minimum(abs(diff), self.clip_delta)
            linear_part = abs(diff) - quadratic_part
            loss_ind = 0.5 * quadratic_part ** 2 + self.clip_delta * linear_part
        else:
            loss_ind = 0.5 * diff ** 2

        if batch_accumulator == 'sum':
            loss = T.sum(loss_ind)
        elif batch_accumulator == 'mean':
            loss = T.mean(loss_ind)
        else:
            raise ValueError("Bad accumulator: {}".format(batch_accumulator))

        givens = {
            rewards: self.rewards_shared,
            actions: self.actions_shared,  ## only used to index q_vals
            terminals: self.terminals_shared
        }
        
        for i, x in enumerate(self.states_shared):
            givens[ states[i] ] = x 
        for i, x in enumerate(self.next_states_shared):
            givens[ next_states[i] ] = x
                
                
        gparams=[]
        for p in self.params:
            gparam =  T.grad(loss, p)
            gparams.append(gparam)

        updates = []
        
        if update_rule == 'deepmind_rmsprop':
            updates = deepmind_rmsprop(loss, self.params, gparams, thelr, self.rho,
                                       self.rms_epsilon)
        elif update_rule == 'rmsprop':
            for i, (p, g) in enumerate(zip(self.params, gparams)):
                acc = theano.shared(p.get_value() * 0.)
                acc_new = self.rho * acc + (1 - self.rho) * g ** 2
                gradient_scaling = T.sqrt(acc_new + self.rms_epsilon)
                g = g / gradient_scaling
                updates.append((acc, acc_new))
                updates.append((p, p - thelr * g))

        elif update_rule == 'sgd':
            for i, (param, gparam) in enumerate(zip(self.params, gparams)):
                updates.append((param, param - thelr * gparam))
        else:
            raise ValueError("Unrecognized update: {}".format(update_rule))
    
        
        if(self._DoubleQ==True):
            self._train = theano.function([thediscount, thelr, next_q_curr_qnet], [loss, loss_ind, self.q_vals], updates=updates,
                                      givens=givens,
                                      on_unused_input='warn')
        else:
            self._train = theano.function([thediscount, thelr], [loss, loss_ind, self.q_vals], updates=updates,
                                      givens=givens,
                                      on_unused_input='warn')
        givens2={}
        for i, x in enumerate(self.states_shared):
            givens2[ states[i] ] = x 

        self._q_vals = theano.function([], self.q_vals,
                                      givens=givens2,
                                      on_unused_input='warn')
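
The comment block in this example explains why the loss is extended linearly past clip_delta instead of simply squaring a clipped difference: a clipped-then-squared loss is flat, and therefore has zero gradient, once |diff| exceeds the clip bound, while the quadratic-plus-linear form keeps a constant gradient there. A small NumPy illustration of the two variants (written for this page, not taken from the repository) makes that concrete.

import numpy as np

def clipped_squared_loss(diff, clip_delta):
    # gradient w.r.t. diff is exactly zero once |diff| > clip_delta
    d = np.clip(diff, -clip_delta, clip_delta)
    return 0.5 * d ** 2

def huber_like_loss(diff, clip_delta):
    # quadratic near zero, linear beyond clip_delta: the slope saturates at clip_delta
    quadratic_part = np.minimum(np.abs(diff), clip_delta)
    linear_part = np.abs(diff) - quadratic_part
    return 0.5 * quadratic_part ** 2 + clip_delta * linear_part

diffs = np.array([-3.0, -1.0, -0.2, 0.2, 1.0, 3.0])
print(clipped_squared_loss(diffs, 1.0))  # flat at 0.5 for every |diff| >= 1
print(huber_like_loss(diffs, 1.0))       # keeps growing linearly for |diff| >= 1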
Example #53
0
   def __init__(self, datasetPaths = None, keyPoints = None):
   
      self.xMax = 1024.0
      self.yMax = 576.0
      self.numKeyPoints = 68
      self.loadPicasaTubePickle()
   
      loadPrev = 1

      if loadPrev == 1:
         pkl_file = open('faceAlign2.pkl', 'rb')
         self.pose, self.landmarks, self.poseDict, self.images, self.poseCat = pickle.load(pkl_file)
         pkl_file.close()
      else:
         self.loadData()
         output = open('faceAlign2.pkl', 'wb')
         data = (self.pose, self.landmarks, self.poseDict, self.images, self.poseCat)
         pickle.dump(data, output)
         output.close()
         return 

      self.eeta = 0.0000001
      self.mu = theano.shared(10 * numpy.random.random((2*self.numKeyPoints, 1)))
      self.S = theano.shared(numpy.eye(2 * self.numKeyPoints))
      self.alpha = theano.shared(0.1 * numpy.ones((2 * self.numKeyPoints,1)))
      theano.config.compute_test_value = 'warn'
      oneCol = T.col('oneCol')
      oneCol.tag.test_value = numpy.ones((self.numKeyPoints,1))
      pi_t = T.col('pi_t')
      pi_t.tag.test_value = numpy.random.random((2*self.numKeyPoints,1))
      temp = numpy.random.random((3,3))
      #temp = numpy.zeros((3,3))
      temp[2,:] = [0,0,1]
      self.A_t = theano.shared(temp, name='A_t')
      #print_A_t = theano.printing.Print('r_t1')(A_t)
      z_t = T.col('z_t')
      z_t.tag.test_value = numpy.random.random((2*self.numKeyPoints,1))
      z_t1 = z_t.reshape((self.numKeyPoints, 2))

      pts = T.concatenate((z_t1, oneCol), axis=1)
#      pts = theano.printing.Print('pts')(pts)
      r_t = T.dot(self.A_t, pts.transpose()).transpose()
      r_t1 = r_t[:,0:2].reshape((2*self.numKeyPoints,1))
      #pi_tt = theano.printing.Print('pi_t before')(pi_t)
      diff = pi_t * (r_t1 - self.mu)
      difft = diff.reshape((1, 2 * self.numKeyPoints))
      #diff = theano.printing.Print('diff:')(diff)
      cost = T.max(T.dot(T.dot(difft,self.S),diff))
      #cost = theano.printing.Print('cost:')(cost)
      A_t_grad = T.grad(cost=cost, wrt=self.A_t)
      A_t_grad = T.basic.set_subtensor(A_t_grad[2,:],0)
      #A_t_grad = theano.printing.Print('r_t1')(A_t_grad)
      update = (self.A_t, self.A_t - self.eeta * A_t_grad)
      self.align = theano.function(inputs=[pi_t,z_t, oneCol],
                                   outputs=[self.A_t, cost],
                                   updates=[update],
                                   on_unused_input='warn',
                                   allow_input_downcast=True)
      
      #for numpy optimization
      A_t_ = T.matrix('A_t_')
      #A_t_.tag.test_value = temp
      #A_t_ = A_t_.reshape((3,3))
      A_t_.tag.test_value = temp
      #print_A_t = theano.printing.Print('r_t1')(A_t)
      r_t_ = T.dot(A_t_, pts.transpose()).transpose()
      r_t1_ = r_t_[:,0:2].reshape((2*self.numKeyPoints,1))
      #pi_tt = theano.printing.Print('pi_t before')(pi_t)
      diff_ = pi_t * (r_t1_ - self.mu)
      difft_ = diff_.reshape((1, 2 * self.numKeyPoints))
      
      #diff = theano.printing.Print('diff:')(diff)
      cost_1 = T.dot(T.dot(difft_,self.S),diff_)
      #cost_1 = theano.printing.Print('cost is:')(cost_1)
      cost_ = T.max(cost_1)
      
      A_t_grad_ = T.grad(cost=cost_, wrt=A_t_)
      A_t_grad_ = T.basic.set_subtensor(A_t_grad_[2,:],0)
      #A_t_grad_ = A_t_grad_.reshape((9,1))

      self.cost = theano.function(inputs=[A_t_, pi_t, z_t, oneCol],
                                  outputs=[cost_, A_t_grad_])
      i = T.iscalar('index')
      i.tag.test_value = 0
      subS = self.S[2*i:2*i+2, 2*i:2*i+2]
      #subS = theano.printing.Print('subS:')(self.S[2*i:2*i+2, 2*i:2*i+2])
      det = T.abs_(subS[0,0]*subS[1,1] - subS[0,1]*subS[1,0])
      subDiff = diff[(2*i):((2*i)+2)]
      subDifft = difft[0][(2*i):(2*i+2)]
      #intermed = theano.printing.Print('dotProd1:')(T.dot(subDifft,subS))
      intermed = T.dot(subDifft,subS)
      #intermed2 = theano.printing.Print('dotProd2:')(T.dot(intermed,subDiff))
      intermed2 = T.dot(intermed,subDiff)
      numrtr = T.exp(-0.5 * intermed2)
      k  = 2 
      dnmntr = T.sqrt((2**k) * det)
      q = numrtr/dnmntr
      temp = ((1 - self.alpha[2*i:2*i+2]) * q)/(self.alpha[2*i:2*i+2] + (1 - self.alpha[2*i:2*i+2]) * q)
      pi_t_out = T.basic.set_subtensor(pi_t[2*i:2*i+2], temp)
      self.q_pi_update = theano.function(inputs = [i, oneCol, pi_t, z_t], 
                                        outputs = [q,pi_t_out, r_t1],
                                        allow_input_downcast=True)
     
      self.train('12')
Example #54
0
    def __init__(self, input_width, input_height, num_actions,
                 num_frames, discount, learning_rate, rho,
                 rms_epsilon, momentum, clip_delta, freeze_interval,
                 batch_size, update_rule,
                 batch_accumulator, state_count, input_scale=255.0):
                     
        self.state_count=state_count
        self.input_width = input_width
        self.input_height = input_height
        self.num_actions = num_actions
        self.num_frames = num_frames
        self.batch_size = batch_size
        self.discount = discount
        self.rho = rho
        self.lr = learning_rate
        self.rms_epsilon = rms_epsilon
        self.momentum = momentum
        self.clip_delta = clip_delta
        self.freeze_interval = freeze_interval

        self.update_counter = 0
        
        self.l_out = self.build_nature_network_dnn(input_width, input_height,
                                        num_actions, num_frames, batch_size)
        
        if self.freeze_interval > 0:
            self.next_l_out = self.build_nature_network_dnn(input_width,
                                                 input_height, num_actions,
                                                 num_frames, batch_size)
            self.reset_q_hat()

        states = T.matrix('states')
        next_states = T.matrix('next_states')
        rewards = T.col('rewards')
        actions = T.icol('actions')
        terminals = T.icol('terminals')

# buffer holding the input states for the whole batch
        self.states_shared = theano.shared(
            np.zeros((batch_size, state_count),
                     dtype=theano.config.floatX))

# buffer for the next state reached, for every sample
        self.next_states_shared = theano.shared(
            np.zeros((batch_size, state_count),
                     dtype=theano.config.floatX))

# one reward per episode; but what about individual actions?
        self.rewards_shared = theano.shared(
            np.zeros((batch_size, 1), dtype=theano.config.floatX),
            broadcastable=(False, True))

# one action taken per episode
        self.actions_shared = theano.shared(
            np.zeros((batch_size, 1), dtype='int32'),
            broadcastable=(False, True))

# ?? probably 0 or 1, marking whether this is the last value or not
        self.terminals_shared = theano.shared(
            np.zeros((batch_size, 1), dtype='int32'),
            broadcastable=(False, True))

# takes q_vals and next q_vals and returns the differences for the batch, all only for the first pass

        q_vals = lasagne.layers.get_output(self.l_out, states / input_scale)
        if self.freeze_interval > 0:
            next_q_vals = lasagne.layers.get_output(self.next_l_out,
                                                    next_states / input_scale)
        else:
            next_q_vals = lasagne.layers.get_output(self.l_out,
                                                    next_states / input_scale)
            next_q_vals = theano.gradient.disconnected_grad(next_q_vals)

        target = (rewards +
                  (T.ones_like(terminals) - terminals) *
                  self.discount * T.max(next_q_vals, axis=1, keepdims=True))
        diff = target - q_vals[T.arange(batch_size),
                               actions.reshape((-1,))].reshape((-1, 1))

# unclear
        if self.clip_delta > 0:
            diff = diff.clip(-self.clip_delta, self.clip_delta)

        if batch_accumulator == 'sum':
            loss = T.sum(diff ** 2)
        elif batch_accumulator == 'mean':
            loss = T.mean(diff ** 2)
        else:
            raise ValueError("Bad accumulator: {}".format(batch_accumulator))


#
        params = lasagne.layers.helper.get_all_params(self.l_out)
        givens = {
            states: self.states_shared,
            next_states: self.next_states_shared,
            rewards: self.rewards_shared,
            actions: self.actions_shared,
            terminals: self.terminals_shared
        }
        if update_rule == 'deepmind_rmsprop':
            updates = deepmind_rmsprop(loss, params, self.lr, self.rho,
                                       self.rms_epsilon)
        elif update_rule == 'rmsprop':
            updates = lasagne.updates.rmsprop(loss, params, self.lr, self.rho,
                                              self.rms_epsilon)

        elif update_rule == 'adam':
            updates = lasagne.updates.adam(loss, params, self.lr,
                                           self.rho, self.rho,
                                           self.rms_epsilon)
                                              
        elif update_rule == 'adagrad':
            updates = lasagne.updates.adagrad(loss, params, self.lr,
                                              self.rms_epsilon)
        elif update_rule == 'sgd':
            updates = lasagne.updates.sgd(loss, params, self.lr)
            
        elif update_rule == 'momentum':
            updates = lasagne.updates.momentum(loss, params, self.lr, self.momentum)

        else:
            raise ValueError("Unrecognized update: {}".format(update_rule))

        if self.momentum > 0:
            updates = lasagne.updates.apply_momentum(updates, None,
                                                     self.momentum)

        self._train = theano.function([], [loss, q_vals], updates=updates,
                                      givens=givens)
        self._q_vals = theano.function([], q_vals,
                                       givens={states: self.states_shared})
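
Both this example and Example #49 drive the compiled _train function entirely through givens: the function takes no explicit inputs, so a training step first has to copy the minibatch into the shared variables. The helper below is a hedged sketch of that calling pattern; the snippets do not show their own train methods, so the helper name and its return value are assumptions, but the dtypes match the shared variables declared above.

import theano

def train_on_batch(net, states, actions, rewards, next_states, terminals):
    """Copy one minibatch into the shared buffers, then run the compiled update.

    Expected shapes for this example:
      states, next_states : (batch_size, state_count), floatX
      actions, terminals  : (batch_size, 1), int32
      rewards             : (batch_size, 1), floatX
    """
    net.states_shared.set_value(states.astype(theano.config.floatX))
    net.next_states_shared.set_value(next_states.astype(theano.config.floatX))
    net.actions_shared.set_value(actions.astype('int32'))
    net.rewards_shared.set_value(rewards.astype(theano.config.floatX))
    net.terminals_shared.set_value(terminals.astype('int32'))
    loss, q_vals = net._train()   # everything is read through `givens`
    net.update_counter += 1
    return loss, q_vals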