Example #1
    def __init__(self, K, vocab_size, num_chars, W_init, 
            nhidden, embed_dim, dropout, train_emb, char_dim, use_feat, gating_fn, 
            save_attn=False):
        self.nhidden = nhidden
        self.embed_dim = embed_dim
        self.dropout = dropout
        self.train_emb = train_emb
        self.char_dim = char_dim
        self.learning_rate = LEARNING_RATE
        self.num_chars = num_chars
        self.use_feat = use_feat
        self.save_attn = save_attn
        self.gating_fn = gating_fn

        self.use_chars = self.char_dim != 0
        if W_init is None: W_init = lasagne.init.GlorotNormal().sample((vocab_size, self.embed_dim))

        doc_var, query_var, cand_var = T.itensor3('doc'), T.itensor3('quer'), \
                T.wtensor3('cand')
        docmask_var, qmask_var, candmask_var = T.bmatrix('doc_mask'), T.bmatrix('q_mask'), \
                T.bmatrix('c_mask')
        target_var = T.ivector('ans')
        feat_var = T.imatrix('feat')
        doc_toks, qry_toks = T.imatrix('dchars'), T.imatrix('qchars')
        tok_var, tok_mask = T.imatrix('tok'), T.bmatrix('tok_mask')
        cloze_var = T.ivector('cloze')
        self.inps = [doc_var, doc_toks, query_var, qry_toks, cand_var, target_var, docmask_var,
                qmask_var, tok_var, tok_mask, candmask_var, feat_var, cloze_var]

        self.predicted_probs, predicted_probs_val, self.network, W_emb, attentions = (
                self.build_network(K, vocab_size, W_init))

        self.loss_fn = T.nnet.categorical_crossentropy(self.predicted_probs, target_var).mean()
        self.eval_fn = lasagne.objectives.categorical_accuracy(self.predicted_probs, 
                target_var).mean()

        loss_fn_val = T.nnet.categorical_crossentropy(predicted_probs_val, target_var).mean()
        eval_fn_val = lasagne.objectives.categorical_accuracy(predicted_probs_val, 
                target_var).mean()

        self.params = L.get_all_params(self.network, trainable=True)
        
        updates = lasagne.updates.adam(self.loss_fn, self.params, learning_rate=self.learning_rate)

        self.train_fn = theano.function(self.inps,
                [self.loss_fn, self.eval_fn, self.predicted_probs], 
                updates=updates,
                on_unused_input='warn')
        self.validate_fn = theano.function(self.inps, 
                [loss_fn_val, eval_fn_val, predicted_probs_val]+attentions,
                on_unused_input='warn')
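For reference, the following is a minimal, self-contained sketch of the same compile pattern used above (stochastic vs. deterministic outputs, categorical cross-entropy, adam updates, and a paired train/validate function); the toy softmax model, its sizes, and all variable names are assumptions for illustration and are not part of the example's codebase.

import numpy as np
import theano
import theano.tensor as T
import lasagne
import lasagne.layers as L

x = T.fmatrix('x')          # (batch, features)
y = T.ivector('y')          # integer class labels

l_in = L.InputLayer((None, 20), input_var=x)
l_out = L.DenseLayer(l_in, num_units=5,
                     nonlinearity=lasagne.nonlinearities.softmax)

probs = L.get_output(l_out)                          # training-time output
probs_val = L.get_output(l_out, deterministic=True)  # validation-time output

loss = T.nnet.categorical_crossentropy(probs, y).mean()
acc = lasagne.objectives.categorical_accuracy(probs, y).mean()
loss_val = T.nnet.categorical_crossentropy(probs_val, y).mean()
acc_val = lasagne.objectives.categorical_accuracy(probs_val, y).mean()

params = L.get_all_params(l_out, trainable=True)
updates = lasagne.updates.adam(loss, params, learning_rate=1e-3)

train_fn = theano.function([x, y], [loss, acc, probs], updates=updates)
validate_fn = theano.function([x, y], [loss_val, acc_val, probs_val])

# One training step on random data:
xb = np.random.rand(8, 20).astype('float32')
yb = np.random.randint(0, 5, size=8).astype('int32')
print(train_fn(xb, yb)[0])   # loss after one adam step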
Example #2
    def __init__(self, K, vocab_size, W_init, regularizer, rlambda, nhidden,
                 embed_dim, dropout, train_emb, subsample):
        self.nhidden = nhidden
        self.embed_dim = embed_dim
        self.dropout = dropout
        self.train_emb = train_emb
        self.subsample = subsample
        norm = lasagne.regularization.l2 if regularizer == 'l2' else lasagne.regularization.l1
        if W_init is None:
            W_init = lasagne.init.GlorotNormal().sample(
                (vocab_size, self.embed_dim))

        doc_var, query_var, cand_var = T.itensor3('doc'), T.itensor3(
            'quer'), T.wtensor3('cand')
        docmask_var, qmask_var, candmask_var = T.bmatrix('doc_mask'), T.bmatrix('q_mask'), \
                T.bmatrix('c_mask')
        target_var = T.ivector('ans')

        if rlambda > 0.:
            W_pert = W_init + lasagne.init.GlorotNormal().sample(W_init.shape)
        else:
            W_pert = W_init
        predicted_probs, predicted_probs_val, self.doc_net, self.q_net, W_emb = self.build_network(
            K, vocab_size, doc_var, query_var, cand_var, docmask_var,
            qmask_var, candmask_var, W_pert)

        loss_fn = T.nnet.categorical_crossentropy(predicted_probs, target_var).mean() + \
                rlambda*norm(W_emb-W_init)
        eval_fn = lasagne.objectives.categorical_accuracy(
            predicted_probs, target_var).mean()

        loss_fn_val = T.nnet.categorical_crossentropy(predicted_probs_val, target_var).mean() + \
                rlambda*norm(W_emb-W_init)
        eval_fn_val = lasagne.objectives.categorical_accuracy(
            predicted_probs_val, target_var).mean()

        params = L.get_all_params(self.doc_net, trainable=True) + \
                L.get_all_params(self.q_net, trainable=True)

        updates = lasagne.updates.adam(loss_fn,
                                       params,
                                       learning_rate=LEARNING_RATE)

        self.train_fn = theano.function([doc_var, query_var, cand_var, target_var, docmask_var, \
                qmask_var, candmask_var],
                [loss_fn, eval_fn, predicted_probs],
                updates=updates)
        self.validate_fn = theano.function([doc_var, query_var, cand_var, target_var, docmask_var, \
                qmask_var, candmask_var],
                [loss_fn_val, eval_fn_val, predicted_probs_val])
Example #3
    def __init__(self, model):
        """ Initialize the filtered stim model
        """
        self.model = model
        self.prms = model['network']['graph']
        N = model['N']

        self.rho = self.prms['rho'] * np.ones((N, N))

        if 'rho_refractory' in self.prms:
            self.rho[np.diag_indices(N)] = self.prms['rho_refractory']

        self.pA = theano.shared(value=self.rho, name='pA')

        # Define complete adjacency matrix
        self.A = T.bmatrix('A')

        # Allow for scaling the log likelihood of the graph so that we can do
        # Annealed importance sampling
        self.lkhd_scale = theano.shared(value=1.0, name='lkhd_scale')

        # Define log probability
        self.lkhd = T.sum(self.A * np.log(np.minimum(1.0 - 1e-8, self.rho)) +
                          (1 - self.A) *
                          np.log(np.maximum(1e-8, 1.0 - self.rho)))

        self.log_p = self.lkhd_scale * self.lkhd
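As a side note, the likelihood above is an independent Bernoulli log-likelihood over the entries of A with clipped probabilities; a small numpy-only sketch with made-up sizes and values:

import numpy as np

N = 3
rho = 0.1 * np.ones((N, N))                 # per-edge connection probability
rho[np.diag_indices(N)] = 0.5               # e.g. a 'rho_refractory' self-connection probability
A = np.array([[1, 0, 0],
              [0, 0, 1],
              [1, 1, 0]], dtype=np.int8)    # an observed adjacency matrix

# Clipping keeps log() finite when rho is exactly 0 or 1, as in self.lkhd above.
lkhd = np.sum(A * np.log(np.minimum(1.0 - 1e-8, rho)) +
              (1 - A) * np.log(np.maximum(1e-8, 1.0 - rho)))
print(lkhd)   # sum over all N*N Bernoulli terms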
Example #4
def test_local_gpu_elemwise_0():
    """
    Test local_gpu_elemwise_0 when there is a dtype upcastable to float32
    """
    a = tensor.bmatrix()
    b = tensor.fmatrix()
    c = tensor.fmatrix()

    a_v = (numpy.random.rand(4, 5) * 10).astype("int8")
    b_v = (numpy.random.rand(4, 5) * 10).astype("float32")
    c_v = (numpy.random.rand(4, 5) * 10).astype("float32")

    # Due to optimization order, this composite is created when all
    # the op are on the gpu.
    f = theano.function([a, b, c], [a + b + c], mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert sum(isinstance(node.op, cuda.GpuElemwise) for node in topo) == 1
    assert sum(isinstance(node.op, tensor.Elemwise) for node in topo) == 1
    f(a_v, b_v, c_v)

    # Now test with the composite already on the cpu before we move it
    # to the gpu
    a_s = theano.scalar.int8()
    b_s = theano.scalar.float32()
    c_s = theano.scalar.float32()
    out_s = theano.scalar.Composite([a_s, b_s, c_s], [a_s + b_s + c_s])
    out_op = tensor.Elemwise(out_s)
    f = theano.function([a, b, c], [out_op(a, b, c)], mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert sum(isinstance(node.op, cuda.GpuElemwise) for node in topo) == 1
    assert sum(isinstance(node.op, tensor.Elemwise) for node in topo) == 1
    f(a_v, b_v, c_v)
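A CPU-only sketch of the same graph-inspection idea, compiling a mixed int8/float32 elemwise sum and reading the optimized graph via toposort; mode_with_gpu and the cuda ops are omitted here, so the exact op counts asserted above do not apply.

import numpy
import theano
import theano.tensor as tensor

a = tensor.bmatrix()
b = tensor.fmatrix()
c = tensor.fmatrix()

# int8 + float32 + float32 upcasts to float32; under the default optimizations the
# additions are typically fused into a single Composite Elemwise node.
f = theano.function([a, b, c], a + b + c)
topo = f.maker.fgraph.toposort()
print([type(node.op).__name__ for node in topo])

a_v = (numpy.random.rand(4, 5) * 10).astype('int8')
b_v = numpy.random.rand(4, 5).astype('float32')
c_v = numpy.random.rand(4, 5).astype('float32')
print(f(a_v, b_v, c_v).dtype)   # float32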
Example #5
def BuildModel(modelSpecs, forTrain=True):
    rng = np.random.RandomState()

    ## x is for sequential features
    x = T.tensor3('x')

    ## mask for x
    xmask = T.bmatrix('xmask')
    propertyPredictor = ResNet4Properties( rng, seqInput=x, mask_seq=xmask, modelSpecs=modelSpecs )

    ## labelList is a list of label matrices, each with shape (batchSize, seqLen, numLabels)
    labelList = []
    if forTrain:
        ## when this model is used for training, we need to define the label variables
        for res in modelSpecs['responses']:
            labelType = Response2LabelType(res)
            if labelType.startswith('Discrete'):
                labelList.append( T.itensor3('label4' + res) )
            else:
                labelList.append( T.tensor3('label4' + res) )

    ## weightList is a list of label weight matrices, each with shape (batchSize, seqLen, 1)
    ## we always use weight to deal with residues without 3D coordinates
    weightList = []
    if len(labelList) > 0:
        weightList = [ T.tensor3('weight4' + res) for res in modelSpecs['responses'] ]

    if len(labelList) > 0:
        return propertyPredictor, x, xmask, labelList, weightList
    else:
        return propertyPredictor, x, xmask
Example #6
    def __init__(self, nodes_per_layer, act_funcs, err_func, backprop_func, backprop_params,
                 l_rate=.001, batch_size=100):
        """
        nodes_per_layer - number of nodes per layer, including input and output layers
        act_funcs - list of activation functions between the layers
        err_func - cost/error function
        backprop_func - backpropagation function
        l_rate - learning rate
        batch_size - (mini-)batch size
        """
        assert len(nodes_per_layer)-1 == len(act_funcs), \
            ("Invalid number of activation functions compared to the number of hidden layers",
             len(nodes_per_layer), len(act_funcs))
        super(FFNet, self).__init__('FFNet', l_rate, batch_size)
        logging.info('\tConstructing FFNet with nodes per layer: %s, learning rate: %s ', nodes_per_layer, l_rate)

        input_data = T.fmatrix('X')
        input_labels = T.bmatrix('Y')
        layers = [input_data]

        # Generate initial random weights between each layer
        weights = []
        for i in range(len(nodes_per_layer)-1):
            weights.append(init_rand_weights((nodes_per_layer[i], nodes_per_layer[i+1])))
            weights[i].name = 'w' + str(i)

        # logging.debug('\tWeight layers: %s', len(weights))
        #logging.info('\tNumber of parameters to train: %s',
        #             sum(param.get_value(borrow=True, return_internal_type=True).size for param in weights))
        # Construct the layers with the given activation functions weights between them
        # logging.info('\tConstructing layers ...')

        for i in range(len(weights)):
            layers.append(self.model(layers[i], weights[i], act_funcs[i]))

        for i in range(1, len(layers)):
            layers[i].name = 'l' + str(i)

        output_layer = layers[-1]
        cost = err_func(output_layer, input_labels)
        updates = backprop_func(cost, weights, self.l_rate, **backprop_params)

        prediction = T.argmax(output_layer, axis=1)
        prediction_value = T.max(output_layer, axis=1)

        # logging.info('\tConstructing functions ...')
        self.trainer = theano.function(
            inputs=[input_data, input_labels],
            outputs=cost,
            updates=updates,
            name='Trainer',
            allow_input_downcast=True  # Allows float64 to be cast to float32, which is necessary in order to use the GPU
        )
        self.predictor = theano.function(
            inputs=[input_data],
            outputs={'char_as_int': prediction,
                     'char_probability': prediction_value,
                     'output_layer': output_layer},
            name='Predictor',
            allow_input_downcast=True
        )
Example #7
    def __init__(self, model, latent):
        """ Initialize the stochastic block model for the adjacency matrix
        """
        self.model = model
        self.latent = latent
        self.prms = model['network']['graph']
        self.N = model['N']

        # Get the number of latent types (R) and the latent type vector (Y)
        self.type_name = self.prms['types']
        self.R = self.latent[self.type_name].R
        self.Y = self.latent[self.type_name].Y

        # A RxR matrix of connection probabilities per pair of clusters
        self.B = T.dmatrix('B')

        # For indexing, we also need Y as a column vector and tiled matrix
        self.Yv = T.reshape(self.Y, [self.N, 1])
        self.Ym = T.tile(self.Yv, [1, self.N])
        self.pA = self.B[self.Ym, T.transpose(self.Ym)]

        # Hyperparameters governing B and alpha
        self.b0 = self.prms['b0']
        self.b1 = self.prms['b1']

        # Define complete adjacency matrix
        self.A = T.bmatrix('A')

        # Define log probability
        log_p_B = T.sum((self.b0 - 1) * T.log(self.B) + (self.b1 - 1) * T.log(1 - self.B))
        log_p_A = T.sum(self.A * T.log(self.pA) + (1 - self.A) * T.log(1 - self.pA))

        self.log_p = log_p_B + log_p_A
Example #8
    def __init__(self, model, latent):
        """ Initialize the stochastic block model for the adjacency matrix
        """
        self.model = model
        self.latent = latent
        self.prms = model['network']['graph']
        self.N = model['N']

        # Get the number of latent types (R) and the latent type vector (Y)
        self.type_name = self.prms['types']
        self.R = self.latent[self.type_name].R
        self.Y = self.latent[self.type_name].Y

        # A RxR matrix of connection probabilities per pair of clusters
        self.B = T.dmatrix('B')

        # For indexing, we also need Y as a column vector and tiled matrix
        self.Yv = T.reshape(self.Y, [self.N, 1])
        self.Ym = T.tile(self.Yv, [1, self.N])
        self.pA = self.B[self.Ym, T.transpose(self.Ym)]

        # Hyperparameters governing B and alpha
        self.b0 = self.prms['b0']
        self.b1 = self.prms['b1']

        # Define complete adjacency matrix
        self.A = T.bmatrix('A')

        # Define log probability
        log_p_B = T.sum((self.b0 - 1) * T.log(self.B) +
                        (self.b1 - 1) * T.log(1 - self.B))
        log_p_A = T.sum(self.A * T.log(self.pA) +
                        (1 - self.A) * T.log(1 - self.pA))

        self.log_p = log_p_B + log_p_A
Example #9
    def __init__(self, model):
        """ Initialize the stochastic block model for the adjacency matrix
        """
        self.model = model
        self.prms = model['network']['graph']
        self.N = model['N']

        # SBM has R latent clusters
        self.R = self.prms['R']
        # A RxR matrix of connection probabilities per pair of clusters
        self.B = T.dmatrix('B')
        # SBM has a latent block or cluster assignment for each node
        self.Y = T.lvector('Y')
        # For indexing, we also need Y as a column vector and tiled matrix
        self.Yv = T.reshape(self.Y, [self.N, 1])
        self.Ym = T.tile(self.Yv, [1, self.N])
        self.pA = self.B[self.Ym, T.transpose(self.Ym)]

        # A probability of each cluster
        self.alpha = T.dvector('alpha')

        # Hyperparameters governing B and alpha
        self.b0 = self.prms['b0']
        self.b1 = self.prms['b1']
        self.alpha0 = self.prms['alpha0']

        # Define complete adjacency matrix
        self.A = T.bmatrix('A')

        # Define log probability
        log_p_B = T.sum((self.b0 - 1) * T.log(self.B) + (self.b1 - 1) * T.log(1 - self.B))
        log_p_alpha = T.sum((self.alpha0 - 1) * T.log(self.alpha))
        log_p_A = T.sum(self.A * T.log(self.pA) + (1 - self.A) * T.log(1 - self.pA))

        self.log_p = log_p_B + log_p_alpha + log_p_A
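For intuition, a numpy-only sketch of the three terms summed into log_p above (a Beta-like term over B, a Dirichlet-like term over alpha, and the Bernoulli likelihood of A); sizes, hyperparameters, and values are made up.

import numpy as np

N, R = 4, 2
Y = np.array([0, 0, 1, 1])                    # latent block assignment per node
B = np.array([[0.8, 0.1],                     # RxR block connection probabilities
              [0.2, 0.6]])
alpha = np.array([0.5, 0.5])                  # block probabilities
b0, b1, alpha0 = 2.0, 2.0, 2.0                # hyperparameters, as in self.prms

pA = B[Y[:, None], Y[None, :]]                # same indexing as B[Ym, Ym.T] above
A = (np.random.rand(N, N) < pA).astype(np.int8)

log_p_B = np.sum((b0 - 1) * np.log(B) + (b1 - 1) * np.log(1 - B))
log_p_alpha = np.sum((alpha0 - 1) * np.log(alpha))
log_p_A = np.sum(A * np.log(pA) + (1 - A) * np.log(1 - pA))
print(log_p_B + log_p_alpha + log_p_A)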
Example #10
 def test_illegal_things(self):
     i0 = TT.iscalar()
     i1 = TT.lvector()
     i2 = TT.bmatrix()
     self.failUnlessRaises(TypeError, FAS, [i1, slice(None, i2, -1), i0])
     self.failUnlessRaises(TypeError, FAS, [i1, slice(None, None, i2), i0])
     self.failUnlessRaises(TypeError, FAS, [i1, slice(i2, None, -1), i0])
Example #11
def test_local_gpu_elemwise_0():
    """
    Test local_gpu_elemwise_0 when there is a dtype upcastable to float32
    """
    a = tensor.bmatrix()
    b = tensor.fmatrix()
    c = tensor.fmatrix()

    a_v = (numpy.random.rand(4, 5) * 10).astype("int8")
    b_v = (numpy.random.rand(4, 5) * 10).astype("float32")
    c_v = (numpy.random.rand(4, 5) * 10).astype("float32")

    # Due to optimization order, this composite is created when all
    # the op are on the gpu.
    f = theano.function([a, b, c], [a + b + c], mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert sum(isinstance(node.op, cuda.GpuElemwise) for node in topo) == 1
    assert sum(isinstance(node.op, tensor.Elemwise) for node in topo) == 1
    f(a_v, b_v, c_v)

    # Now test with the composite already on the cpu before we move it
    # to the gpu
    a_s = theano.scalar.int8()
    b_s = theano.scalar.float32()
    c_s = theano.scalar.float32()
    out_s = theano.scalar.Composite([a_s, b_s, c_s], [a_s + b_s + c_s])
    out_op = tensor.Elemwise(out_s)
    f = theano.function([a, b, c], [out_op(a, b, c)], mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert sum(isinstance(node.op, cuda.GpuElemwise) for node in topo) == 1
    assert sum(isinstance(node.op, tensor.Elemwise) for node in topo) == 1
    f(a_v, b_v, c_v)
Example #12
    def __init__(self, model):
        """ Initialize the filtered stim model
        """
        self.model = model
        self.prms = model['network']['graph']
        N = model['N']

        self.rho = self.prms['rho'] * np.ones((N, N))

        if 'rho_refractory' in self.prms:
            self.rho[np.diag_indices(N)] = self.prms['rho_refractory']

        self.pA = theano.shared(value=self.rho, name='pA')

        # Define complete adjacency matrix
        self.A = T.bmatrix('A')

        # Allow for scaling the log likelihood of the graph so that we can do
        # Annealed importance sampling
        self.lkhd_scale = theano.shared(value=1.0, name='lkhd_scale')


        # Define log probability
        self.lkhd = T.sum(self.A * np.log(np.minimum(1.0-1e-8, self.rho)) +
                           (1 - self.A) * np.log(np.maximum(1e-8, 1.0 - self.rho)))

        self.log_p = self.lkhd_scale * self.lkhd
Example #13
def ndim_btensor(ndim, name=None):
    if ndim == 2:
        return T.bmatrix(name)
    elif ndim == 3:
        return T.btensor3(name)
    elif ndim == 4:
        return T.btensor4(name)
    return T.imatrix(name)
Example #14
def ndim_btensor(ndim, name=None):
    if ndim == 2:
        return T.bmatrix(name)
    elif ndim == 3:
        return T.btensor3(name)
    elif ndim == 4:
        return T.btensor4(name)
    return T.imatrix(name)
Example #15
    def __init__(self, model, latent):
        """ Initialize the stochastic block model for the adjacency matrix
        """
        self.model = model
        self.prms = model['network']['graph']
        self.N = model['N']
        self.N_dims = self.prms['N_dims']

        # Get the latent location
        self.location = latent[self.prms['locations']]
        self.Lm = self.location.Lm
        # self.location_prior = create_prior(self.prms['location_prior'])
        #
        # # Latent distance model has NxR matrix of locations L
        # self.L = T.dvector('L')
        # self.Lm = T.reshape(self.L, (self.N, self.N_dims))

        # Compute the distance between each pair of locations
        # Reshape L into a Nx1xD matrix and a 1xNxD matrix, then add the requisite
        # broadcasting in order to subtract the two matrices
        L1 = self.Lm.dimshuffle(0, 'x', 1)  # Nx1xD
        L2 = self.Lm.dimshuffle('x', 0, 1)  # 1xNxD
        T.addbroadcast(L1, 1)
        T.addbroadcast(L2, 0)
        #self.D = T.sqrt(T.sum((L1-L2)**2, axis=2))
        #self.D = T.sum((L1-L2)**2, axis=2)

        # It seems we need to use L1 norm for now because
        # Theano doesn't properly compute the gradients of the L2
        # norm. (It gives NaNs because it doesn't realize that some
        # terms will cancel out)
        # self.D = (L1-L2).norm(1, axis=2)
        self.D = T.pow(L1 - L2, 2).sum(axis=2)

        # There is a distance scale, \delta
        self.delta = T.dscalar(name='delta')

        # Define complete adjacency matrix
        self.A = T.bmatrix('A')

        # The probability of A is exponentially decreasing in delta
        # self.pA = T.exp(-1.0*self.D/self.delta)
        self.pA = T.exp(-0.5 * self.D / self.delta**2)

        if 'rho_refractory' in self.prms:
            self.pA += T.eye(self.N) * (self.prms['rho_refractory'] - self.pA)
            # self.pA[np.diag_indices(self.N)] = self.prms['rho_refractory']

        # Allow for scaling the log likelihood of the graph so that we can do
        # Annealed importance sampling
        self.lkhd_scale = theano.shared(value=1.0, name='lkhd_scale')

        # Define log probability
        self.lkhd = T.sum(self.A * T.log(self.pA) +
                          (1 - self.A) * T.log(1 - self.pA))
        # self.log_p = self.lkhd_scale * self.lkhd + self.location_prior.log_p(self.Lm)
        self.log_p = self.lkhd_scale * self.lkhd
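A self-contained sketch of the dimshuffle broadcasting trick used above to turn an NxD location matrix into an NxN matrix of pairwise squared distances, and the resulting edge probabilities; sizes are arbitrary.

import numpy as np
import theano
import theano.tensor as T

L = T.dmatrix('L')                       # N x D matrix of latent locations
L1 = L.dimshuffle(0, 'x', 1)             # N x 1 x D
L2 = L.dimshuffle('x', 0, 1)             # 1 x N x D
D = T.pow(L1 - L2, 2).sum(axis=2)        # N x N squared distances via broadcasting

delta = T.dscalar('delta')
pA = T.exp(-0.5 * D / delta**2)          # edge probability decays with distance

f = theano.function([L, delta], pA)
locs = np.random.randn(5, 2)
print(f(locs, 1.0).shape)                # (5, 5)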
Example #16
    def __init__(self, model, latent):
        """ Initialize the stochastic block model for the adjacency matrix
        """
        self.model = model
        self.prms = model['network']['graph']
        self.N = model['N']
        self.N_dims = self.prms['N_dims']

        # Get the latent location
        self.location = latent[self.prms['locations']]
        self.Lm = self.location.Lm
        # self.location_prior = create_prior(self.prms['location_prior'])
        #
        # # Latent distance model has NxR matrix of locations L
        # self.L = T.dvector('L')
        # self.Lm = T.reshape(self.L, (self.N, self.N_dims))

        # Compute the distance between each pair of locations
        # Reshape L into a Nx1xD matrix and a 1xNxD matrix, then add the requisite
        # broadcasting in order to subtract the two matrices
        L1 = self.Lm.dimshuffle(0,'x',1)     # Nx1xD
        L2 = self.Lm.dimshuffle('x',0,1)     # 1xNxD
        T.addbroadcast(L1,1)
        T.addbroadcast(L2,0)
        #self.D = T.sqrt(T.sum((L1-L2)**2, axis=2))
        #self.D = T.sum((L1-L2)**2, axis=2)

        # It seems we need to use L1 norm for now because
        # Theano doesn't properly compute the gradients of the L2
        # norm. (It gives NaNs because it doesn't realize that some
        # terms will cancel out)
        # self.D = (L1-L2).norm(1, axis=2)
        self.D = T.pow(L1-L2,2).sum(axis=2)

        # There is a distance scale, \delta
        self.delta = T.dscalar(name='delta')

        # Define complete adjacency matrix
        self.A = T.bmatrix('A')

        # The probability of A is exponentially decreasing in delta
        # self.pA = T.exp(-1.0*self.D/self.delta)
        self.pA = T.exp(-0.5*self.D/self.delta**2)

        if 'rho_refractory' in self.prms:
            self.pA += T.eye(self.N) * (self.prms['rho_refractory']-self.pA)
            # self.pA[np.diag_indices(self.N)] = self.prms['rho_refractory']

        # Allow for scaling the log likelihood of the graph so that we can do
        # Annealed importance sampling
        self.lkhd_scale = theano.shared(value=1.0, name='lkhd_scale')

        # Define log probability
        self.lkhd = T.sum(self.A * T.log(self.pA) + (1 - self.A) * T.log(1 - self.pA))
        # self.log_p = self.lkhd_scale * self.lkhd + self.location_prior.log_p(self.Lm)
        self.log_p = self.lkhd_scale * self.lkhd
Example #17
 def make_node(self, state, time):
     """Create an Apply node applying this op to the given state and time.

     :param state: the state to transform into feature space
     :param time: the current time step being processed
     :return: a theano.Apply node whose single output is a T.bmatrix()
     """
     state = T.as_tensor_variable(state)
     time = T.as_tensor_variable(time)
     return theano.Apply(self, [state, time], [T.bmatrix()])
Example #18
    def test3_ndarray(self):

        i0 = TT.iscalar()
        i1 = TT.lvector()
        i2 = TT.bmatrix()
        
        f = FAS([i1, slice(None, i0, -1), i2])
        assert f.n_in == 4
        assert f.idx_tuple == (i1.type, slice(0, i0.type, -1), i2.type,)
        assert f.view_map == {}
Example #19
 def test_any_grad(self):
     x = tensor.bmatrix("x")
     x_all = x.any()
     gx = theano.grad(x_all, x)
     f = theano.function([x], gx)
     x_random = self.rng.binomial(n=1, p=0.5, size=(5, 7)).astype("int8")
     for x_val in (x_random, numpy.zeros_like(x_random), numpy.ones_like(x_random)):
         gx_val = f(x_val)
         assert gx_val.shape == x_val.shape
         assert numpy.all(gx_val == 0)
Example #20
 def use_target(self, target, dtype):
   if target in self.y: return
   if target == "null": return
   if target == 'sizes' and not 'sizes' in self.n_out: #TODO(voigtlaender): fix data please
     self.n_out['sizes'] = [2,1]
   if self.base_network:
     self.base_network.use_target(target=target, dtype=dtype)
     if not self.y is self.base_network.y:
       self.y[target] = self.base_network.y[target]
     if not self.j is self.base_network.j:
       self.j[target] = self.base_network.j[target]
     if target not in self.n_out:
       self.n_out[target] = self.base_network.n_out[target]
     return
   if target.endswith("[sparse:coo]"):
     tprefix = target[:target.index("[")]
     ndim = self.n_out[target][1]  # expected (without batch), e.g. 2 if like (time,feature)
      # For each coordinate axis. Also with batch-dim.
     for i in range(ndim):
       self.y["%s[sparse:coo:%i:%i]" % (tprefix, ndim, i)] = T.TensorType("int32", (False,) * 2)('y_%s[sparse:coo:%i:%i]' % (tprefix, ndim, i))
     # And the data itself. Also with batch-dim.
     self.y["%s[sparse:coo:%i:%i]" % (tprefix, ndim, ndim)] = \
       T.TensorType(dtype, (False,) * 2)("y_%s[%i]" % (tprefix, ndim))
     # self.j will be used to get the list of keys we need to get from the dataset.
     for i in range(ndim + 1):
       self.j.setdefault("%s[sparse:coo:%i:%i]" % (tprefix, ndim, i), T.bmatrix('j_%s[sparse:coo:%i:%i]' % (tprefix, ndim, i)))
     # self.y[target] will be given to the OutputLayer.
     self.y[target] = tuple(self.y["%s[sparse:coo:%i:%i]" % (tprefix, ndim, i)] for i in range(ndim + 1))
     self.j[target] = self.j["data"]  # Not sure if this is the best we can do...
     return
   assert target in self.n_out
   ndim = self.n_out[target][1] + 1  # one more because of batch-dim
   self.y[target] = T.TensorType(dtype, (False,) * ndim)('y_%s' % target)
   self.y[target].n_out = self.n_out[target][0]
   self.j.setdefault(target, T.bmatrix('j_%s' % target))
   if getattr(self.y[target].tag, "test_value", None) is None:
     if ndim == 2:
       self.y[target].tag.test_value = numpy.zeros((3,2), dtype='int32')
     elif ndim == 3:
       self.y[target].tag.test_value = numpy.random.rand(3,2,self.n_out[target][0]).astype('float32')
   if getattr(self.j[target].tag, "test_value", None) is None:
     self.j[target].tag.test_value = numpy.ones((3,2), dtype="int8")
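A minimal sketch of the two pieces this method relies on: constructing a variable from a raw T.TensorType and attaching tag.test_value arrays so shapes can be checked while the graph is built. The 3D (time, batch, feature) layout and the masked-mean expression are assumptions for illustration only.

import numpy
import theano
import theano.tensor as T

# A 3D float32 tensor type, laid out as (time, batch, feature); no dim is broadcastable.
y = T.TensorType('float32', (False,) * 3)('y_data')
j = T.bmatrix('j_data')        # int8 mask over (time, batch)

# test_value arrays let compute_test_value-style shape checks run while building the graph.
y.tag.test_value = numpy.random.rand(3, 2, 4).astype('float32')
j.tag.test_value = numpy.ones((3, 2), dtype='int8')

masked_mean = (y * j.dimshuffle(0, 1, 'x')).sum() / T.maximum(j.sum(), 1)
f = theano.function([y, j], masked_mean)
print(f(y.tag.test_value, j.tag.test_value))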
Example #21
    def __init__(self, layers, err_func, backprop_func, backprop_params,
                 l_rate, batch_size=10):
        """
        :param layers:
        :param err_func: cost/error function
        :param backprop_func: backpropagation function
        :param backprop_params: parameters to pass to backprop function
        :param l_rate: learning rate
        :param batch_size: (mini-) batch size. In comparison to regular nets
        :return:
        """
        super(ConvNet, self).__init__("ConvNet", l_rate, batch_size)
        logging.info('\tConstructing ConvNet with %s layers. Learning rate: %s. Batch size: %s ',
                     len(layers), l_rate, batch_size)

        input_data = T.fmatrix('X')
        input_labels = T.bmatrix('Y')

        params = []  # Regular weights and bias weights; i.e. everything to be adjusted during training
        for layer in layers:
            for param in layer.params:
                params.append(param)
        logging.info('\tNumber of parameters to train: %s',
                     sum(param.get_value(borrow=True, return_internal_type=True).size for param in params))

        layers[0].activate(input_data, self.batch_size)
        for i in range(1, len(layers)):
            prev_layer = layers[i-1]
            current_layer = layers[i]
            current_layer.activate(prev_layer.output(), self.batch_size)

        output_layer = layers[-1].output_values
        cost = err_func(output_layer, input_labels)
        updates = backprop_func(cost, params, l_rate, **backprop_params)

        prediction = T.argmax(output_layer, axis=1)
        prediction_value = T.max(output_layer, axis=1)

        logging.debug('\tConstructing functions ...')
        self.trainer = theano.function(
            inputs=[input_data, input_labels],
            outputs=cost,
            updates=updates,
            name='Trainer',
            allow_input_downcast=True  # Allows float64 to be cast to float32, which is necessary in order to use the GPU
        )
        self.predictor = theano.function(
            inputs=[input_data],
            outputs={'char_as_int': prediction,
                     'char_probability': prediction_value,
                     'output_layer': output_layer},
            name='Predictor',
            allow_input_downcast=True
        )
Example #22
    def __init__(self, nin, nout, nhid, numpy_rng, scale=1.0):
        self.nin = nin
        self.nout = nout
        self.nhid = nhid
        self.numpy_rng = numpy_rng
        self.scale = np.float32(scale)

        self.inputs = T.fmatrix('inputs')
        self.inputs.tag.test_value = numpy_rng.uniform(
            low=-1., high=1.,
            size=(16, 5 * self.nin)
        ).astype(np.float32)
        self.targets = T.fmatrix('targets')
        self.targets.tag.test_value = np.ones(
            (16, 5 * nout), dtype=np.float32)
        self.masks = T.bmatrix('masks')
        self.masks.tag.test_value = np.ones(
            (16, 5), dtype=np.int8)
        self.batchsize = self.inputs.shape[0]

        self.inputs_frames = self.inputs.reshape((
            self.batchsize, self.inputs.shape[1] / nin,
            nin)).dimshuffle(1, 0, 2)
        self.targets_frames = self.targets.reshape((
            self.batchsize, self.targets.shape[1] / nout,
            nout)).dimshuffle(1, 0, 2)
        self.masks_frames = self.masks.T

        self.h0 = theano.shared(value=np.ones(
            nhid, dtype=theano.config.floatX) * np.float32(.5), name='h0')
        self.win = theano.shared(value=self.numpy_rng.normal(
            loc=0, scale=0.001, size=(nin, nhid)
        ).astype(theano.config.floatX), name='win')
        self.wrnn = theano.shared(value=self.scale * np.eye(
            nhid, dtype=theano.config.floatX), name='wrnn')
        self.wout = theano.shared(value=self.numpy_rng.uniform(
            low=-0.01, high=0.01, size=(nhid, nout)
        ).astype(theano.config.floatX), name='wout')
        self.bout = theano.shared(value=np.zeros(
            nout, dtype=theano.config.floatX), name='bout')

        self.params = [self.win, self.wrnn, self.wout, self.bout]

        (self.hiddens, self.outputs), self.updates = theano.scan(
            fn=self.step, sequences=self.inputs_frames,
            outputs_info=[T.alloc(
                self.h0, self.batchsize, self.nhid), None])

        self._stepcosts = T.sum((self.targets_frames - self.outputs)**2, axis=2)
        self._cost = T.switch(self.masks_frames > 0, self._stepcosts, 0).mean()
        self._grads = T.grad(self._cost, self.params)

        self.getoutputs = theano.function(
            [self.inputs], self.outputs)
Example #23
 def test_any_grad(self):
     x = tensor.bmatrix('x')
     x_all = x.any()
     gx = theano.grad(x_all, x)
     f = theano.function([x], gx)
     x_random = self.rng.binomial(n=1, p=0.5, size=(5, 7)).astype('int8')
     for x_val in (x_random,
                   numpy.zeros_like(x_random),
                   numpy.ones_like(x_random)):
         gx_val = f(x_val)
         assert gx_val.shape == x_val.shape
         assert numpy.all(gx_val == 0)
Example #24
def run(gates,
        num_registers,
        max_int,
        num_timesteps,
        num_layers,
        reg_lambda,
        params,
        clip_gradients=None):
    params = make_broadcastable(params, clip_gradients=clip_gradients)

    # Create symbolic variables for the input to the machine
    # and for the desired output of the machine.
    initial_mem = dtensor3("InMem")
    desired_mem = imatrix("OutMem")
    cost_mask = bmatrix("CostMask")
    entropy_weight = dscalar("EntropyWeight")

    # Initialize all registers to zero. Instead of using to_one_hot,
    # create the shape directly; it's simpler this way.
    initial_registers = zeros((initial_mem.shape[0], num_registers, max_int),
                              dtype='float64')
    initial_registers = set_subtensor(initial_registers[:, :, 0], 1.0)

    # Run the model for all timesteps. The arguments are
    # registers, memory, cost, cumulative probability complete,
    # and probability incomplete. The latter are initialized
    # to zero and to one, respectively.
    v0 = as_tensor(0)
    v1 = as_tensor(1)
    output = (initial_registers, initial_mem, v0, v0, v1)
    debug = {}
    for timestep in range(num_timesteps):
        debug_local, output = step_cost(gates, max_int, desired_mem, cost_mask,
                                        num_timesteps, num_registers,
                                        num_layers, entropy_weight,
                                        timestep + 1, *output, params)
        debug.update(
            ("%d:%s" % (timestep, k), v) for (k, v) in debug_local.items())

    # Add in regularization, to avoid overfitting simple examples.
    reg_cost = reg_lambda * sum((p * p).sum() for p in params)
    debug['cost-regularization'] = reg_cost

    # Get the final cost: regularization plus loss.
    final_cost = reg_cost + output[2].sum()
    debug['cost-final'] = final_cost

    # Return the symbolic variables, the final cost, and the
    # intermediate register values for analysis and prediction.
    mem = output[1]
    return debug, initial_mem, desired_mem, cost_mask, mem, final_cost, entropy_weight
Example #25
def BuildModel(modelSpecs, forTrain=True):
	rng = np.random.RandomState()

	## x is for sequential features and y for matrix (or pairwise) features
	x = T.tensor3('x')
	y = T.tensor4('y')

	## mask for x and y, respectively
	xmask = T.bmatrix('xmask')
	ymask = T.btensor3('ymask')

	xem = None
	##if any( k in modelSpecs['seq2matrixMode'] for k in ('SeqOnly', 'Seq+SS') ):
	if config.EmbeddingUsed(modelSpecs):
		xem = T.tensor3('xem')
		distancePredictor = ResNet4DistMatrix( rng, seqInput=x,
											   matrixInput=y, mask_seq=xmask, mask_matrix=ymask,
											   embedInput=xem, modelSpecs=modelSpecs )
	else:
		distancePredictor = ResNet4DistMatrix( rng, seqInput=x,
											   matrixInput=y, mask_seq=xmask, mask_matrix=ymask,
											   modelSpecs=modelSpecs )

	## labelList is a list of label tensors, each having shape (batchSize, seqLen, seqLen) or (batchSize, seqLen, seqLen, valueDims[response] )
	labelList = []
	if forTrain:
		## when this model is used for training. We need to define the label variable
		for response in modelSpecs['responses']:
			labelType = Response2LabelType(response)
			rValDims = config.responseValueDims[labelType]

			if labelType.startswith('Discrete'):
				if rValDims > 1:
				## if one response is a vector, then we use a 4-d tensor
				## wtensor is for 16bit integer
					labelList.append( T.wtensor4('Tlabel4' + response ) )
				else:
					labelList.append( T.wtensor3('Tlabel4' + response ) )
			else:
				if rValDims > 1:
					labelList.append( T.tensor4('Tlabel4' + response ) )
				else:
					labelList.append( T.tensor3('Tlabel4' + response ) )

	## weightList is a list of label weight tensors, each having shape (batchSize, seqLen, seqLen)
	weightList = []
	if len(labelList)>0 and modelSpecs['UseSampleWeight']:
		weightList = [ T.tensor3('Tweight4'+response) for response in modelSpecs['responses'] ]

	## for prediction, both labelList and weightList are empty
	return distancePredictor, x, y, xmask, ymask, xem, labelList, weightList
Example #26
    def __init__(self, nin, nout, nhid, numpy_rng, scale=1.0):
        self.nin = nin
        self.nout = nout
        self.nhid = nhid
        self.numpy_rng = numpy_rng
        self.theano_rng = RandomStreams(1)
        self.scale = np.float32(scale)

        self.inputs = T.fmatrix('inputs')
        self.targets = T.imatrix('targets')
        self.masks = T.bmatrix('masks')
        self.batchsize = self.inputs.shape[0]

        self.inputs_frames = self.inputs.reshape((
            self.batchsize, self.inputs.shape[1]/nin, nin)).dimshuffle(1,0,2)
        self.targets_frames = self.targets.T
        self.masks_frames = self.masks.T

        self.win = theano.shared(value=self.numpy_rng.normal(
            loc=0, scale=0.001, size=(nin, nhid)
        ).astype(theano.config.floatX), name='win')
        self.wrnn = theano.shared(value=self.scale * np.eye(
            nhid, dtype=theano.config.floatX), name='wrnn')
        self.wout = theano.shared(value=self.numpy_rng.uniform(
            low=-0.01, high=0.01, size=(nhid, nout)
        ).astype(theano.config.floatX), name='wout')
        self.bout = theano.shared(value=np.zeros(
            nout, dtype=theano.config.floatX), name='bout')

        self.params = [self.win, self.wrnn, self.wout, self.bout]

        (self.hiddens, self.outputs), self.updates = theano.scan(
            fn=self.step, sequences=self.inputs_frames,
            outputs_info=[self.theano_rng.uniform(low=0, high=1, size=(
                self.batchsize, nhid), dtype=theano.config.floatX), None])

        self.probabilities = T.nnet.softmax(self.outputs.reshape((
            self.outputs.shape[0] * self.outputs.shape[1],
            self.nout)))
        self.probabilities = T.clip(self.probabilities, 1e-6, 1-1e-6)

        self._stepcosts = T.nnet.categorical_crossentropy(
            self.probabilities, self.targets_frames.flatten()).reshape(
                self.targets_frames.shape)

        self._cost = T.switch(T.gt(self.masks_frames, 0), self._stepcosts, 0).mean()
        self._grads = T.grad(self._cost, self.params)

        self.get_classifications = theano.function(
            [self.inputs], T.argmax(self.probabilities.reshape(self.outputs.shape), axis=2).T)
Example #27
def main(config, tr_stream):
    # Create Theano variables
    logger.info('Creating theano variables')
    source_char_seq = tensor.lmatrix('source_char_seq')
    source_sample_matrix = tensor.btensor3('source_sample_matrix')
    source_char_aux = tensor.bmatrix('source_char_aux')
    source_word_mask = tensor.bmatrix('source_word_mask')
    target_char_seq = tensor.lmatrix('target_char_seq')
    target_char_aux = tensor.bmatrix('target_char_aux')
    target_char_mask = tensor.bmatrix('target_char_mask')
    target_sample_matrix = tensor.btensor3('target_sample_matrix')
    target_word_mask = tensor.bmatrix('target_word_mask')
    target_resample_matrix = tensor.btensor3('target_resample_matrix')
    target_prev_char_seq = tensor.lmatrix('target_prev_char_seq')
    target_prev_char_aux = tensor.bmatrix('target_prev_char_aux')
    target_bos_idx = tr_stream.trg_bos
    target_space_idx = tr_stream.space_idx['target']
    src_vocab = pickle.load(open(config['src_vocab'], 'rb'))

    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(config['src_vocab_size'], config['enc_embed'], config['src_dgru_nhids'],
                                   config['enc_nhids'], config['src_dgru_depth'], config['bidir_encoder_depth'])

    decoder = Decoder(config['trg_vocab_size'], config['dec_embed'], config['trg_dgru_nhids'], config['trg_igru_nhids'],
                      config['dec_nhids'], config['enc_nhids'] * 2, config['transition_depth'], config['trg_igru_depth'],
                      config['trg_dgru_depth'], target_space_idx, target_bos_idx)

    representation = encoder.apply(source_char_seq, source_sample_matrix, source_char_aux,
                                   source_word_mask)
    cost = decoder.cost(representation, source_word_mask, target_char_seq, target_sample_matrix,
                        target_resample_matrix, target_char_aux, target_char_mask,
                        target_word_mask, target_prev_char_seq, target_prev_char_aux)

    # Set up model
    logger.info("Building model")
    training_model = Model(cost)

    # Set extensions
    logger.info("Initializing extensions")
    # Reload model if necessary
    extensions = [LoadNMT(config['saveto'])]

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(
        model=training_model,
        algorithm=None,
        data_stream=None,
        extensions=extensions
    )

    for extension in main_loop.extensions:
        extension.main_loop = main_loop
    main_loop._run_extensions('before_training')

    char_embedding = encoder.decimator.apply(source_char_seq.T, source_sample_matrix, source_char_aux.T)
    embedding(Model(char_embedding), src_vocab)
Example #28
    def __init__(self, K, vocab_size, W_init=lasagne.init.GlorotNormal()):

        doc_var, query_var = T.itensor3('doc'), T.itensor3('quer')
        docmask_var, qmask_var, candmask_var = T.bmatrix('doc_mask'), T.bmatrix('q_mask'), \
                T.bmatrix('c_mask')
        target_var = T.ivector('ans')
        feat_var = T.bmatrix('feat')

        predicted_probs, predicted_probs_val, self.doc_net, self.q_net = self.build_network(K, \
                vocab_size, doc_var, query_var, docmask_var, qmask_var, candmask_var, feat_var, \
                W_init)

        loss_fn = T.nnet.categorical_crossentropy(predicted_probs,
                                                  target_var).mean()
        eval_fn = lasagne.objectives.categorical_accuracy(
            predicted_probs, target_var).mean()

        loss_fn_val = T.nnet.categorical_crossentropy(predicted_probs_val,
                                                      target_var).mean()
        eval_fn_val = lasagne.objectives.categorical_accuracy(
            predicted_probs_val, target_var).mean()

        params = L.get_all_params(self.doc_net, trainable=True) + L.get_all_params(self.q_net, \
                trainable=True)

        updates = lasagne.updates.adam(loss_fn,
                                       params,
                                       learning_rate=LEARNING_RATE)

        self.train_fn = theano.function([doc_var, query_var, target_var, docmask_var, qmask_var, \
                candmask_var, feat_var],
                [loss_fn, eval_fn, predicted_probs],
                updates=updates)
        self.validate_fn = theano.function([doc_var, query_var, target_var, docmask_var, qmask_var, \
                candmask_var, feat_var],
                [loss_fn_val, eval_fn_val, predicted_probs_val])
Example #29
    def __init__(self,
                 vocab_size,
                 num_classes,
                 W_init=lasagne.init.GlorotNormal()):

        doc_var, query_var = T.imatrix('doc'), T.imatrix('quer')
        docmask_var, qmask_var = T.bmatrix('doc_mask'), T.bmatrix('q_mask')
        target_var = T.imatrix('ans')
        feat_var = T.bmatrix('feat')

        self.inps = [
            doc_var, query_var, target_var, docmask_var, qmask_var, feat_var
        ]
        loss, self.params, test_out = self.build_network(vocab_size, num_classes, \
                W_init, *self.inps)

        loss = loss + REGULARIZATION*lasagne.regularization.apply_penalty(self.params, \
                lasagne.regularization.l2)
        updates = lasagne.updates.rmsprop(loss, self.params, learning_rate=LEARNING_RATE, \
                rho=0.95, epsilon=0.0001)
        self.train_fn = theano.function(self.inps, loss, updates=updates)
        self.validate_fn = theano.function(self.inps,
                                           test_out,
                                           on_unused_input='warn')
Example #30
    def __init__(self, name, config):
        super().__init__(name)
        self.config = config

        self.param('src_embeddings',
                   (len(config['src_encoder']), config['src_embedding_dims']),
                   init_f=Gaussian(fan_in=config['src_embedding_dims']))
        self.param('trg_embeddings',
                   (len(config['trg_encoder']), config['trg_embedding_dims']),
                   init_f=Gaussian(fan_in=config['trg_embedding_dims']))
        self.add(Linear('hidden',
                        config['decoder_state_dims'],
                        config['trg_embedding_dims']))
        self.add(Linear('emission',
                        config['trg_embedding_dims'],
                        len(config['trg_encoder']),
                        w=self._trg_embeddings.T))
        for prefix, backwards in (('fwd', False), ('back', True)):
            self.add(Sequence(
                prefix+'_encoder', LSTM, backwards,
                config['src_embedding_dims'] + (
                    config['encoder_state_dims'] if backwards else 0),
                config['encoder_state_dims'],
                layernorm=config['encoder_layernorm'],
                dropout=config['encoder_dropout'],
                trainable_initial=True,
                offset=0))
        self.add(Sequence(
            'decoder', LSTM, False,
            config['trg_embedding_dims'],
            config['decoder_state_dims'],
            layernorm=config['decoder_layernorm'],
            dropout=config['decoder_dropout'],
            attention_dims=config['attention_dims'],
            attended_dims=2*config['encoder_state_dims'],
            trainable_initial=False,
            offset=-1))

        h_t = T.matrix('h_t')
        self.predict_fun = function(
                [h_t],
                T.nnet.softmax(self.emission(T.tanh(self.hidden(h_t)))))

        inputs = T.lmatrix('inputs')
        inputs_mask = T.bmatrix('inputs_mask')
        self.encode_fun = function(
                [inputs, inputs_mask],
                self.encode(inputs, inputs_mask))
Example #31
def build_network(input_size, hidden_size, constraint_adj=False):
    P = Parameters()
    X = T.bmatrix('X')

    P.W_input_hidden = U.initial_weights(input_size, hidden_size)
    P.b_hidden = U.initial_weights(hidden_size)
    P.b_output = U.initial_weights(input_size)
    hidden_lin = T.dot(X, P.W_input_hidden) + P.b_hidden
    hidden = T.nnet.sigmoid(hidden_lin)
    output = T.nnet.softmax(T.dot(hidden, P.W_input_hidden.T) + P.b_output)
    parameters = P.values()
    cost = build_error(X, output, P)
    if constraint_adj: pass
    #cost = cost + adjacency_constraint(hidden_lin)

    return X, output, cost, P
Example #32
def build_network(input_size,hidden_size,constraint_adj=False):
	P = Parameters()
	X = T.bmatrix('X')
	
	P.W_input_hidden = U.initial_weights(input_size,hidden_size)
	P.b_hidden       = U.initial_weights(hidden_size)
	P.b_output       = U.initial_weights(input_size)
	hidden_lin = T.dot(X,P.W_input_hidden)+P.b_hidden
	hidden = T.nnet.sigmoid(hidden_lin)
	output = T.nnet.softmax(T.dot(hidden,P.W_input_hidden.T) + P.b_output)
	parameters = P.values() 
	cost = build_error(X,output,P) 
	if constraint_adj: pass
		#cost = cost + adjacency_constraint(hidden_lin)

	return X,output,cost,P
Example #33
def run(gates, num_registers, max_int, num_timesteps, num_layers, reg_lambda,
        params, clip_gradients=None):
    params = make_broadcastable(params, clip_gradients=clip_gradients)

    # Create symbolic variables for the input to the machine
    # and for the desired output of the machine.
    initial_mem = dtensor3("InMem")
    desired_mem = imatrix("OutMem")
    cost_mask = bmatrix("CostMask")
    entropy_weight = dscalar("EntropyWeight")

    # Initialize all registers to zero. Instead of using to_one_hot,
    # create the shape directly; it's simpler this way.
    initial_registers = zeros((initial_mem.shape[0], num_registers, max_int),
                              dtype='float64')
    initial_registers = set_subtensor(initial_registers[:, :, 0], 1.0)

    # Run the model for all timesteps. The arguments are
    # registers, memory, cost, cumulative probability complete,
    # and probability incomplete. The latter are initialized
    # to zero and to one, respectively.
    v0 = as_tensor(0)
    v1 = as_tensor(1)
    output = (initial_registers, initial_mem, v0, v0, v1)
    debug = {}
    for timestep in range(num_timesteps):
        debug_local, output = step_cost(gates, max_int, desired_mem, cost_mask,
                                        num_timesteps, num_registers,
                                        num_layers, entropy_weight, 
                                        timestep + 1, *output, params)
        debug.update(("%d:%s" % (timestep, k), v)
                     for (k, v) in debug_local.items())


    # Add in regularization, to avoid overfitting simple examples.
    reg_cost = reg_lambda * sum((p * p).sum() for p in params)
    debug['cost-regularization'] = reg_cost

    # Get the final cost: regularization plus loss.
    final_cost = reg_cost + output[2].sum()
    debug['cost-final'] = final_cost

    # Return the symbolic variables, the final cost, and the
    # intermediate register values for analysis and prediction.
    mem = output[1]
    return debug, initial_mem, desired_mem, cost_mask, mem, final_cost, entropy_weight
Example #34
    def __init__(self, model):
        """ Initialize the stochastic block model for the adjacency matrix
        """
        self.model = model
        self.prms = model['network']['graph']
        self.N = model['N']
        self.N_dims = self.prms['N_dims']

        # Create a location prior
        self.location_prior = create_prior(self.prms['location_prior'])

        # Latent distance model has NxR matrix of locations L
        self.L = T.dvector('L')
        self.Lm = T.reshape(self.L, (self.N, self.N_dims))

        # Compute the distance between each pair of locations
        # Reshape L into a Nx1xD matrix and a 1xNxD matrix, then add the requisite
        # broadcasting in order to subtract the two matrices
        L1 = self.Lm.dimshuffle(0,'x',1)     # Nx1xD
        L2 = self.Lm.dimshuffle('x',0,1)     # 1xNxD
        T.addbroadcast(L1,1)
        T.addbroadcast(L2,0)
        #self.D = T.sqrt(T.sum((L1-L2)**2, axis=2))
        #self.D = T.sum((L1-L2)**2, axis=2)

        # To get the gradients to work we need to use the L1 norm;
        # Theano doesn't properly compute the gradients of the L2 norm
        # (it gives NaNs because it doesn't realize that some terms cancel out).
        self.D = (L1-L2).norm(1, axis=2)

        # There is a distance scale, \delta
        self.delta = T.dscalar(name='delta')

        # Define complete adjacency matrix
        self.A = T.bmatrix('A')

        # The probability of A is exponentially decreasing in delta
        self.pA = T.exp(-1.0*self.D/self.delta)

        if 'rho_refractory' in self.prms:
            self.pA += T.eye(self.N) * (self.prms['rho_refractory']-self.pA)
            # self.pA[np.diag_indices(self.N)] = self.prms['rho_refractory']

        # Define log probability
        self.log_p = T.sum(self.A * T.log(self.pA) + (1 - self.A) * T.log(1 - self.pA)) + \
                    self.location_prior.log_p(self.L)
Example #35
 def make_node(self, state, time):
     """Creates an Apply node representing the application of the op on 
     the inputs provided.
     
     Parameters
     ----------
     state : array_like
         The state to transform into feature space
     time : int
         The current time being processed
     
     Returns
     -------
     theano.Apply
          The Apply node, whose single output is a T.bmatrix().
     """
     state = T.as_tensor_variable(state)
     time = T.as_tensor_variable(time)
     return theano.Apply(self, [state, time], [T.bmatrix()])
Example #36
def declare_theano_variables(output_layer, model, verbose=True):
    """
    Define target, network output, cost and learning rate.

    Parameters
    ----------
    output_layer: Lasagne layer
        Output layer.
    model: model specification file
        Contains the model config.
    verbose: bool
        Print info if True.

    Returns
    -------
    target: Theano tensor
        Prediction target.
    stochastic_out: tuple
        Theano tensors for stochastic output and cost.
    deterministic_out: tuple
        Theano tensors for deterministic output and cost.
    learning_rate: Theano shared variable
        Learning rate for the optimizers.
    """

    if verbose:
        print('\tDeclaring theano variables...')

    # scale learning rate by a factor of 0.9 if momentum is applied,
    # to counteract the larger update steps that momentum yields
    lr = model.learning_rate - 0.9 * model.learning_rate * model.momentum
    learning_rate = theano.shared(np.asarray(lr, dtype=theano.config.floatX))

    # define target placeholder for the cost functions
    target = T.bmatrix('target')

    # stochastic cost expression
    stochastic_out = define_cost(output_layer, target, model, determ=False)

    # deterministic cost expression
    deterministic_out = define_cost(output_layer, target, model, determ=True)

    return target, stochastic_out, deterministic_out, learning_rate
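A quick worked instance of the momentum-adjusted learning rate computed above, with made-up values.

# With momentum, the base rate is scaled down by 0.9 * momentum, as in the comment above.
learning_rate, momentum = 0.01, 0.9
lr = learning_rate - 0.9 * learning_rate * momentum
print(lr)   # ~0.0019 instead of 0.01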
Example #37
    def _initialize_predict_function(self):
        def predicted_note_step(time_model_output, *states):
            previous_note_model_input = states[-1]

            note_model_input = T.concatenate([time_model_output, previous_note_model_input])
            previous_hidden_state = list(states[:-1])
            note_model_output = self.note_model.forward(note_model_input, prev_hiddens=previous_hidden_state)
            probabilities = note_model_output[-1]

            generator = T.shared_randomstreams.RandomStreams(np.random.randint(0, 1024))

            is_note_played = probabilities[0] > generator.uniform()
            is_note_articulated = (probabilities[1] > generator.uniform()) * is_note_played
            prediction = T.cast(T.stack(is_note_played, is_note_articulated), 'int8')

            return note_model_output + [prediction]

        def predicted_time_step(*states):
            time_model_input = states[-2]
            previous_hidden_state = list(states[:-2])
            time_model_output = self.time_model.forward(time_model_input, prev_hiddens=previous_hidden_state)

            time_model_output_last_layer = time_model_output[-1]
            initial_note = T.alloc(0, output_size)
            note_outputs_info = self.get_time_prediction_outputs_info(initial_note)
            notes_model_output, updates = theano.scan(fn=predicted_note_step, sequences=[time_model_output_last_layer], outputs_info=note_outputs_info)

            output = notes_model_output[-1]
            time = states[-1]
            next_input = OutputTransformer()(output, time + 1)

            return (time_model_output + [next_input, time + 1, output]), updates

        length = T.iscalar()
        initial_note = T.bmatrix()

        num_notes = initial_note.shape[0]
        time_outputs_info = self.get_prediction_outputs_info(num_notes, initial_note)
        time_model_output, updates = theano.scan(fn=predicted_time_step, outputs_info=time_outputs_info, n_steps=length)
        prediction = time_model_output[-1]

        self.predict = theano.function([length, initial_note], outputs=prediction, updates=updates, allow_input_downcast=True)
Example #38
	def build_model(self):
		print("Building model and compiling functions...")
		self.sentences_macro_batch = theano.shared(np.empty((self.macro_batch_size,) + self.sentences[0].shape[1:], dtype=np.int32), borrow=True)
		self.masks_macro_batch = theano.shared(np.empty((self.macro_batch_size,) + self.masks[0].shape[1:], dtype=np.int8), borrow=True)
		self.labels_macro_batch = theano.shared(np.empty((self.macro_batch_size,) + self.labels[0].shape[1:], dtype=theano.config.floatX), borrow=True)

		sentences_in = T.imatrix('sentences')
		masks_in = T.bmatrix('masks')
		labels_in = T.fvector('labels')
		i = T.iscalar()
		
		flattened = self.define_layers(sentences_in,masks_in)

		self.model = flattened
		prediction = T.clip(lasagne.layers.get_output(flattened),1.0e-7, 1.0 - 1.0e-7)
		test_prediction = T.clip(lasagne.layers.get_output(flattened, deterministic=True), 1.0e-7, 1.0 - 1.0e-7)

		loss,test_loss = self.define_losses(prediction,test_prediction,labels_in)

		params = lasagne.layers.get_all_params(flattened, trainable=True)
		updates = lasagne.updates.adadelta(loss, params)
		
		self.train_fn = theano.function([i], [loss, prediction], updates=updates,
			givens={
			sentences_in: self.sentences_macro_batch[i * self.micro_batch_size:(i + 1) * self.micro_batch_size],
			masks_in: self.masks_macro_batch[i * self.micro_batch_size:(i + 1) * self.micro_batch_size],
			labels_in: self.labels_macro_batch[i * self.micro_batch_size:(i + 1) * self.micro_batch_size]})
		self.train_rest_fn = theano.function([i], [loss, prediction], updates=updates,
			givens={
			sentences_in: self.sentences_macro_batch[:i],
			masks_in: self.masks_macro_batch[:i],
			labels_in: self.labels_macro_batch[:i]})
		self.test_fn = theano.function([i], [test_loss, test_prediction],
			givens={
			sentences_in: self.sentences_macro_batch[i * self.micro_batch_size:(i + 1) * self.micro_batch_size],
			masks_in: self.masks_macro_batch[i * self.micro_batch_size:(i + 1) * self.micro_batch_size],
			labels_in: self.labels_macro_batch[i * self.micro_batch_size:(i + 1) * self.micro_batch_size]})
		self.test_rest_fn = theano.function([i], [test_loss, test_prediction],
			givens={
			sentences_in: self.sentences_macro_batch[:i],
			masks_in: self.masks_macro_batch[:i],
			labels_in: self.labels_macro_batch[:i]})
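# A minimal driver sketch for the macro/micro-batch pattern above (an assumption,
# not part of the original example): copy one macro batch of data into the shared
# buffers, then sweep the micro-batch index that the `givens` slices expect.
def train_macro_batch(self, sentences, masks, labels):
    self.sentences_macro_batch.set_value(sentences, borrow=True)
    self.masks_macro_batch.set_value(masks, borrow=True)
    self.labels_macro_batch.set_value(labels, borrow=True)
    total_loss = 0.0
    for i in range(self.macro_batch_size // self.micro_batch_size):
        loss, _ = self.train_fn(i)  # micro batch i of the current macro batch
        total_loss += loss
    return total_loss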
Exemplo n.º 39
0
    def __init__(self, model):
        """ Initialize the filtered stim model
        """
        self.model = model
        self.prms = model['network']['graph']
        N = model['N']

        self.rho = self.prms['rho'] * np.ones((N, N))

        if 'rho_refractory' in self.prms:
            self.rho[np.diag_indices(N)] = self.prms['rho_refractory']

        self.pA = theano.shared(value=self.rho, name='pA')

        # Define complete adjacency matrix
        self.A = T.bmatrix('A')

        # Define log probability
        self.log_p = T.sum(self.A * np.log(np.minimum(1.0-1e-8, self.rho)) +
                           (1 - self.A) * np.log(np.maximum(1e-8, 1.0 - self.rho)))
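# A hypothetical evaluation sketch for the symbolic log probability above (not part
# of the original example): compile `log_p` into a callable and score a sampled
# binary adjacency matrix. `prior` stands in for an instance of the class above.
import numpy as np
import theano

log_p_fn = theano.function([prior.A], prior.log_p)
A_sample = (np.random.rand(*prior.rho.shape) < prior.rho).astype('int8')
print(log_p_fn(A_sample))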
Exemplo n.º 40
0
    def test1_ndarray(self):

        i0 = TT.iscalar()
        i1 = TT.lvector()
        i2 = TT.bmatrix()

        
        f = FAS([i0])
        assert f.idx_tuple == (i0.type,)
        assert f.view_map == {0:[0]}
        assert f.n_in == 2

        f = FAS([i1])
        assert f.idx_tuple == (i1.type,)
        assert f.view_map == {}
        assert f.n_in == 2

        f = FAS([i2])
        assert f.idx_tuple == (i2.type,)
        assert f.view_map == {}
        assert f.n_in == 2
Exemplo n.º 41
0
def simpleRecurrentModel(
    word2vec,
    inputVocabSize,
    batch_size,
    maxGrad,
    hiddenSize,
):

    print("Building Model ...")

    # Input Layer
    l_in = lasagne.layers.InputLayer((batch_size, None), T.imatrix())
    l_mask = lasagne.layers.InputLayer((batch_size, None), T.bmatrix())

    # Embedding Layer
    l_embedding = lasagne.layers.EmbeddingLayer(incoming=l_in,
                                                input_size=inputVocabSize,
                                                output_size=word2vecDimension,
                                                W=word2vec)

    # Sentence Encoding
    l_encoding = lasagne.layers.GRULayer(l_embedding,
                                         hiddenSize,
                                         mask_input=l_mask,
                                         grad_clipping=maxGrad,
                                         only_return_final=True)

    # Intermediate Processing Layer
    l_classify = lasagne.layers.DenseLayer(
        l_encoding,
        num_units=hiddenSize,
        nonlinearity=lasagne.nonlinearities.rectify)

    # Predicting sentiment
    l_out = lasagne.layers.DenseLayer(
        l_classify, num_units=1, nonlinearity=lasagne.nonlinearities.sigmoid)

    return l_in, l_mask, l_out
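# A hedged sketch of how the returned layers might be wired into a training function
# (not part of the original example). The call arguments, target variable and
# learning rate below are illustrative placeholders.
import theano
import theano.tensor as T
import lasagne

l_in, l_mask, l_out = simpleRecurrentModel(word2vec, inputVocabSize,
                                           batch_size, maxGrad, hiddenSize)
targets = T.fvector('targets')                       # one sentiment label per example
prediction = lasagne.layers.get_output(l_out).flatten()
loss = lasagne.objectives.binary_crossentropy(prediction, targets).mean()
params = lasagne.layers.get_all_params(l_out, trainable=True)
updates = lasagne.updates.adam(loss, params, learning_rate=1e-3)
train_fn = theano.function([l_in.input_var, l_mask.input_var, targets],
                           loss, updates=updates, allow_input_downcast=True)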
Exemplo n.º 42
0
def GetProbFunctions(num_features, learning_rate=1e-4, ret_updates=True):
    adjustment_var = T.bmatrix(name='Adjustment matrix')
    features_var = T.fmatrix(name='Features')
    mask_var = T.bvector(name='Filter mask')
    reward_var = T.scalar(name='Reward')
    net = BuildGraphNetwork(adjustment_var, features_var, mask_var,
                            num_features)
    desc = lasagne.layers.get_output(net['desc'])
    prob = msoftmax(theano.gradient.grad_clip(desc, -1, 1))
    reward_grad = reward_var / prob
    params = lasagne.layers.get_all_params(net['desc'], trainable=True)
    grads = theano.grad(None, params, known_grads={prob: reward_grad})
    updates = lasagne.updates.momentum(grads,
                                       params,
                                       learning_rate=learning_rate)
    action_fn = theano.function([adjustment_var, features_var, mask_var], prob)
    if ret_updates:
        updates_fn = theano.function(
            [adjustment_var, features_var, mask_var, reward_var], [],
            updates=updates,
            allow_input_downcast=True)
        return net, action_fn, updates_fn
    else:
        return net, action_fn
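# A hypothetical call sequence for GetProbFunctions (a sketch, not part of the
# original example); the graph size, feature count and reward below are arbitrary.
import numpy as np

num_nodes, num_features = 10, 8
net, action_fn, updates_fn = GetProbFunctions(num_features)
adjacency = np.zeros((num_nodes, num_nodes), dtype=np.int8)
features = np.random.rand(num_nodes, num_features).astype(np.float32)
mask = np.ones(num_nodes, dtype=np.int8)

probs = action_fn(adjacency, features, mask)   # action probabilities from the policy
updates_fn(adjacency, features, mask, 1.0)     # REINFORCE-style update with reward 1.0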
Exemplo n.º 43
0
def train():
    if not os.path.exists(train_dataset_path):
        generate_dataset()

    train_x, train_x_mask, train_y = cPickle.load(open(train_dataset_path, 'r'))
    valid_x, valid_x_mask, valid_y = cPickle.load(open(valid_dataset_path, 'r'))


    num_train_batchs = len(train_y) / batch_size
    num_valid_batchs = len(valid_y) / valid_batch_size

    print 't: %d, tb: %d, v: %d, vb: %d'%(len(train_y), num_train_batchs, len(valid_y), num_valid_batchs)


    shared_x_train, shared_y_train = shared_dataset(train_x, train_y)
    shared_mask = shared_data(train_x_mask, dtype = 'int8')

    shared_x_valid, shared_y_valid = shared_dataset(valid_x, valid_y)
    shared_valid_mask = shared_data(valid_x_mask, dtype = 'int8')


    index = T.lscalar('index')
    input_var = T.lmatrix('input')
    target_var = T.ivector('target')
    mask_var = T.bmatrix('mask')

    network = build_model(max_seq_length, input_var, mask_var)
    prediction = lasagne.layers.get_output(network)
    test_output = lasagne.layers.get_output(network, deterministic=True)

    test_acc =  T.mean( T.eq(T.argmax(test_output, axis = 1), target_var), dtype = theano.config.floatX)
    loss = lasagne.objectives.categorical_crossentropy(prediction, target_var).mean()

    params = lasagne.layers.get_all_params(network, trainable = True)
    updates = lasagne.updates.adadelta(loss, params, learning_rate)

    train_fn = theano.function([index],
                            outputs = loss,
                            updates = updates,
                            givens={
                                    input_var: shared_x_train[index * batch_size: (index + 1) * batch_size],
                                    target_var: shared_y_train[index * batch_size: (index + 1) * batch_size],
                                    mask_var: shared_mask[index * batch_size: (index + 1) * batch_size],
                                    }
                            )

    valid_fn = theano.function([index],
                            outputs = test_acc,
                            givens={
                                    input_var: shared_x_valid[index * valid_batch_size: (index + 1) * valid_batch_size],
                                    target_var: shared_y_valid[index * valid_batch_size: (index + 1) * valid_batch_size],
                                    mask_var: shared_valid_mask[index * valid_batch_size: (index + 1) * valid_batch_size],
                                    }
                            )

    print 'compile over...'
    best_acc = 0.0
    for epoch in xrange(num_epoch):
        loss = 0.0
        acc = 0.0
        
        indices = range(0, num_train_batchs)
        numpy.random.shuffle(indices)
        start_time = time.time()

        for batch in indices:
            loss += train_fn(batch)

        valid_indices = range(0, num_valid_batchs)
        for batch in valid_indices:
            acc += valid_fn(batch)
        
        loss /= num_train_batchs
        acc /= num_valid_batchs

        print("Epoch {} of {} took {:.3f}s".format(
            epoch + 1, num_epoch, time.time() - start_time))
        print("  training loss:\t\t{:.6f}".format(loss))
        print("  valid accuracy:\t\t{:.2f} %\n".format(acc * 100))

        if best_acc < acc:
            best_acc = acc
            cPickle.dump((input_var, mask_var, network), open(lstm_path, 'w'))
            print 'save lstm to %s, best valid accuracy: %.2f%%\n'%(lstm_path, best_acc * 100)
Exemplo n.º 44
0
    def make_node(self, state, time):
        state = T.as_tensor_variable(state)
        time = T.as_tensor_variable(time)

        return theano.Apply(self, [state, time], [T.bmatrix()])
Exemplo n.º 45
0
    def make_node(self, state, time):
        state = T.as_tensor_variable(state)
        time = T.as_tensor_variable(time)
        return theano.Apply(self, [state, time], [T.bmatrix()])
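# Both snippets above only show `make_node` of a custom Op that declares an int8
# matrix output. A minimal sketch of the remaining piece (an assumption, not part of
# the original examples): `perform` fills the output that `make_node` declared, and
# the thresholding rule used here is purely illustrative.
import numpy as np
import theano
import theano.tensor as T


class StateToBinaryMatrix(theano.Op):
    __props__ = ()

    def make_node(self, state, time):
        state = T.as_tensor_variable(state)
        time = T.as_tensor_variable(time)
        return theano.Apply(self, [state, time], [T.bmatrix()])

    def perform(self, node, inputs, output_storage):
        state, time = inputs
        # Illustrative rule: assume `state` is a matrix and mark entries above 0.5.
        output_storage[0][0] = (np.asarray(state) > 0.5).astype(np.int8)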
Exemplo n.º 46
0
    def __init__(self, K, vocab_size, num_chars, W_init, regularizer, rlambda,
                 nhidden, embed_dim, dropout, train_emb, subsample, char_dim,
                 use_feat):
        self.nhidden = nhidden
        self.embed_dim = embed_dim
        self.dropout = dropout
        self.train_emb = train_emb
        self.subsample = subsample
        self.char_dim = char_dim
        self.learning_rate = LEARNING_RATE
        self.num_chars = num_chars
        self.use_feat = use_feat

        norm = lasagne.regularization.l2 if regularizer == 'l2' else lasagne.regularization.l1
        self.use_chars = self.char_dim != 0
        if W_init is None:
            W_init = lasagne.init.GlorotNormal().sample(
                (vocab_size, self.embed_dim))

        doc_var, query_var, cand_var = T.itensor3('doc'), T.itensor3('quer'), \
                T.wtensor3('cand')
        docmask_var, qmask_var, candmask_var = T.bmatrix('doc_mask'), T.bmatrix('q_mask'), \
                T.bmatrix('c_mask')
        target_var = T.ivector('ans')
        feat_var = T.imatrix('feat')
        doc_toks, qry_toks = T.imatrix('dchars'), T.imatrix('qchars')
        tok_var, tok_mask = T.imatrix('tok'), T.bmatrix('tok_mask')
        cloze_var = T.ivector('cloze')
        self.inps = [
            doc_var, doc_toks, query_var, qry_toks, cand_var, target_var,
            docmask_var, qmask_var, tok_var, tok_mask, candmask_var, feat_var,
            cloze_var
        ]

        if rlambda > 0.:
            W_pert = W_init + lasagne.init.GlorotNormal().sample(W_init.shape)
        else:
            W_pert = W_init
        self.predicted_probs, predicted_probs_val, self.doc_net, self.q_net, W_emb = (
            self.build_network(K, vocab_size, W_pert))

        self.loss_fn = T.nnet.categorical_crossentropy(self.predicted_probs, target_var).mean() + \
                rlambda*norm(W_emb-W_init)
        self.eval_fn = lasagne.objectives.categorical_accuracy(
            self.predicted_probs, target_var).mean()

        loss_fn_val = T.nnet.categorical_crossentropy(predicted_probs_val, target_var).mean() + \
                rlambda*norm(W_emb-W_init)
        eval_fn_val = lasagne.objectives.categorical_accuracy(
            predicted_probs_val, target_var).mean()

        self.params = L.get_all_params([self.doc_net] + self.q_net,
                                       trainable=True)

        updates = lasagne.updates.adam(self.loss_fn,
                                       self.params,
                                       learning_rate=self.learning_rate)

        self.train_fn = theano.function(
            self.inps, [self.loss_fn, self.eval_fn, self.predicted_probs],
            updates=updates,
            on_unused_input='warn')
        self.validate_fn = theano.function(
            self.inps, [loss_fn_val, eval_fn_val, predicted_probs_val],
            on_unused_input='warn')
Exemplo n.º 47
0
    def setup_predict(self):
        # In prediction mode, note steps are contained in the time steps. So the passing gets a little bit hairy.

        self.predict_seed = T.bmatrix()
        self.steps_to_simulate = T.iscalar()

        def step_time(*states):
            # States is [ *hiddens, prev_result, time]
            hiddens = list(states[:-2])
            in_data = states[-2]
            time = states[-1]

            # correct for dropout
            if self.dropout > 0:
                masks = [1 - self.dropout for layer in self.time_model.layers]
                masks[0] = None
            else:
                masks = []

            new_states = self.time_model.forward(in_data, prev_hiddens=hiddens, dropout=masks)

            # Now new_states is a list of matrix [layer](notes, hidden_states) for each layer
            time_final = get_last_layer(new_states)

            start_note_values = theano.tensor.alloc(0, 2)

            # This gets a little bit complicated. In the training case, we can pass in a combination of the
            # time net's activations with the known choices. But in the prediction case, those choices don't
            # exist yet. So instead of iterating over the combination, we iterate over only the activations,
            # and then combine in the previous outputs in the step. And then since we are passing outputs to
            # previous inputs, we need an additional outputs_info for the initial "previous" output of zero.
            note_outputs_info = ([ initial_state_with_taps(layer) for layer in self.pitch_model.layers ] +
                                 [ dict(initial=start_note_values, taps=[-1]) ])

            notes_result, updates = theano.scan(fn=self._predict_step_note, sequences=[time_final], outputs_info=note_outputs_info)

            # Now notes_result is a list of matrix [layer/output](notes, onOrArtic)
            output = get_last_layer(notes_result)

            next_input = OutputFormToInputFormOp()(output, time + 1) # TODO: Fix time
            #next_input = T.cast(T.alloc(0, 3, 4),'int64')

            return (ensure_list(new_states) + [ next_input, time + 1, output ]), updates

        num_notes = self.predict_seed.shape[0]

        time_outputs_info = ([ initial_state_with_taps(layer, num_notes) for layer in self.time_model.layers ] +
                             [ dict(initial=self.predict_seed, taps=[-1]),
                               dict(initial=0, taps=[-1]),
                               None ])

        time_result, updates = theano.scan( fn=step_time,
                                            outputs_info=time_outputs_info,
                                            n_steps=self.steps_to_simulate )

        self.predict_thoughts = time_result

        self.predicted_output = time_result[-1]

        self.predict_fun = theano.function(
            inputs=[self.steps_to_simulate, self.conservativity, self.predict_seed],
            outputs=self.predicted_output,
            updates=updates,
            allow_input_downcast=True)

        self.predict_thought_fun = theano.function(
            inputs=[self.steps_to_simulate, self.conservativity, self.predict_seed],
            outputs=ensure_list(self.predict_thoughts),
            updates=updates,
            allow_input_downcast=True)
Exemplo n.º 48
0
    def ready(self):
        embedding_layer = self.embedding_layer
        args = self.args
        padding_id = embedding_layer.vocab_map["<padding>"]

        dropout = self.dropout = theano.shared(
            np.float64(args.dropout).astype(theano.config.floatX))

        # len*batch
        x = self.x = T.imatrix()

        z = self.z = T.bmatrix()
        z = z.dimshuffle((0, 1, "x"))

        # batch*nclasses
        y = self.y = T.fmatrix()

        n_d = args.hidden_dimension
        n_e = embedding_layer.n_d
        activation = get_activation_by_name(args.activation)

        layers = self.layers = []
        depth = args.depth
        layer_type = args.layer.lower()
        for i in range(depth):
            if layer_type == "rcnn":
                l = ExtRCNN(n_in=n_e if i == 0 else n_d,
                            n_out=n_d,
                            activation=activation,
                            order=args.order)
            elif layer_type == "lstm":
                l = ExtLSTM(n_in=n_e if i == 0 else n_d,
                            n_out=n_d,
                            activation=activation)
            layers.append(l)

        # len * batch * 1
        masks = T.cast(
            T.neq(x, padding_id).dimshuffle((0, 1, "x")) * z,
            theano.config.floatX)
        # batch * 1
        cnt_non_padding = T.sum(masks, axis=0) + 1e-8

        # (len*batch)*n_e
        embs = embedding_layer.forward(x.ravel())
        # len*batch*n_e
        embs = embs.reshape((x.shape[0], x.shape[1], n_e))
        embs = apply_dropout(embs, dropout)

        pooling = args.pooling
        lst_states = []
        h_prev = embs
        for l in layers:
            # len*batch*n_d
            h_next = l.forward_all(h_prev, z)
            if pooling:
                # batch * n_d
                masked_sum = T.sum(h_next * masks, axis=0)
                lst_states.append(masked_sum / cnt_non_padding)  # mean pooling
            else:
                lst_states.append(h_next[-1])  # last state
            h_prev = apply_dropout(h_next, dropout)

        if args.use_all:
            size = depth * n_d
            # batch * size (i.e. n_d*depth)
            h_final = T.concatenate(lst_states, axis=1)
        else:
            size = n_d
            h_final = lst_states[-1]
        h_final = apply_dropout(h_final, dropout)

        output_layer = self.output_layer = Layer(n_in=size,
                                                 n_out=self.nclasses,
                                                 activation=sigmoid)

        # batch * nclasses
        preds = self.preds = output_layer.forward(h_final)

        # batch * nclasses
        loss_mat = self.loss_mat = (preds - y)**2
        loss = self.loss = T.mean(loss_mat)

        pred_diff = self.pred_diff = T.mean(
            T.max(preds, axis=1) - T.min(preds, axis=1))

        params = self.params = []
        for l in layers + [output_layer]:
            for p in l.params:
                params.append(p)
        nparams = sum(len(x.get_value(borrow=True).ravel()) \
                                        for x in params)
        say("total # parameters: {}\n".format(nparams))

        l2_cost = None
        for p in params:
            if l2_cost is None:
                l2_cost = T.sum(p**2)
            else:
                l2_cost = l2_cost + T.sum(p**2)
        l2_cost = l2_cost * args.l2_reg
        self.l2_cost = l2_cost

        cost = self.cost = loss * 10 + l2_cost
Exemplo n.º 49
0
def test_local_gpu_elemwise_0():
    """
    Test local_gpu_elemwise_0 when there is a dtype upcastable to float32
    """
    a = tensor.bmatrix()
    b = tensor.fmatrix()
    c = tensor.fmatrix()

    a_v = (numpy.random.rand(4, 5) * 10).astype("int8")
    b_v = (numpy.random.rand(4, 5) * 10).astype("float32")
    c_v = (numpy.random.rand(4, 5) * 10).astype("float32")

    # Due to optimization order, this composite is created when all
    # the ops are on the gpu.
    f = theano.function([a, b, c], a + b + c, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert sum(isinstance(node.op, cuda.GpuElemwise) for node in topo) == 1
    assert sum(isinstance(node.op, tensor.Elemwise) for node in topo) == 1
    utt.assert_allclose(f(a_v, b_v, c_v), a_v + b_v + c_v)

    # Now test with the composite already on the cpu before we move it
    # to the gpu
    a_s = theano.scalar.int8()
    b_s = theano.scalar.float32()
    c_s = theano.scalar.float32()
    out_s = theano.scalar.Composite([a_s, b_s, c_s], [a_s + b_s + c_s])
    out_op = tensor.Elemwise(out_s)
    f = theano.function([a, b, c], out_op(a, b, c), mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert sum(isinstance(node.op, cuda.GpuElemwise) for node in topo) == 1
    assert sum(isinstance(node.op, tensor.Elemwise) for node in topo) == 1
    utt.assert_allclose(f(a_v, b_v, c_v), a_v + b_v + c_v)

    # Test multiple output
    a_s = theano.scalar.float32()
    a = tensor.fmatrix()
    from theano.scalar.basic import identity
    out_s = theano.scalar.Composite(
        [a_s, b_s, c_s],
        [identity(a_s), identity(c_s),
         identity(b_s)])
    outs_op = tensor.Elemwise(out_s)
    f = theano.function([a, b, c], outs_op(a, b, c), mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert sum(isinstance(node.op, cuda.GpuElemwise) for node in topo) == 1
    assert sum(isinstance(node.op, tensor.Elemwise) for node in topo) == 0
    out = f(a_v, b_v, c_v)
    utt.assert_allclose(out[0], a_v)
    utt.assert_allclose(out[1], c_v)
    utt.assert_allclose(out[2], b_v)

    # Test multiple output
    out_s = theano.scalar.Composite([a_s, b_s, c_s], [a_s + b_s, a_s * c_s])
    outs_op = tensor.Elemwise(out_s)
    f = theano.function([a, b, c], outs_op(a, b, c), mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert sum(isinstance(node.op, cuda.GpuElemwise) for node in topo) == 1
    assert sum(isinstance(node.op, tensor.Elemwise) for node in topo) == 0
    out = f(a_v, b_v, c_v)
    utt.assert_allclose(out[0], a_v + b_v)
    utt.assert_allclose(out[1], a_v * c_v)

    # Test non-contiguous input
    c = cuda.shared_constructor(c_v)
    f = theano.function([a, b],
                        outs_op(a[::2], b[::2], c[::2]),
                        mode=mode_with_gpu)
    out = f(a_v, b_v)
    utt.assert_allclose(out[0], a_v[::2] + b_v[::2])
    utt.assert_allclose(out[1], a_v[::2] * c_v[::2])
Exemplo n.º 50
0
                RESNET_SGDM_LR.set_value(RESNET_SGDM_LR.get_value() * EPOCH_LR_COEFF)
                RECURR_SGDM_LR.set_value(RECURR_SGDM_LR.get_value() * EPOCH_LR_COEFF)
            ADAM_EPOCHS = 0
        else:
            for _ in xrange(max_epoch):
                RESNET_ADAM_LR.set_value(RESNET_ADAM_LR.get_value() * EPOCH_LR_COEFF)
                RECURR_ADAM_LR.set_value(RECURR_ADAM_LR.get_value() * EPOCH_LR_COEFF)
        NUM_EPOCHS -= max_epoch
        param_values_file = 'ln_hs_param_values_{}.pkl'.format(max_epoch)

    logger.info('Building the network.')
    im_features = lasagne.layers.get_output(resnet['pool5'])
    im_features = T.flatten(im_features, outdim=2) # batch size, number of features
    cap_out_var = T.imatrix('cap_out')  # batch size, seq len
    cap_in_var = T.imatrix('cap_in')    # batch size, seq len
    mask_var = T.bmatrix('mask_var')    # batch size, seq len
    gate = lasagne.layers.Gate(W_in=lasagne.init.Orthogonal(), W_hid=lasagne.init.Orthogonal(),
                               W_cell=lasagne.init.Normal(), b=lasagne.init.Constant(0.0))
    cell_gate = lasagne.layers.Gate(W_in=lasagne.init.Orthogonal(), W_hid=lasagne.init.Orthogonal(),
                                    W_cell=None, b=lasagne.init.Constant(0.0),
                                    nonlinearity=lasagne.nonlinearities.tanh)
    forget_gate = lasagne.layers.Gate(W_in=lasagne.init.Orthogonal(), W_hid=lasagne.init.Orthogonal(),
                                      W_cell=lasagne.init.Normal(), b=lasagne.init.Constant(5.0))
    l_in = lasagne.layers.InputLayer((None, None), cap_in_var, name="l_in")
    l_mask = lasagne.layers.InputLayer((None, None), mask_var, name="l_mask")
    l_hid = lasagne.layers.InputLayer((None, HIDDEN_SIZE), input_var=im_features, name="l_hid")
    l_emb = lasagne.layers.EmbeddingLayer(l_in, input_size=WORD_SIZE, output_size=EMBEDDING_SIZE, name="l_emb")
    l_lstm = LNLSTMLayer(l_emb, HIDDEN_SIZE, ingate=gate, forgetgate=forget_gate, cell=cell_gate,
                                    outgate=gate, hid_init=l_hid, peepholes=True, grad_clipping=RNN_GRAD_CLIP,
                                    mask_input=l_mask, precompute_input=False,
                                    alpha_init=lasagne.init.Constant(0.2), # as suggested by Ryan Kiros on Twitter
Exemplo n.º 51
0
def main(num_epochs=DEFAULT_NUM_EPOCHS, batch_size=DEFAULT_BATCH_SIZE):
    input_var = T.tensor4('inputs')
    target_var = T.bmatrix('targets')

    network = build_neural_network(
        input_var, (batch_size, 3, IMAGE_SIZE, IMAGE_SIZE))['prob']

    prediction = lasagne.layers.get_output(network)
    loss = lasagne.objectives.binary_crossentropy(prediction, target_var)
    loss = loss.mean()

    params = lasagne.layers.get_all_params(network, trainable=True)
    updates = lasagne.updates.adadelta(loss, params)

    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_loss = lasagne.objectives.binary_crossentropy(test_prediction,
                                                       target_var)
    test_loss = test_loss.mean()

    test_accuracy = T.mean(T.eq(T.gt(test_prediction, 0.5),
                                T.eq(target_var, 1.0)),
                           dtype=theano.config.floatX)

    train_function = theano.function([input_var, target_var],
                                     loss,
                                     updates=updates)

    validation_function = theano.function([input_var, target_var],
                                          [test_loss, test_accuracy])

    number_of_image_files = get_number_of_image_files_in_path()

    print "Number of files found: ", number_of_image_files

    print("Starting training...")

    # Move this to the epoch loop and remove the cycle for lower memory computers
    data_generator = cycle(get_input_images_and_ouput_labels())
    best_accuracy = 0
    for epoch in range(num_epochs):
        train_generator = get_percentage_of_generator(data_generator,
                                                      number_of_image_files,
                                                      batch_size, 0.6)
        validation_generator = get_percentage_of_generator(
            data_generator, number_of_image_files, batch_size, 0.2)
        test_generator = get_percentage_of_generator(data_generator,
                                                     number_of_image_files,
                                                     batch_size, 0.2)

        # In each epoch, we do a full pass over the training data:
        train_error = 0
        train_batches = 0
        start_time = time.time()
        while True:
            try:
                batch = generate_minibatches(train_generator, batch_size)
            except StopIteration:
                break
            if len(batch) != batch_size:
                break
            X, Y = get_input_and_output_from_batch(batch)
            train_error += train_function(X, Y)
            train_batches += 1

        print "Finished training for epoch {} with a total of {} batches".format(
            epoch + 1, train_batches)

        # And a full pass over the validation data:
        val_error = 0
        val_accuracy = 0
        val_batches = 0
        while True:
            try:
                batch = generate_minibatches(validation_generator, batch_size)
            except StopIteration:
                break
            if len(batch) != batch_size:
                break
            X, Y = get_input_and_output_from_batch(batch)
            err, acc = validation_function(X, Y)
            val_error += err
            val_accuracy += acc
            val_batches += 1

        # Then we print the results for this epoch:
        print("Epoch {} of {} took {:.3f}s".format(epoch + 1, num_epochs,
                                                   time.time() - start_time))
        print("  training loss:\t\t{:.6f}".format(train_error / train_batches))
        print("  validation loss:\t\t{:.6f}".format(val_error / val_batches))
        print("  validation accuracy:\t\t{:.2f} %".format(val_accuracy /
                                                          val_batches * 100))

        print("  train / valid:\t\t{:.6f}".format(
            (train_error / train_batches) / (val_error / val_batches)))

        if val_accuracy > best_accuracy:
            print "Accuracy better than previous best, saving model"
            write_model_data(
                network,
                "models/model%s.pkl" % int(val_accuracy / val_batches * 100))
            best_accuracy = val_accuracy

    # After training, we compute and print the test error:
    test_error = 0
    test_accuracy = 0
    test_batches = 0
    while True:
        try:
            batch = generate_minibatches(test_generator, batch_size)
        except StopIteration:
            break
        if len(batch) != batch_size:
            break
        X, Y = get_input_and_output_from_batch(batch)
        err, acc = validation_function(X, Y)
        test_error += err
        test_accuracy += acc
        test_batches += 1
    print("Final results:")
    print("  test loss:\t\t\t{:.6f}".format(test_error / test_batches))
    print("  test accuracy:\t\t{:.2f} %".format(test_accuracy / test_batches *
                                                100))
Exemplo n.º 52
0
    def __init__(self, args, params=None, attention=False, bidir=False, subset_grad=True, pyramid=False):
        self.rnn_dim = args.rnn_dim
        self.rlayers = args.rlayers
        self.attention = attention

        lr = T.scalar(dtype=floatX)
        pdrop = T.scalar(dtype=floatX)
        max_norm = T.scalar(dtype=floatX)

        # initialize input tensors

        src_sent = T.imatrix('src_sent')
        rev_src_sent = T.imatrix('rev_src_sent')
        src_mask = T.bmatrix('src_mask')
        tgt_sent = T.imatrix('tgt_sent')
        tgt_mask = T.bmatrix('tgt_mask')
        space_mask = T.bmatrix('space_mask')

        # build up model
        # https://groups.google.com/forum/#!topic/torch7/-NBrFw8Q6_s
        # NOTE: can't use one-hot inputs here because that would require a huge matrix multiply
        self.L_enc = theano.shared(uniform_init(args.src_vocab_size, args.rnn_dim, scale=0.1),
                'L_enc', borrow=True)
        self.L_dec = theano.shared(uniform_init(args.tgt_vocab_size, args.rnn_dim, scale=0.1),
                'L_dec', borrow=True)
        enc_input = src_sent if not args.reverse else rev_src_sent
        if bidir:
            print('Using bidirectional encoder')
            self.encoder = BiRNNEncoder(src_sent.T, rev_src_sent.T, src_mask.T, space_mask.T, self.L_enc, pdrop, args)
        elif pyramid:
            print('Using pyramid encoder')
            self.encoder = BiPyrRNNEncoder(src_sent.T, rev_src_sent.T, src_mask.T, self.L_enc, pdrop, args)
        else:
            self.encoder = RNNEncoder(enc_input.T, src_mask.T, space_mask.T, self.L_enc, pdrop, args)
        if attention:
            self.decoder = RNNDecoderAttention(self.encoder, tgt_sent.T, tgt_mask.T,
                    self.L_dec, pdrop, args)
            hs = self.decoder.hs
        else:
            self.decoder = RNNDecoder(self.encoder.out, tgt_sent.T, tgt_mask.T,
                    self.L_dec, pdrop, args)

        # cost, parameters, grads, updates

        self.cost = self.decoder.cost
        self.params = self.encoder.params + self.decoder.params + [self.L_enc, self.L_dec]
        if subset_grad:  # for speed
            self.grad_params = self.encoder.params + self.decoder.params + [self.encoder.subset, self.decoder.subset]
            self.updates, self.grad_norm, self.param_norm = get_opt_fn(args.optimizer)(self.cost, self.grad_params, lr, max_norm=max_norm)
            # instead of updating all of L_enc and L_dec, we only want to update the embeddings that were indexed, so use inc_subtensor/set_subtensor
            # http://deeplearning.net/software/theano/tutorial/faq_tutorial.html
            self.updates[-2] = (self.L_enc, T.set_subtensor(self.updates[-2][0], self.updates[-2][1]))
            self.updates[-1] = (self.L_dec, T.set_subtensor(self.updates[-1][0], self.updates[-1][1]))
        else:
            self.grad_params = self.params
            self.updates, self.grad_norm, self.param_norm = get_opt_fn(args.optimizer)(self.cost, self.grad_params, lr, max_norm=max_norm)

        self.nparams = np.sum([np.prod(p.shape.eval()) for p in self.params])

        # functions

        self.train = theano.function(
            inputs=[src_sent, src_mask, rev_src_sent, tgt_sent, tgt_mask, space_mask,
                pdrop, lr, max_norm],
            outputs=[self.cost, self.grad_norm, self.param_norm],
            updates = self.updates,
            on_unused_input='warn',
            allow_input_downcast=True
        )
        self.test = theano.function(
            inputs=[src_sent, src_mask, rev_src_sent, tgt_sent, tgt_mask, space_mask, theano.In(pdrop, value=0.0)],
            outputs=self.cost,
            updates=None,
            on_unused_input='warn'
        )
        outputs = self.encoder.out
        if attention:
            outputs = self.encoder.out + [hs]
        self.encode = theano.function(
            inputs=[src_sent, rev_src_sent, src_mask, space_mask, theano.In(pdrop, value=0.0)],
            outputs=outputs,
            on_unused_input='warn',
            updates=None
        )

        # function for decoding step by step

        i_t = T.ivector()
        x_t = self.L_dec[i_t, :]
        h_ps = list()  # previous
        for k in xrange(args.rlayers):
            h_ps.append(T.matrix())
        h_ts = list()
        dmask = T.ones_like(h_ps[0]).astype(floatX)
        if attention and args.rlayers == 1:
            h_t, _ = self.decoder.rlayers[0]._step(x_t, dmask, h_ps[0], hs)
        else:
            h_t = self.decoder.rlayers[0]._step(x_t, dmask, h_ps[0])
        h_ts.append(h_t)
        # NOTE no more dropout nodes here
        for k in xrange(1, args.rlayers):
            if attention and args.rlayers == k + 1:
                h_t, align = self.decoder.rlayers[k]._step(h_t, dmask, h_ps[k], hs)
            else:
                h_t = self.decoder.rlayers[k]._step(h_t, dmask, h_ps[k])
            h_ts.append(h_t)
        E_t = T.dot(h_t, self.decoder.olayer.W) + self.decoder.olayer.b
        E_t = T.exp(E_t - T.max(E_t, axis=1, keepdims=True))
        p_t = E_t / E_t.sum(axis=1, keepdims=True)
        inputs = [i_t] + h_ps
        outputs = [p_t] + h_ts
        if attention:
            inputs = inputs + [hs]
            outputs = outputs + [align]
        self.decode_step = theano.function(
            inputs=inputs,
            outputs=outputs,
            updates=None
        )
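# A hedged greedy-decoding sketch built on the compiled `decode_step` above (not
# part of the original example, non-attention case). It assumes `self.encode(...)`
# returns one (batch, rnn_dim) state per decoder layer and that `bos_id`/`eos_id`
# token ids exist; both are assumptions about the surrounding code.
import numpy as np

def greedy_decode(self, src, src_mask, rev_src, space_mask, bos_id, eos_id, max_len=50):
    hiddens = list(self.encode(src, rev_src, src_mask, space_mask))  # assumed per-layer states
    token = np.array([bos_id], dtype=np.int32)
    decoded = []
    for _ in range(max_len):
        step_out = self.decode_step(token, *hiddens)
        p_t, hiddens = step_out[0], list(step_out[1:])
        next_id = int(np.argmax(p_t[0]))
        if next_id == eos_id:
            break
        decoded.append(next_id)
        token = np.array([next_id], dtype=np.int32)
    return decoded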
Exemplo n.º 53
0
def test_local_gpu_elemwise():
    """
    Test local_gpu_elemwise when there is a dtype upcastable to float32
    """
    a = tensor.bmatrix()
    b = tensor.fmatrix()
    c = tensor.fmatrix()

    a_v = (numpy.random.rand(4, 5) * 10).astype("int8")
    b_v = (numpy.random.rand(4, 5) * 10).astype("float32")
    c_v = (numpy.random.rand(4, 5) * 10).astype("float32")

    # Due to optimization order, this composite is created when all
    # the ops are on the gpu.
    f = theano.function([a, b, c], a + b + c, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert sum(isinstance(node.op, GpuElemwise) for node in topo) == 1
    assert sum(type(node.op) == tensor.Elemwise for node in topo) == 0
    utt.assert_allclose(f(a_v, b_v, c_v), a_v + b_v + c_v)

    # Now test with the composite already on the cpu before we move it
    # to the gpu
    a_s = theano.scalar.int8()
    b_s = theano.scalar.float32()
    c_s = theano.scalar.float32()
    out_s = theano.scalar.Composite([a_s, b_s, c_s], [a_s + b_s + c_s])
    out_op = tensor.Elemwise(out_s)
    f = theano.function([a, b, c], out_op(a, b, c), mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert sum(isinstance(node.op, GpuElemwise) for node in topo) == 1
    assert sum(type(node.op) == tensor.Elemwise for node in topo) == 0
    utt.assert_allclose(f(a_v, b_v, c_v), a_v + b_v + c_v)

    return  # Not yet implemented
    # Test multiple output
    a_s = theano.scalar.float32()
    a = tensor.fmatrix()
    from theano.scalar.basic import identity

    out_s = theano.scalar.Composite([a_s, b_s, c_s], [identity(a_s), identity(c_s), identity(b_s)])
    outs_op = tensor.Elemwise(out_s)
    f = theano.function([a, b, c], outs_op(a, b, c), mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert sum(isinstance(node.op, GpuElemwise) for node in topo) == 1
    assert sum(type(node.op) == tensor.Elemwise for node in topo) == 0
    out = f(a_v, b_v, c_v)
    utt.assert_allclose(out[0], a_v)
    utt.assert_allclose(out[1], c_v)
    utt.assert_allclose(out[2], b_v)

    # Test multiple output
    out_s = theano.scalar.Composite([a_s, b_s, c_s], [a_s + b_s, a_s * b_s])
    outs_op = tensor.Elemwise(out_s)
    f = theano.function([a, b, c], outs_op(a, b, c), mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert sum(isinstance(node.op, GpuElemwise) for node in topo) == 1
    assert sum(type(node.op) == tensor.Elemwise for node in topo) == 0
    out = f(a_v, b_v, c_v)
    utt.assert_allclose(out[0], a_v + b_v)
    utt.assert_allclose(out[1], a_v * c_v)

    # Test non-contiguous input
    c = gpuarray_shared_constructor(numpy.asarray(c_v, dtype="float32"))
    f = theano.function([a, b], outs_op(a[::2], b[::2], c[::2]), mode=mode_with_gpu)
    out = f(a_v, b_v)
    utt.assert_allclose(out[0], a_v[::2] + b_v[::2])
    utt.assert_allclose(out[1], a_v[::2] * c_v[::2])
Exemplo n.º 54
0
    def __init__(self, n_actions, replay_memory, build_network, updates, screen_size, initial_weights_file=None):
        self.screen_width, self.screen_height = screen_size
        self.mood_q = None
        self.last_q = 0
        self.n_parameter_updates = 0
        self.alpha = 0.00025
        # update frequency ?
        # gradient momentum ? 0.95
        # squared gradient momentum ? 0.95
        # min squared gradient ? 0.01
        self.save_every_n_frames = 100000  # ~ once per hour

        self.final_exploration_frame = 1000000
        self.replay_start_size = 50000
        self.i_action = 0

        self.state = None
        self.initial_epsilon = 1
        self.final_epsilon = 0.1
        self.epsilon = self.initial_epsilon
        self.gamma = 0.99
        self.replay_memory = replay_memory

        self.log_frequency = 1

        self.minibatch_size = 32
        # self.replay_memory_size = 1000000

        self.target_network_update_frequency = 10000

        s0_var = T.tensor4("s0", dtype=theano.config.floatX)
        a0_var = T.bmatrix("a0")
        r0_var = T.wcol("r0")
        s1_var = T.tensor4("s1", dtype=theano.config.floatX)
        future_reward_indicator_var = T.bcol("future_reward_indicator")

        self.n_actions = n_actions
        self.a_lookup = np.eye(self.n_actions, dtype=np.int8)

        self.network = build_network(n_actions=self.n_actions, input_var=T.cast(s0_var, 'float32') / np.float32(256),
                                     screen_size=(self.screen_height, self.screen_width))
        print("Compiling forward.")
        self.forward = theano.function([s0_var], lasagne.layers.get_output(self.network, deterministic=True))

        self.network_stale = build_network(n_actions=self.n_actions, input_var=T.cast(s1_var, 'float32') / np.float32(256),
                                           screen_size=(self.screen_height, self.screen_width))
        print("Compiling forward_stale.")
        self.forward_stale = theano.function([s1_var],
                                             lasagne.layers.get_output(self.network_stale, deterministic=True))

        self._update_network_stale()

        out = lasagne.layers.get_output(self.network)
        out_stale = lasagne.layers.get_output(self.network_stale)
        self.loss, self.err, __y, __q = build_loss(out=out,
                                                   out_stale=out_stale,
                                                   a0_var=a0_var,
                                                   r0_var=r0_var,
                                                   future_reward_indicator_var=future_reward_indicator_var,
                                                   gamma=self.gamma)

        params = lasagne.layers.get_all_params(self.network, trainable=True)

        print("Compiling train_fn.")
        self.train_fn = theano.function([s0_var, a0_var, r0_var, s1_var, future_reward_indicator_var],
                                        [self.loss, self.err, T.transpose(__y), T.transpose(__q), out, out_stale],
                                        updates=updates(self.loss, params))
        print("Compiling loss_fn.")
        self.loss_fn = theano.function([s0_var, a0_var, r0_var, s1_var, future_reward_indicator_var],
                                       self.loss)
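# A hypothetical minibatch update for the agent above (a sketch, not part of the
# original example). Dtypes mirror the symbolic variables: a0 is a one-hot int8
# action matrix, r0 an int16 column (wcol) and the future-reward indicator an int8
# column (bcol); the 4-frame screen history and floatX=float32 are assumptions.
import numpy as np

batch, history = 32, 4
s0 = np.zeros((batch, history, agent.screen_height, agent.screen_width), dtype=np.float32)
s1 = np.zeros_like(s0)
a0 = agent.a_lookup[np.random.randint(agent.n_actions, size=batch)]   # (batch, n_actions) int8
r0 = np.ones((batch, 1), dtype=np.int16)
not_terminal = np.ones((batch, 1), dtype=np.int8)
loss, err, y, q, out, out_stale = agent.train_fn(s0, a0, r0, s1, not_terminal)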
def execute(dataset,
            n_hidden_u,
            n_hidden_t_enc,
            n_hidden_t_dec,
            n_hidden_s,
            embedding_source=histo_GenotypicFrequency_perclass,
            additional_unsup_input=None,
            num_epochs=500,
            learning_rate=.001,
            learning_rate_annealing=1.0,
            alpha=1,
            beta=1,
            delta=1,
            gamma=1,
            lmd=.0001,
            disc_nonlinearity="sigmoid",
            encoder_net_init=0.2,
            decoder_net_init=0.2,
            optimizer="rmsprop",
            max_patience=100,
            batchnorm=0,
            input_dropout=1.0,
            embedding_noise=0.0,
            keep_labels=1.0,
            prec_recall_cutoff=True,
            missing_labels_val=-1.0,
            which_fold=0,
            early_stop_criterion='loss_sup_det',
            input_decoder_mode="regression",
            save_path='/Users/Marie-Elyse/Downloads/embedding2',
            save_copy='/Users/Marie-Elyse/Downloads/embedding2',
            dataset_path='/Users/Marie-Elyse/Downloads/embedding2',
            resume=False,
            exp_name='',
            random_proj=0,
            bootstrap_snp_embeddings=0,
            bootstrap_cutoff=0.9):

    # Prepare embedding information:
    # - If no embedding is specified, use the transposed input matrix
    # - If a file is specified, use its content as feature embeddings
    # - Else (an embedding category like 'histo3x26' is provided), load a
    #   pregenerated embedding of the specified category
    if embedding_source is None or embedding_source == "raw":
        embedding_source = None
        embedding_input = 'raw'
    elif os.path.exists(embedding_source):
        embedding_input = embedding_source
    else:
        embedding_input = embedding_source
        embedding_source = os.path.join(
            dataset_path, embedding_input + '_fold' + str(which_fold) + '.npy')

    # Load the dataset
    print("Loading data")
    (x_train, y_train, exmpl_ids_train, x_valid, y_valid, exmpl_ids_valid,
     x_test, y_test, exmpl_ids_test, x_unsup, training_labels, feature_names,
     label_names) = mlh.load_data(dataset,
                                  dataset_path,
                                  embedding_source,
                                  which_fold=which_fold,
                                  keep_labels=keep_labels,
                                  missing_labels_val=missing_labels_val,
                                  embedding_input=embedding_input,
                                  norm=False)

    # Load the additional unsupervised data, if some is specified
    if additional_unsup_input is not None:
        print("Adding additional data to the model's unsupervised inputs")
        paths = additional_unsup_input.split(";")
        additional_unsup_data = [np.load(p) for p in paths]
        print(x_unsup.shape)
        x_unsup = np.hstack(additional_unsup_data + [x_unsup])
        print(x_unsup.shape)

    if x_unsup is not None:
        n_samples_unsup = x_unsup.shape[1]
    else:
        n_samples_unsup = 0

    original_x_train = x_train.copy()
    original_x_valid = x_valid.copy()
    original_x_test = x_test.copy()

    # Change how the missing data values are encoded. Right now they are
    # encoded as being the mean of the corresponding feature so that, after
    # feature normalization, they will be 0s. However, this prevents us from
    # transferring the minibatch data as int8, so we replace those values with -1s.
    for i in range(x_train.shape[1]):
        feature_mean = x_train[:, i].mean()
        x_train[:, i] = mh.replace_arr_value(x_train[:, i], feature_mean, -1)
        x_valid[:, i] = mh.replace_arr_value(x_valid[:, i], feature_mean, -1)
        x_test[:, i] = mh.replace_arr_value(x_test[:, i], feature_mean, -1)
    x_train = x_train.astype("int8")
    x_valid = x_valid.astype("int8")
    x_test = x_test.astype("int8")

    # Normalize the input data. The mlh.load_data() function already offers
    # this feature but we need to do it here so that we will have access to
    # both the normalized and unnormalized input data.
    norm_mus = original_x_train.mean(axis=0)
    norm_sigmas = original_x_train.std(axis=0) + 1e-6

    #x_train = (x_train - norm_mus[None, :]) / norm_sigmas[None, :]
    #x_valid = (x_valid - norm_mus[None, :]) / norm_sigmas[None, :]
    #x_test = (x_test - norm_mus[None, :]) / norm_sigmas[None, :]

    #x_train *= (315345. / 553107)
    #x_valid *= (315345. / 553107)
    #x_test *= (315345. / 553107)

    # Set up variables to build the right type of decoder based on the value of
    # `input_decoder_mode`
    assert input_decoder_mode in ["regression", "classification"]
    if input_decoder_mode == "regression":
        # The size of the input reconstruction will be the same as the number
        # of inputs
        decoder_encoder_unit_ratio = 1
    elif input_decoder_mode == "classification":
        # The size of the input reconstruction will be N times larger than the
        # number of inputs, where N is the number of distinct discrete values
        # that each input can take. For SNP input data with an additive coding
        # scheme, N=3 because the 3 possible values are {0, 1, 2}.
        nb_discrete_vals_by_input = int(original_x_train.max() + 1)
        decoder_encoder_unit_ratio = nb_discrete_vals_by_input

        # Print baseline accuracy for the imputation of genes
        print("Distribution of input values in valid: %f %f %f" %
              ((original_x_train == 0).mean(), (original_x_train == 1).mean(),
               (original_x_train == 2).mean()))
        print("Distribution of input values in test: %f %f %f" %
              ((original_x_test == 0).mean(), (original_x_test == 1).mean(),
               (original_x_test == 2).mean()))

    # Extract required information from data
    n_samples, n_feats = x_train.shape
    print("Number of features : ", n_feats)
    print("Glorot init : ", 2.0 / (n_feats + n_hidden_t_enc[-1]))
    n_targets = y_train.shape[1] if y_train.ndim == 2 else y_train.max() + 1

    # Set some variables
    batch_size = 138
    beta = gamma if (gamma == 0) else beta

    # Generate an name for the experiment based on the hyperparameters used
    if embedding_source is None:
        embedding_name = embedding_input
    else:
        embedding_name = embedding_source.replace("_", "").split(".")[0]
        exp_name += embedding_name.rsplit('/', 1)[::-1][0] + '_'

    exp_name += mlh.define_exp_name(
        keep_labels, alpha, beta, gamma, lmd, n_hidden_u, n_hidden_t_enc,
        n_hidden_t_dec, n_hidden_s, which_fold, learning_rate,
        decoder_net_init, encoder_net_init, batchnorm, input_dropout,
        embedding_noise, early_stop_criterion, learning_rate_annealing,
        input_decoder_mode)
    print("Experiment: " + exp_name)

    # Ensure that the folders where the results of the experiment will be
    # saved do exist. Create them if they don't.
    save_path = os.path.join(save_path, dataset, exp_name)
    save_copy = os.path.join(save_copy, dataset, exp_name)
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    if not os.path.exists(save_copy):
        os.makedirs(save_copy)

    # Prepare Theano variables for inputs and targets
    input_var_sup = T.bmatrix('input_sup')
    input_var_unsup = theano.shared(x_unsup, 'input_unsup')  # x_unsup TBD
    target_var_sup = T.matrix('target_sup')
    lr = theano.shared(np.float32(learning_rate), 'learning_rate')

    # Use the provided mus and sigmas to process the missing values and
    # normalize the inputs
    b_input_var_sup = input_var_sup.astype("float32")
    normed_input_sup = (T.eq(b_input_var_sup, -1) * norm_mus +
                        T.neq(b_input_var_sup, -1) * b_input_var_sup)
    normed_input_sup = (normed_input_sup - norm_mus) / norm_sigmas

    reconst_target_sup = T.cast(input_var_sup, "int32")

    # Build model
    print("Building model")

    # Some sanity checks
    # assert len(n_hidden_u) > 0
    assert len(n_hidden_t_enc) > 0
    assert len(n_hidden_t_dec) > 0
    assert n_hidden_t_dec[-1] == n_hidden_t_enc[-1]

    # Build feature embedding networks (encoding and decoding if gamma > 0)
    nets, embeddings, pred_feat_emb = mh.build_feat_emb_nets(
        embedding_source, n_feats, n_samples_unsup, input_var_unsup,
        n_hidden_u, n_hidden_t_enc, n_hidden_t_dec, gamma, encoder_net_init,
        decoder_net_init, save_path, random_proj, decoder_encoder_unit_ratio,
        embedding_noise)

    # Build feature embedding reconstruction networks (if alpha > 0, beta > 0)
    nets += mh.build_feat_emb_reconst_nets(
        [alpha, beta], n_samples_unsup, n_hidden_u,
        [n_hidden_t_enc, n_hidden_t_dec], nets,
        [encoder_net_init, decoder_net_init])

    # Supervised network
    discrim_net, hidden_rep = mh.build_discrim_net(
        batch_size, n_feats, normed_input_sup, n_hidden_t_enc, n_hidden_s,
        embeddings[0], disc_nonlinearity, n_targets, batchnorm, input_dropout)

    # Reconstruct network
    nets += [
        mh.build_reconst_net(hidden_rep,
                             embeddings[1] if len(embeddings) > 1 else None,
                             n_feats * decoder_encoder_unit_ratio, gamma,
                             decoder_encoder_unit_ratio)
    ]

    # Load weights if we are resuming job
    if resume:
        # Load best model
        with np.load(os.path.join(save_copy, 'dietnet_best.npz')) as f:
            param_values = [f['arr_%d' % i] for i in range(len(f.files))]
        nlayers = len(
            lasagne.layers.get_all_params(filter(None, nets) + [discrim_net]))
        #lasagne.layers.set_all_param_values(filter(None, nets) +
        #                                    [discrim_net],
        #                                    param_values[:nlayers])

        params = lasagne.layers.get_all_params(
            filter(None, nets) + [discrim_net])
        for p, v in zip(params, param_values[:nlayers]):
            # Do not overwrite embedding value with old embedding. Removing
            # the following condition will prevent a trained model from being
            # tested on a different dataset
            if p.name != "feat_emb":
                p.set_value(v)

    print("Building and compiling training functions")

    # Build and compile training functions
    predictions, predictions_det = mh.define_predictions(nets, start=2)
    prediction_sup, prediction_sup_det = mh.define_predictions([discrim_net])
    prediction_sup = prediction_sup[0]
    prediction_sup_det = prediction_sup_det[0]

    # Define losses
    # reconstruction losses
    if input_decoder_mode == "regression":
        reconst_losses, reconst_losses_det = mh.define_reconst_losses(
            predictions, predictions_det,
            [input_var_unsup, input_var_unsup, normed_input_sup])
    elif input_decoder_mode == "classification":
        # Obtain regular reconstruction losses for every reconstruction
        # but the reconstruction of the supervised input data
        reconst_losses1, reconst_losses_det1 = mh.define_reconst_losses(
            predictions[:-1], predictions_det[:-1],
            [input_var_unsup, input_var_unsup])

        # Obtain a "classification" reconstruction loss for the reconstruction
        # of the supervised input data. This classification loss will be
        # performed on the input data without normalization
        reconst_losses2, reconst_losses_det2 = mh.define_classif_reconst_losses(
            predictions[-1:], predictions_det[-1:], [reconst_target_sup],
            [decoder_encoder_unit_ratio])

        reconst_losses = reconst_losses1 + reconst_losses2
        reconst_losses_det = reconst_losses_det1 + reconst_losses_det2

    # supervised loss
    sup_loss, sup_loss_det = mh.define_sup_loss(disc_nonlinearity,
                                                prediction_sup,
                                                prediction_sup_det,
                                                keep_labels, target_var_sup,
                                                missing_labels_val)

    # Define inputs
    inputs = [input_var_sup, target_var_sup]

    # Define parameters
    params = lasagne.layers.get_all_params([discrim_net] + filter(None, nets),
                                           trainable=True,
                                           unwrap_shared=False)
    params_to_freeze = \
        lasagne.layers.get_all_params(filter(None, nets), trainable=False,
                                      unwrap_shared=False)

    # Remove unshared variables from params and params_to_freeze
    params = [
        p for p in params
        if isinstance(p, theano.compile.sharedvalue.SharedVariable)
    ]
    params_to_freeze = [
        p for p in params_to_freeze
        if isinstance(p, theano.compile.sharedvalue.SharedVariable)
    ]
    print("Params : ", params)

    feat_emb_var = next(p for p in lasagne.layers.get_all_params([discrim_net])
                        if p.name == 'input_unsup' or p.name == 'feat_emb')
    # feat_emb_var = lasagne.layers.get_all_params([discrim_net])[0]
    print(feat_emb_var)
    feat_emb_val = feat_emb_var.get_value()
    feat_emb_norms = (feat_emb_val**2).sum(0)**0.5
    feat_emb_var.set_value(feat_emb_val / feat_emb_norms)

    print('Number of params discrim: ' + str(len(params)))
    print('Number of params to freeze: ' + str(len(params_to_freeze)))

    for p in params_to_freeze:
        new_params = [el for el in params if el != p]
        params = new_params

    print('Number of params to update: ' + str(len(params)))

    # Combine losses
    loss = delta*sup_loss + alpha*reconst_losses[0] + beta*reconst_losses[1] + \
        gamma*reconst_losses[2]
    loss_det = delta*sup_loss_det + alpha*reconst_losses_det[0] + \
        beta*reconst_losses_det[1] + gamma*reconst_losses_det[2]

    l2_penalty = apply_penalty(params, l2)
    loss = loss + lmd * l2_penalty
    loss_det = loss_det + lmd * l2_penalty

    # Compute network updates
    assert optimizer in ["rmsprop", "adam", "amsgrad"]
    if optimizer == "rmsprop":
        updates = lasagne.updates.rmsprop(loss, params, learning_rate=lr)
    elif optimizer == "adam":
        updates = lasagne.updates.adam(loss, params, learning_rate=lr)
    elif optimizer == "amsgrad":
        updates = lasagne.updates.amsgrad(loss, params, learning_rate=lr)
    #updates = lasagne.updates.sgd(loss,
    #                              params,
    #                              learning_rate=lr)
    # updates = lasagne.updates.momentum(loss, params,
    #                                    learning_rate=lr, momentum=0.0)

    # Apply norm constraints on the weights
    for k in updates.keys():
        if updates[k].ndim == 2:
            updates[k] = lasagne.updates.norm_constraint(updates[k], 1.0)

    # Compile training function
    train_fn = theano.function(inputs,
                               loss,
                               updates=updates,
                               on_unused_input='ignore')

    # Monitoring Labels
    monitor_labels = [
        "reconst. feat. W_enc", "reconst. feat. W_dec", "reconst. loss"
    ]
    monitor_labels = [
        i for i, j in zip(monitor_labels, reconst_losses) if j != 0
    ]
    monitor_labels += ["feat. W_enc. mean", "feat. W_enc var"]
    monitor_labels += ["feat. W_dec. mean", "feat. W_dec var"] if \
        (embeddings[1] is not None) else []
    monitor_labels += ["loss. sup.", "total loss"]

    # Build and compile test function
    val_outputs = reconst_losses_det
    val_outputs = [i for i, j in zip(val_outputs, reconst_losses) if j != 0]
    val_outputs += [embeddings[0].mean(), embeddings[0].var()]
    val_outputs += [embeddings[1].mean(), embeddings[1].var()] if \
        (embeddings[1] is not None) else []
    val_outputs += [sup_loss_det, loss_det]

    # Compute supervised accuracy and add it to monitoring list
    test_acc, test_pred = mh.define_test_functions(disc_nonlinearity,
                                                   prediction_sup,
                                                   prediction_sup_det,
                                                   target_var_sup)
    monitor_labels.append("accuracy")
    val_outputs.append(test_acc)

    # If appropriate, compute the input reconstruction accuracy and add it to
    # the monitoring list
    if input_decoder_mode == "classification":
        input_reconst_acc = mh.define_classif_reconst_acc(
            predictions_det[-1], reconst_target_sup,
            decoder_encoder_unit_ratio)
        #import pdb; pdb.set_trace()
        monitor_labels.append("input_reconst_acc")
        val_outputs.append(input_reconst_acc)

    # Compile prediction function
    predict = theano.function([input_var_sup], test_pred)
    predict_from_normed_inps = theano.function([normed_input_sup], test_pred)

    predict_scores = theano.function([input_var_sup], prediction_sup_det)
    predict_scores_from_normed_inps = theano.function([normed_input_sup],
                                                      prediction_sup_det)

    # Compile validation function
    val_fn = theano.function(inputs, [prediction_sup_det] + val_outputs,
                             on_unused_input='ignore')

    # Finally, launch the training loop.
    print("Starting training...")

    # Some variables
    patience = 0

    train_monitored = []
    valid_monitored = []
    train_loss = []

    # Pre-training monitoring
    print("Epoch 0 of {}".format(num_epochs))

    train_minibatches = mlh.iterate_minibatches(x_train,
                                                y_train,
                                                batch_size,
                                                shuffle=False)
    train_err = mlh.monitoring(train_minibatches, "train", val_fn,
                               monitor_labels, prec_recall_cutoff)

    valid_minibatches = mlh.iterate_minibatches(x_valid,
                                                y_valid,
                                                batch_size,
                                                shuffle=False)
    valid_err = mlh.monitoring(valid_minibatches, "valid", val_fn,
                               monitor_labels, prec_recall_cutoff)

    # Before starting training, save a copy of the model in case training is
    # interrupted before a best model has been checkpointed
    np.savez(
        os.path.join(save_path, 'dietnet_best.npz'),
        *lasagne.layers.get_all_param_values(
            filter(None, nets) + [discrim_net]))

    # Training loop
    start_training = time.time()
    for epoch in range(num_epochs):
        start_time = time.time()
        print("Epoch {} of {}".format(epoch + 1, num_epochs))
        nb_minibatches = 0
        loss_epoch = 0

        # Train pass
        for batch in mlh.iterate_minibatches(x_train,
                                             training_labels,
                                             batch_size,
                                             shuffle=True):
            loss_epoch += train_fn(*batch)
            nb_minibatches += 1

        loss_epoch /= nb_minibatches
        train_loss += [loss_epoch]

        # Monitoring on the training set
        train_minibatches = mlh.iterate_minibatches(x_train,
                                                    y_train,
                                                    batch_size,
                                                    shuffle=False)
        train_err = mlh.monitoring(train_minibatches, "train", val_fn,
                                   monitor_labels, prec_recall_cutoff)
        train_monitored += [train_err]

        # Monitoring on the validation set
        valid_minibatches = mlh.iterate_minibatches(x_valid,
                                                    y_valid,
                                                    batch_size,
                                                    shuffle=False)

        valid_err = mlh.monitoring(valid_minibatches, "valid", val_fn,
                                   monitor_labels, prec_recall_cutoff)
        valid_monitored += [valid_err]

        try:
            early_stop_val = valid_err[monitor_labels.index(
                early_stop_criterion)]
        except ValueError:
            raise ValueError("There is no monitored value by the name of %s" %
                             early_stop_criterion)

        valid_loss_sup_hist = [
            v[monitor_labels.index("loss. sup.")] for v in valid_monitored
        ]
        valid_loss_sup = valid_loss_sup_hist[-1]

        # Early stopping: keep the best model whenever the monitored criterion
        # improves (higher is better for 'accuracy' and 'input_reconst_acc',
        # lower is better for 'loss. sup.'); ties on accuracy are broken by
        # the supervised validation loss.
        if epoch == 0:
            best_valid = early_stop_val
        elif ((early_stop_val > best_valid
               and early_stop_criterion == 'input_reconst_acc')
              or (early_stop_val > best_valid
                  and early_stop_criterion == 'accuracy')
              or (early_stop_val >= best_valid
                  and early_stop_criterion == 'accuracy'
                  and valid_loss_sup == min(valid_loss_sup_hist))
              or (early_stop_val < best_valid
                  and early_stop_criterion == 'loss. sup.')):
            best_valid = early_stop_val
            patience = 0

            # Save stuff
            np.savez(
                os.path.join(save_path, 'dietnet_best.npz'),
                *lasagne.layers.get_all_param_values(
                    filter(None, nets) + [discrim_net]))
            np.savez(save_path + "/errors_supervised_best.npz",
                     zip(*train_monitored), zip(*valid_monitored))

            # Monitor on the test set now because sometimes the saving doesn't
            # go well and there isn't a model to load at the end of training
            if y_test is not None:
                test_minibatches = mlh.iterate_minibatches(x_test,
                                                           y_test,
                                                           138,
                                                           shuffle=False)

                test_err = mlh.monitoring(test_minibatches, "test", val_fn,
                                          monitor_labels, prec_recall_cutoff)
        else:
            patience += 1
            # Save stuff
            np.savez(
                os.path.join(save_path, 'dietnet_last.npz'),
                *lasagne.layers.get_all_param_values(
                    filter(None, nets) + [discrim_net]))
            np.savez(save_path + "/errors_supervised_last.npz",
                     zip(*train_monitored), zip(*valid_monitored))

        print("  epoch time:\t\t\t{:.3f}s \n".format(time.time() - start_time))

        # End training if needed
        if patience == max_patience or epoch == num_epochs - 1:
            break

        # Anneal the learning rate
        lr.set_value(
            np.array(lr.get_value() * learning_rate_annealing,
                     dtype="float32"))

    # End training with a final monitoring step on the best model
    print("Ending training")

    # Load best model
    with np.load(os.path.join(save_path, 'dietnet_best.npz')) as f:
        param_values = [f['arr_%d' % i] for i in range(len(f.files))]
        nlayers = len(
            lasagne.layers.get_all_params(filter(None, nets) + [discrim_net]))

        #lasagne.layers.set_all_param_values(filter(None, nets) +
        #                                    [discrim_net],
        #                                    param_values[:nlayers])
        params = lasagne.layers.get_all_params(
            filter(None, nets) + [discrim_net])
        for p, v in zip(params, param_values[:nlayers]):
            # Do not overwrite embedding value with old embedding. Removing
            # the following condition will prevent a trained model from being
            # tested on a different dataset
            if p.name != "feat_emb":
                p.set_value(v)

        if embedding_source is None:
            # Save embedding
            pred = pred_feat_emb()
            np.savez(os.path.join(save_path, 'feature_embedding.npz'), pred)

        # Training set results
        train_minibatches = mlh.iterate_minibatches(x_train,
                                                    y_train,
                                                    batch_size,
                                                    shuffle=False)
        train_err = mlh.monitoring(train_minibatches, "train", val_fn,
                                   monitor_labels, prec_recall_cutoff)

        # Validation set results
        valid_minibatches = mlh.iterate_minibatches(x_valid,
                                                    y_valid,
                                                    batch_size,
                                                    shuffle=False)
        valid_err = mlh.monitoring(valid_minibatches, "valid", val_fn,
                                   monitor_labels, prec_recall_cutoff)

        # Test set results
        if y_test is not None:
            test_minibatches = mlh.iterate_minibatches(x_test,
                                                       y_test,
                                                       138,
                                                       shuffle=False)

            test_err = mlh.monitoring(test_minibatches, "test", val_fn,
                                      monitor_labels, prec_recall_cutoff)

            # Test the model's accuracy with varying levels of provided SNPs
            test_minibatches = mlh.iterate_minibatches(x_test,
                                                       y_test,
                                                       138,
                                                       shuffle=False)
            mlh.eval_prediction(test_minibatches,
                                "test (rescaled)",
                                predict_from_normed_inps,
                                norm_mus,
                                norm_sigmas,
                                nb_evals=1,
                                rescale_inputs=True)

        # Save the model's test predictions to file
        print(x_test.shape)
        test_predictions = []
        for minibatch in mlh.iterate_testbatches(x_test, 1, shuffle=False):
            test_predictions += [predict(minibatch)]
        print(len(test_predictions))
        print(sum([t.shape[0] for t in test_predictions]))
        np.savez(os.path.join(save_path, 'test_predictions.npz'),
                 test_predictions)

        # Get the scores assigned by the model to each class for each test sample
        test_scores = []
        for minibatch in mlh.iterate_testbatches(x_test, 1, shuffle=False):
            test_scores += [predict_scores(minibatch)]
        np.savez(os.path.join(save_path, 'test_scores.npz'), test_scores)

        # Generate new SNP embeddings using test examples labeled according
        # to the model's predictions
        if bootstrap_snp_embeddings:

            if bootstrap_cutoff == "soft":
                bootstrap_snp_data = np.hstack(
                    (x_train.transpose(), x_valid.transpose(),
                     x_test.transpose()))
                bootstrap_labels = np.vstack(
                    (y_train, y_valid, np.array(test_scores)[:, 0, :]))

                filename_genotypic = 'bootstrap_gen_snp_embeddings_softlabels.npy'
                filename_allelic = 'bootstrap_all_snp_embeddings_softlabels.npy'

            else:  # Hard cutoff
                sure_test_idxs = np.argwhere(
                    (np.array(test_scores)[:, 0, :] >
                     bootstrap_cutoff).sum(1)).flatten()
                sure_test_inputs = x_test[sure_test_idxs]
                sure_test_preds = np.array(test_scores)[sure_test_idxs,
                                                        0].argmax(1)

                bootstrap_snp_data = np.hstack(
                    (x_train.transpose(), x_valid.transpose(),
                     sure_test_inputs.transpose()))
                bootstrap_labels = np.hstack(
                    (y_train.argmax(1), y_valid.argmax(1), sure_test_preds))

                filename_genotypic = 'bootstrap_gen_snp_embeddings_cutoff%f.npy' % bootstrap_cutoff
                filename_allelic = 'bootstrap_all_snp_embeddings_cutoff%f.npy' % bootstrap_cutoff

            utils_helpers.generate_snp_hist(
                bootstrap_snp_data,
                bootstrap_labels,
                label_names=label_names,
                perclass=True,
                sum_to_one=True,
                filename_genotypic=os.path.join(save_path, filename_genotypic),
                filename_allelic=os.path.join(save_path, filename_allelic))

    # Print all final errors for train, validation and test
    print("Training time:\t\t\t{:.3f}s".format(time.time() - start_training))

    # Analyse the model gradients to determine the influence of each SNP on
    # each of the model's predictions
    print(label_names)
    class_idx = T.iscalar("class index")
    grad_fn = theano.function([input_var_sup, class_idx],
                              T.grad(prediction_sup_det[:, class_idx].mean(),
                                     input_var_sup).mean(0))
    grads_wrt_inputs = mlh.get_grads_wrt_inputs(x_test, grad_fn, feature_names,
                                                label_names)

    # Obtain a function that takes normed inputs and returns the gradient of a
    # class score wrt those normed inputs themselves (this is required because
    # computing the integrated gradients involves interpolating between an
    # example where all features are missing and an example where any number
    # of features are provided)
    grad_from_normed_fn = theano.function(
        [normed_input_sup, class_idx],
        T.grad(prediction_sup_det[:, class_idx].sum(),
               normed_input_sup).mean(0))

    # Collect integrated gradients over the whole test set. Obtain, for each
    # SNP and for each possible value (0, 1 or 2), the average contribution of
    # that value of that SNP to the score of each class.
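    # Integrated gradients approximate, for each feature i,
    #   (x_i - x'_i) * integral_0^1 dF(x' + a * (x - x')) / dx_i da
    # with a Riemann sum over m interpolation steps between a baseline x'
    # (all features missing) and the actual example x.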
    avg_int_grads = np.zeros((x_test.shape[1], 3, len(label_names)),
                             dtype="float32")
    counts_int_grads = np.zeros((x_test.shape[1], 3), dtype="int32")
    for test_idx in range(x_test.shape[0]):
        int_grads = mlh.get_integrated_gradients(x_test[test_idx],
                                                 grad_from_normed_fn,
                                                 feature_names,
                                                 label_names,
                                                 norm_mus,
                                                 norm_sigmas,
                                                 m=100)

        snp_value_mask = np.arange(3) == x_test[test_idx][:, None]
        avg_int_grads += (snp_value_mask[:, :, None] *
                          int_grads.transpose()[:, None, :])
        counts_int_grads += snp_value_mask
    # Avoid dividing by zero for SNP values never observed in the test set
    # (the corresponding avg_int_grads entries are zero anyway)
    avg_int_grads = avg_int_grads / np.maximum(counts_int_grads[:, :, None], 1)

    # Save all the additional information required for model analysis:
    # - Test predictions
    # - SNP IDs
    # - Subject IDs
    # - Normalization parameters for the input minibatches
    np.savez(os.path.join(save_path, 'additional_data.npz'),
             test_labels=y_test,
             test_scores=np.array(test_scores)[:, 0],
             test_predictions=np.array(test_predictions)[:, 0],
             norm_mus=norm_mus,
             norm_sigmas=norm_sigmas,
             grads_wrt_inputs=grads_wrt_inputs,
             exmpl_ids_train=exmpl_ids_train,
             exmpl_ids_valid=exmpl_ids_valid,
             exmpl_ids_test=exmpl_ids_test,
             feature_names=feature_names,
             label_names=label_names,
             avg_int_grads=avg_int_grads)

    # Copy files to loadpath (only if some training has been done so there
    # is a local saved version)
    if save_path != save_copy and num_epochs > 0:
        print('Copying model and other training files to {}'.format(save_copy))
        copy_tree(save_path, save_copy)
Exemplo n.º 56
    def model_setup(self, mfile=None, num_units1=128, num_units2=128,
                          lrate=2e-3, drate=0.95, eps=1e-8, bptt_maxdepth=50,
                          l1=0, l2=0, char_dim=None):
        """initialization of the 2-layer LSTM model for learning or for
           the generation of sequences"""

        # the default parameters are identical to Andrej Karpathy's
        # (see https://github.com/karpathy/char-rnn)

        # 2-layer LSTM parameters
        self.p = {'U1': None, 'W1': None, 'b1': None,
                  'U2': None, 'W2': None, 'b2': None,
                  'V': None, 'c': None}
        # learning parameters
        self.lp = {'lrate': lrate, # learning rate
                   'drate': drate, # decay rate for rmsprop
                   'eps': eps, # epsilon parameter for rmsprop
                   'bptt_maxdepth': bptt_maxdepth, # backpropagation cutoff
                   'l1': l1, # L1 regularization parameter
                   'l2': l2 # L2 regularization parameter
                  }

        if mfile is not None: # loading parameters from an npz file
            np_init = self.load_params(mfile)
            num_units1 = np_init['b1'].shape[1]
            num_units2 = np_init['b2'].shape[1]
        else:
            if char_dim is None:
                if self.uchar:
                    char_dim = len(self.uchar)
                else:
                    raise Exception('prepare_input() should be run before ' +
                                    'model_setup() unless mfile is provided')

            # initialize small random weights
            r_char_dim = np.sqrt(1./(char_dim))
            r_units1 = np.sqrt(1./(num_units1))
            r_units2 = np.sqrt(1./(num_units2))

            def uniform(rng, shape):
                return np.random.uniform(-rng, rng,
                                         shape).astype(theano.config.floatX)

            def randn(rng, shape):
                # small Gaussian weights scaled by rng
                return (rng * np.random.randn(*shape)).astype(
                    theano.config.floatX)

            def bias_hack(num_units):
                b = np.zeros((4, num_units))
                b[0] = 1. # forget gate hack
                          # helps the network remember information
                return b.astype(theano.config.floatX)

            def zeros(shape):
                return np.zeros(shape).astype(theano.config.floatX)

            def ones(shape):
                return np.ones(shape).astype(theano.config.floatX)

            # parameters for the gates
            # [0]: forget
            # [1]: input
            # [2]: output
            # [3]: cell state update
            np_init = {}

            # first layer
            np_init['U1'] = uniform(r_char_dim, (4, num_units1, char_dim))
            np_init['W1'] = uniform(r_units1, (4, num_units1, num_units1))
            np_init['b1'] = bias_hack(num_units1)

            # second layer
            np_init['U2'] = uniform(r_units1, (4, num_units2, num_units1))
            np_init['W2'] = uniform(r_units2, (4, num_units2, num_units2))
            np_init['b2'] = bias_hack(num_units2)

            # parameters for the last layer (cell output -> network output)
            np_init['V'] = uniform(r_units2, (char_dim, num_units2))
            np_init['c'] = zeros(char_dim)

            # dynamical learning rate (in case the user wants to modify it
            # during the learning process)
            if theano.config.floatX == 'float32':
                dyn_lrate_init = np.float32(self.lp['lrate'])
            else:
                dyn_lrate_init = np.float64(self.lp['lrate'])
            self.dyn_lrate = theano.shared(dyn_lrate_init, name='dyn_lrate')

            # parameters for rmsprop (running average of gradients)
            msq_g = {}
            for param in self.p:
                msq_g[param] = theano.shared(zeros(np_init[param].shape),
                                             name='msq_g'+param)

        for param in self.p:
            self.p[param] = theano.shared(np_init[param], name=param)

        if self.batch_size > 1:
            x = T.imatrix('x')
            y = T.btensor3('y')
        else:
            x = T.ivector('x')
            y = T.bmatrix('y')

        def forward_prop(x, ht1m1, Ct1m1, ht2m1, Ct2m1,
                         U1, W1, b1, U2, W2, b2, V, c):
            # defines each time step of the RNN model
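            # Each layer applies the standard LSTM cell update:
            #   C_t = f_t * C_{t-1} + i_t * Ctilde_t
            #   h_t = o_t * tanh(C_t)
            # where f, i, o are the sigmoid gates (indices 0, 1, 2) and
            # Ctilde_t is the tanh candidate cell state (index 3).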

            if self.batch_size > 1: # transform into column vectors
                col_b1 = b1.dimshuffle((0,1,'x'))
                col_b2 = b2.dimshuffle((0,1,'x'))
                col_c = c.dimshuffle((0,'x'))
            else:
                col_b1 = b1
                col_b2 = b2
                col_c = c

            # layer 1
            gates1 = []
            for i in xrange(3): # forget, input and output gates
                gates1.append(T.nnet.sigmoid(U1[i][:,x] +
                                             W1[i].dot(ht1m1) +
                                             col_b1[i]))
            tentative_Ct1 = T.tanh(U1[3][:,x] + W1[3].dot(ht1m1) + col_b1[3])

            Ct1 = Ct1m1 * gates1[0] + tentative_Ct1 * gates1[1]
            ht1 = gates1[2] * T.tanh(Ct1)

            # layer 2
            gates2 = []
            for i in xrange(3): # forget, input and output gates
                gates2.append(T.nnet.sigmoid(U2[i].dot(ht1) +
                                             W2[i].dot(ht2m1) +
                                             col_b2[i]))
            tentative_Ct2 = T.tanh(U2[3].dot(ht1) + W2[3].dot(ht2m1) +
                                   col_b2[3])

            Ct2 = Ct2m1 * gates2[0] + tentative_Ct2 * gates2[1]
            ht2 = gates2[2] * T.tanh(Ct2)

            # final layer
            o = T.nnet.softmax((V.dot(ht2) + col_c).T)

            return [o, ht1, Ct1, ht2, Ct2]

        if self.batch_size > 1:
            ht1_Ct1_size = (num_units1, self.batch_size)
            ht2_Ct2_size = (num_units2, self.batch_size)
        else:
            ht1_Ct1_size = num_units1
            ht2_Ct2_size = num_units2

        [o, ht1, Ct1, ht2, Ct2], updates = theano.scan(
            fn=forward_prop,
            sequences=x,
            outputs_info=[None,
                          T.zeros(ht1_Ct1_size),
                          T.zeros(ht1_Ct1_size),
                          T.zeros(ht2_Ct2_size),
                          T.zeros(ht2_Ct2_size)],
            non_sequences=[self.p['U1'], self.p['W1'], self.p['b1'],
                           self.p['U2'], self.p['W2'], self.p['b2'],
                           self.p['V'], self.p['c']],
            truncate_gradient=self.lp['bptt_maxdepth'],
            strict=True)
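        # scan applies forward_prop to every time step of x, threading the
        # hidden and cell states of both layers; truncate_gradient restricts
        # backpropagation through time to the last bptt_maxdepth steps.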

        # o is a (seq_len, batch_size, char_dim) tensor---even if batch_size=1
        prediction = T.argmax(o, axis=2)

        self.theano_predict = theano.function(
            inputs=[x],
            outputs=[o, prediction],
        )

        if mfile is not None:
            # parameters were loaded for generation only, so the training
            # functions below are not needed
            return

        # compute the cross-entropy loss
        xent = (-y*T.log(o)).sum(axis=2) # (string_len, batch_size) matrix
        cost = T.mean(xent)

        # regularization using L1 and/or L2 norms
        reg_cost = cost

        # casting to theano.config.floatX avoids an implicit upcast to
        # float64 below
        tot_shape = (xent.shape[0] * xent.shape[1]).astype(theano.config.floatX)

        for param in self.p:
            if l1 > 0: # L1 regularization
                reg_cost += l1 * T.sum(abs(self.p[param])) / tot_shape
            if l2 > 0: # L2 regularization
                reg_cost += l2 * T.sum(self.p[param] ** 2) / tot_shape

        g = {}
        for param in self.p:
            g[param] = T.grad(reg_cost, self.p[param])

        # for rmsprop
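        # RMSprop keeps a running average of squared gradients,
        #   msq_g <- drate * msq_g + (1 - drate) * g**2,
        # and scales each parameter step by lrate / (sqrt(msq_g) + eps).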
        new_msq_g = {}
        updates = {}
        rmsprop_updates = []
        sgd_updates = []
        ratios = {}
        for param in self.p:
            new_msq_g[param] = (self.lp['drate'] * msq_g[param] +
                               (1. - self.lp['drate']) * g[param]**2)

            updates[param] = (self.dyn_lrate * g[param] /
                             (T.sqrt(new_msq_g[param]) + self.lp['eps']))

            # update to parameter scale ratio
            ratios[param] = (T.flatten(updates[param]).norm(2) /
                             T.flatten(self.p[param]).norm(2))

            sgd_updates.append((self.p[param],
                                self.p[param] - self.dyn_lrate * g[param]))

            rmsprop_updates.append((self.p[param],
                                    self.p[param] - updates[param]))
            rmsprop_updates.append((msq_g[param], new_msq_g[param]))

            # todo: add possibility to clip gradients to some value

        f_out = [cost, prediction]

        # compute cost and prediction but do not update the weights
        self.theano_check = theano.function(
            inputs=[x, y],
            outputs=f_out,
        )

        f_out.extend([ratios['U1'], ratios['W1'], ratios['b1'],
                      ratios['U2'], ratios['W2'], ratios['b2'],
                      ratios['V'], ratios['c']])
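        # The update-to-parameter norm ratios are a common sanity check for
        # the learning rate; values around 1e-3 are usually considered
        # healthy.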

        # mini-batch training with rmsprop
        self.theano_train_rmsprop = theano.function(
            inputs=[x, y],
            outputs=f_out,
            updates=rmsprop_updates
        )

        # mini-batch training with stochastic gradient descent
        self.theano_train_sgd = theano.function(
            inputs=[x, y],
            outputs=f_out,
            updates=sgd_updates
        )