Example No. 1
def train(batch_size, lr, epochs, period):
    assert period >= batch_size and period % batch_size == 0
    params, sqrs, vs = adam_init_params()
    w = params[0]
    b = params[1]
    total_loss = [np.mean(square_loss(net(X, w, b), y).asnumpy())]
    t = 0
    for epoch in range(1, epochs + 1):
        if epoch > 2:
            lr *= 0.1
        for batch_i, data, label in data_iter(batch_size):
            with autograd.record():
                output = net(data, w, b)
                loss = square_loss(output, label)
            loss.backward()
            t += 1
            optimizer.adam(params, sqrs, vs, batch_size, lr, t)
            if batch_i * batch_size % period == 0:
                total_loss.append(np.mean(square_loss(net(X, w, b), y).asnumpy()))
        print("Batch size %d, Learning rate %f, Epoch %d, loss %.4e" % (
            batch_size, lr, epoch, total_loss[-1]
        ))
    print("w:", np.reshape(w.asnumpy(), (1, -1)),
          "b:", b.asnumpy()[0])
    print("Total loss length:", len(total_loss))
    x_axis = np.linspace(0, epochs, len(total_loss), endpoint=True)
    plt.semilogy(x_axis, total_loss)
    plt.xlabel("Epochs")
    plt.ylabel("Loss")
    plt.show()
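
The helpers `adam_init_params` and `optimizer.adam(params, sqrs, vs, batch_size, lr, t)` are not shown in this example, so their exact behaviour is an assumption here. As a rough, plain-NumPy sketch of the bias-corrected Adam step such a helper would perform (the real version presumably also reads each parameter's gradient and scales it by `batch_size`):

import numpy as np

def adam_step(params, sqrs, vs, grads, lr, t, beta1=0.9, beta2=0.999, eps=1e-8):
    # One in-place Adam update per parameter: vs holds the first moment,
    # sqrs the second moment, and t is the global step used for bias correction.
    for param, sqr, v, g in zip(params, sqrs, vs, grads):
        v[:] = beta1 * v + (1 - beta1) * g
        sqr[:] = beta2 * sqr + (1 - beta2) * g * g
        v_hat = v / (1 - beta1 ** t)
        sqr_hat = sqr / (1 - beta2 ** t)
        param[:] -= lr * v_hat / (np.sqrt(sqr_hat) + eps)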
Example No. 2
    def __init__(self, x, y, l, window, opt, lr, init_emb, dim_emb, dim_hidden, n_vocab, L2_reg, unit,
                 sim='cos', n_layers=1, activation=tanh):
        self.tr_inputs = [x, y, l]
        self.pr_inputs = [x, y, l]

        self.x = x  # 1D: batch_size * l * 2, 2D: window; elem=word_id
        self.y = y  # 1D: batch_size; elem=label
        self.l = l  # scalar: elem=sentence length

        batch_size = y.shape[0]
        n_cands = x.shape[0] / batch_size / l

        self.pad = build_shared_zeros((1, dim_emb))
        if init_emb is None:
            self.emb = theano.shared(sample_weights(n_vocab - 1, dim_emb))
        else:
            self.emb = theano.shared(init_emb)
        self.E = T.concatenate([self.pad, self.emb], 0)
        self.W_out = theano.shared(sample_weights(dim_hidden, dim_hidden))
        self.params = [self.emb, self.W_out]

        """ Input Layer """
        e = self.E[x]  # e: 1D: batch_size * l * 2, 2D: window, 3D: dim_emb
        x_in = e.reshape((batch_size * n_cands, l, -1))

        """ Intermediate Layer """
        # h: 1D: n_batch * n_cands, 2D: dim_emb
        h, params = cnn.layers(x_in, window, dim_emb, dim_hidden, n_layers, activation)
        self.params.extend(params)

        """ Output Layer """
        h = h.reshape((batch_size, n_cands, -1))
        h_1 = h[T.arange(batch_size), 0]
        h_2 = h[T.arange(batch_size), 1:]
        if sim == 'cos':
            y_score = cosign_similarity(h_1, h_2)
        else:
            y_score = T.batched_dot(T.dot(h_1, self.W_out), h_2.dimshuffle(0, 2, 1))
        y_score_hat = T.max(y_score, 1)

        """ Objective Function """
        self.nll = max_margin_loss(y_score_hat, y_score[T.arange(batch_size), y])
        self.L2_sqr = regularization(self.params)
        self.cost = self.nll + L2_reg * self.L2_sqr / 2.

        """ Optimization """
        if opt == 'adagrad':
            self.update = ada_grad(cost=self.cost, params=self.params, lr=lr)
        elif opt == 'ada_delta':
            self.update = ada_delta(cost=self.cost, params=self.params)
        elif opt == 'adam':
            self.update = adam(cost=self.cost, params=self.params, lr=lr)
        else:
            self.update = sgd(cost=self.cost, params=self.params, lr=lr)

        """ Predicts """
        y_hat = T.argmax(y_score, 1)

        """ Check Accuracies """
        self.correct = T.eq(y_hat, y)
Example No. 3
def run_bm(bm_fname, optz_cfg=None, verbose=True, misc=None):
    # load bm_fname
    bm = importlib.import_module(bm_fname.rsplit('.', 1)[0])
    e         = bm.e; decorate_stind(e)
    thts_init = bm.thts_init
    compare   = bm.compare
    if optz_cfg is None: optz_cfg = bm.optz_cfg

    # optz_detail
    optz_detail = get_optz_detail(bm_fname, optz_cfg)
    print('\n===== OPTZ: %s =====' % optz_detail)

    # run experiments
    for alg_str in compare:
        print('[%s] ' % alg_str, end='')
        alg = importlib.import_module(compare[alg_str].rsplit('.',1)[0])
        alg.init(e)
        misc_arg = {'misc':misc} if alg_str == 'ours2' else {}

        # run adam
        grad_func = lambda thts, e=e: alg.elbo_grad(e, thts, **misc_arg)
        thts_res = optimizer.adam(grad_func, thts_init,
                                  iter_n   = optz_cfg['iter_n'],
                                  lr       = optz_cfg['lr'],
                                  sample_n_grad = optz_cfg['sample_n_grad'],
                                  sample_n_var  = optz_cfg['sample_n_var'],
                                  verbose  = verbose,
                                  **misc_arg)

        # save res to file
        save_res(optz_detail, alg_str, thts_res)
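
Here `optimizer.adam` is used in a functional style: it takes a stochastic gradient function and an initial parameter vector and iterates for `iter_n` steps. Ignoring the benchmark-specific keywords (`sample_n_grad`, `sample_n_var`, `misc`), a minimal NumPy sketch of such a loop might look as follows; the gradient-ascent sign (ELBO maximisation) and the argument handling are assumptions, not the module's actual code:

import numpy as np

def adam(grad_func, thts_init, iter_n, lr, beta1=0.9, beta2=0.999, eps=1e-8, verbose=False):
    # Functional Adam loop: grad_func(thts) returns a stochastic gradient of the
    # objective at thts. We ascend the objective here; flip the sign to minimise.
    thts = np.array(thts_init, dtype=float)
    m = np.zeros_like(thts)
    v = np.zeros_like(thts)
    for t in range(1, iter_n + 1):
        g = np.asarray(grad_func(thts))
        m = beta1 * m + (1 - beta1) * g
        v = beta2 * v + (1 - beta2) * g * g
        m_hat = m / (1 - beta1 ** t)
        v_hat = v / (1 - beta2 ** t)
        thts = thts + lr * m_hat / (np.sqrt(v_hat) + eps)
        if verbose and t % 100 == 0:
            print('iter %d' % t)
    return thts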
Example No. 4
def ScbowTrain(file):
	# Build the corpus and vocabulary mappings, then (context, target) pairs.
	corpus, word_to_id, id_to_word = preprocess(file)

	contexts, target = create_context_target(corpus, window_size=1)
	vocab_size = len(word_to_id)

	# One-hot encode targets and contexts for the simple CBOW model.
	target = one_hot_v(target, vocab_size)
	contexts = one_hot_v(contexts, vocab_size)

	model = scbow(vocab_size, hidden_size)
	optimizer = adam()
	train = Trainer(model, optimizer)

	train.fit(contexts, target, max_epoch, batch_size)
	train.plot()

	# Inspect the learned word vectors.
	word_vecs = model.word_vecs
	for word_id, word in id_to_word.items():
		print(word, word_vecs[word_id])

	# Co-occurrence-based most-similar lookup for comparison.
	C = co_mat(corpus, vocab_size, window_size=1)
	ms('彼女', word_to_id, id_to_word, C, top=10)
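
`adam()` is constructed with no arguments and handed to `Trainer`, so all hyperparameters live inside the optimizer object. A minimal sketch of such a class, assuming the common `update(params, grads)` convention used by simple Trainer loops (the class actually used by this repository may differ):

import numpy as np

class adam:
    # Minimal stateful Adam with the update(params, grads) interface a Trainer
    # typically expects: params and grads are parallel lists of NumPy arrays,
    # and the parameters are updated in place.
    def __init__(self, lr=0.001, beta1=0.9, beta2=0.999, eps=1e-7):
        self.lr, self.beta1, self.beta2, self.eps = lr, beta1, beta2, eps
        self.t = 0
        self.m, self.v = None, None

    def update(self, params, grads):
        if self.m is None:
            self.m = [np.zeros_like(p) for p in params]
            self.v = [np.zeros_like(p) for p in params]
        self.t += 1
        # bias-corrected step size, as in the original Adam paper
        lr_t = self.lr * np.sqrt(1 - self.beta2 ** self.t) / (1 - self.beta1 ** self.t)
        for p, g, m, v in zip(params, grads, self.m, self.v):
            m += (1 - self.beta1) * (g - m)
            v += (1 - self.beta2) * (g * g - v)
            p -= lr_t * m / (np.sqrt(v) + self.eps)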
Example No. 5
    def build_model(self, lr=0.001, dropout=None):

        trng = RandomStreams(1234)
        use_noise = theano.shared(np.float32(0.))

        # description string: #words x #samples

        x = T.matrix('x', dtype='int32')  # step * samples
        x_mask = T.matrix('x_mask', dtype='float32')  # step * samples
        y = T.matrix('y', dtype='int32')  # sample * emb
        ctx = T.tensor3('ctx', dtype='float32')  # sample * annotation * dim

        n_timesteps = x.shape[0]
        n_samples = x.shape[1]

        emb = self.W_emb[x.flatten()]

        emb = emb.reshape([n_timesteps, n_samples, self.dim_word])

        ctx0 = ctx
        ctx_mean = ctx0.mean(1)

        init_state = T.dot(ctx_mean, self.W_hidden_init) + self.b_hidden_init
        init_memory = T.dot(ctx_mean, self.W_memory_init) + self.b_memory_init

        # proj: list of LSTM hidden states
        proj = self.lstm_layer(emb,
                               mask=x_mask,
                               context=ctx,
                               init_state=init_state,
                               init_memory=init_memory)
        proj_h = proj[0]

        # average of the hidden states
        proj_h = (proj_h * x_mask[:, :, None]).sum(axis=0)
        proj_h = proj_h / x_mask.sum(axis=0)[:, None]  # sample * dim

        # last hidden state
        #proj_h = proj_h[-1]  # sample * dim

        if dropout is not None:
            proj_h = dropout_layer(proj_h, use_noise, trng, dropout)

        output = T.dot(proj_h, self.W_pred) + self.b_pred

        probs = T.nnet.softmax(output)
        prediction = probs.argmax(axis=1)

        ## avoid NaN
        epsilon = 1.0e-9
        probs = T.clip(probs, epsilon, 1.0 - epsilon)
        probs /= probs.sum(axis=-1, keepdims=True)
        ## avoid NaN

        cost = T.nnet.categorical_crossentropy(probs, y)
        cost = T.mean(cost)

        updates = optimizer.adam(cost=cost, params=self.params, lr=lr)

        return trng, use_noise, x, x_mask, ctx, y, cost, updates, prediction
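
Several of these `build_model` examples call `optimizer.adam(cost=cost, params=self.params, lr=lr)` and pass the result straight to `theano.function(..., updates=updates)`. As an illustration only (not the repository's actual optimizer module), a minimal `adam(cost, params, lr)` that returns Theano update pairs could look like this:

import numpy as np
import theano
import theano.tensor as T

def adam(cost, params, lr=0.001, b1=0.9, b2=0.999, eps=1e-8):
    # Sketch of an adam(cost, params, lr) helper that returns a list of
    # (shared_variable, new_value) update pairs for theano.function.
    grads = T.grad(cost, params)
    t_prev = theano.shared(np.float32(0.))
    t = t_prev + 1
    a_t = lr * T.sqrt(1 - b2 ** t) / (1 - b1 ** t)  # bias-corrected step size
    updates = [(t_prev, t)]
    for p, g in zip(params, grads):
        m = theano.shared(np.zeros(p.get_value().shape, dtype=p.dtype))
        v = theano.shared(np.zeros(p.get_value().shape, dtype=p.dtype))
        m_t = b1 * m + (1 - b1) * g
        v_t = b2 * v + (1 - b2) * g ** 2
        p_t = p - a_t * m_t / (T.sqrt(v_t) + eps)
        updates += [(m, m_t), (v, v_t), (p, p_t)]
    return updates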
Example No. 6
    def update(self, lr=0.00001, weight_decay=0.0004):
        '''
        # mini-batch SGD
        self.weights *= (1 - weight_decay)
        self.bias *= (1 - weight_decay)
        self.weights -= lr * self.d_weights
        self.bias -= lr * self.d_bias
        '''
        # adam optimizer (weight decay applied before the update)
        self.weights, self.config_w = adam(self.weights * (1 - weight_decay),
                                           self.d_weights,
                                           config=self.config_w)
        self.bias, self.config_b = adam(self.bias * (1 - weight_decay),
                                        self.d_bias,
                                        config=self.config_b)
        # clear gradients
        self.d_weights = np.zeros(self.weights.shape)
        self.d_bias = np.zeros(self.bias.shape)
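
In this layer, `adam(w, dw, config=...)` returns both the updated parameter and a state dictionary that is carried across calls. A plain-NumPy sketch of that interface, with config keys in the style of the CS231n assignments (the key names are assumptions, not this repository's actual code):

import numpy as np

def adam(x, dx, config=None):
    # Returns (next_x, config); config carries the Adam state between calls.
    if config is None:
        config = {}
    config.setdefault('learning_rate', 1e-3)
    config.setdefault('beta1', 0.9)
    config.setdefault('beta2', 0.999)
    config.setdefault('epsilon', 1e-8)
    config.setdefault('m', np.zeros_like(x))
    config.setdefault('v', np.zeros_like(x))
    config.setdefault('t', 0)

    config['t'] += 1
    config['m'] = config['beta1'] * config['m'] + (1 - config['beta1']) * dx
    config['v'] = config['beta2'] * config['v'] + (1 - config['beta2']) * dx ** 2
    m_hat = config['m'] / (1 - config['beta1'] ** config['t'])
    v_hat = config['v'] / (1 - config['beta2'] ** config['t'])
    next_x = x - config['learning_rate'] * m_hat / (np.sqrt(v_hat) + config['epsilon'])
    return next_x, config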
Example No. 7
    def build_model(self, lr=0.001):
    
        trng = RandomStreams(1234)
        use_noise = theano.shared(np.float32(0.))
    
        # description string: #words x #samples


        x = T.matrix('x', dtype = 'int32')
        x_mask = T.matrix('x_mask', dtype='float32')
        y = T.matrix('y', dtype = 'int32')
        img = T.matrix('img', dtype = 'float32')
        
        n_timesteps = x.shape[0]
        n_samples = x.shape[1]

        init_state = T.dot(img, self.W_img_emb) + self.b_img_emb
        emb = self.W_emb[x.flatten()]
        
        emb = emb.reshape([n_timesteps, n_samples, self.dim_word])
        # proj: list of GRU hidden states
        proj = self.gru_layer(emb, init_state, mask=x_mask)
    
        
        # average of the hidden states
        proj = (proj * x_mask[:, :, None]).sum(axis=0)
        proj = proj / x_mask.sum(axis=0)[:, None]  # sample * dim
        
        # last hidden state
        #proj = proj[-1]  # sample * dim
        


        
        output = T.dot(proj, self.W_pred) + self.b_pred
        
        probs = T.nnet.softmax(output)
        prediction = probs.argmax(axis=1)

        ## avoid NaN
        epsilon = 1.0e-9
        probs = T.clip(probs, epsilon, 1.0 - epsilon)
        probs /= probs.sum(axis=-1, keepdims=True)
        ## avoid NaN
        
        cost = T.nnet.categorical_crossentropy(probs, y)
        cost = T.mean(cost)
        
        updates = optimizer.adam(cost=cost, params=self.params, lr=lr)

        return trng, use_noise, x, x_mask, img, y, cost, updates, prediction
Example No. 8
    def build_model(self, lr=0.001):

        trng = RandomStreams(1234)
        use_noise = theano.shared(np.float32(0.))

        # description string: #words x #samples

        x = T.matrix('x', dtype='int32')
        x_mask = T.matrix('x_mask', dtype='float32')
        y = T.matrix('y', dtype='int32')
        img = T.matrix('img', dtype='float32')

        n_timesteps = x.shape[0]
        n_samples = x.shape[1]

        init_state = T.dot(img, self.W_img_emb) + self.b_img_emb
        emb = self.W_emb[x.flatten()]

        emb = emb.reshape([n_timesteps, n_samples, self.dim_word])
        # proj: list of GRU hidden states
        proj = self.gru_layer(emb, init_state, mask=x_mask)

        # average of the hidden states
        #proj = (proj * x_mask[:, :, None]).sum(axis=0)
        #proj = proj / x_mask.sum(axis=0)[:, None]  # sample * dim

        # last hidden state
        proj = proj[-1]  # sample * dim

        output = T.dot(proj, self.W_pred) + self.b_pred

        probs = T.nnet.softmax(output)
        prediction = probs.argmax(axis=1)
        cost = T.nnet.categorical_crossentropy(probs, y)
        cost = T.mean(cost)

        updates = optimizer.adam(cost=cost, params=self.params, lr=lr)

        return x, x_mask, img, y, cost, updates, prediction
Example No. 9
    def build_model(self, lr=0.001, dropout=None):
        def concatenate(tensor_list, axis=0):
            concat_size = sum(tt.shape[axis] for tt in tensor_list)
            output_shape = ()
            for k in range(axis):
                output_shape += (tensor_list[0].shape[k],)
            output_shape += (concat_size,)
            for k in range(axis + 1, tensor_list[0].ndim):
                output_shape += (tensor_list[0].shape[k],)
            out = T.zeros(output_shape)
            offset = 0
            for tt in tensor_list:
                indices = ()
                for k in range(axis):
                    indices += (slice(None),)
                indices += (slice(offset, offset + tt.shape[axis]),)
                for k in range(axis + 1, tensor_list[0].ndim):
                    indices += (slice(None),)
        
                out = T.set_subtensor(out[indices], tt)
                offset += tt.shape[axis]
        
            return out


    
        trng = RandomStreams(1234)
        use_noise = theano.shared(np.float32(0.))
    
        # description string: #words x #samples


        x = T.matrix('x', dtype = 'int32')  # step * samples
        x_mask = T.matrix('x_mask', dtype='float32')  # step * samples
        y = T.matrix('y', dtype = 'int32')  # sample * emb
        ctx = T.tensor3('ctx', dtype = 'float32')  # sample * annotation * dim
        
        n_timesteps = x.shape[0]
        n_samples = x.shape[1]

        xr = x[::-1]
        xr_mask = x_mask[::-1]
        
        emb = self.W_emb[x.flatten()]
        emb = emb.reshape([n_timesteps, n_samples, self.dim_word])

        embr = self.W_emb[xr.flatten()]
        embr = embr.reshape([n_timesteps, n_samples, self.dim_word])
        
        ctx0 = ctx
        ctx_mean = ctx0.mean(1)
        
        init_state = T.dot(ctx_mean, self.W_ctx_init) + self.b_ctx_init
                
             
        # proj: list of GRU hidden states
        proj = self.gru_layer(emb, mask=x_mask, context=ctx, init_state=init_state)
        proj_h = proj[0]

        projr = self.gru_layer(embr, mask=xr_mask, context=ctx, init_state=init_state)
        projr_h = projr[0]


        concat_proj_h = concatenate([proj_h, projr_h[::-1]], axis=proj_h.ndim-1)
        # step_ctx : step * samples * (dim*2)
        concat_proj_h = (concat_proj_h * x_mask[:,:,None]).sum(0) / x_mask.sum(0)[:,None]
        # step_ctx_mean : samples * (dim*2)


        if dropout is not None :
            concat_proj_h = dropout_layer(concat_proj_h, use_noise, trng, dropout)


        
        output = T.dot(concat_proj_h, self.W_pred) + self.b_pred
        
        probs = T.nnet.softmax(output)
        prediction = probs.argmax(axis=1)
        
        ## avoid NaN
        epsilon = 1.0e-9
        probs = T.clip(probs, epsilon, 1.0 - epsilon)
        probs /= probs.sum(axis=-1, keepdims=True)
        ## avoid NaN
    
    
        cost = T.nnet.categorical_crossentropy(probs, y)
        cost = T.mean(cost)
        
        updates = optimizer.adam(cost=cost, params=self.params, lr=lr)

        return trng, use_noise, x, x_mask, ctx, y, cost, updates, prediction
Example No. 10
    def build_model(self, lr=0.001, dropout=None):
    
        trng = RandomStreams(1234)
        use_noise = theano.shared(np.float32(0.))
    
        # description string: #words x #samples


        x = T.matrix('x', dtype = 'int32')  # step * samples
        x_mask = T.matrix('x_mask', dtype='float32')  # step * samples
        y = T.matrix('y', dtype = 'int32')  # sample * emb
        ctx = T.tensor3('ctx', dtype = 'float32')  # sample * annotation * dim
        
        n_timesteps = x.shape[0]
        n_samples = x.shape[1]


        emb = self.W_emb[x.flatten()]
        
        emb = emb.reshape([n_timesteps, n_samples, self.dim_word])
        
        ctx0 = ctx
        ctx_mean = ctx0.mean(1)
        
        init_state = T.dot(ctx_mean, self.W_hidden_init) + self.b_hidden_init
        init_memory = T.dot(ctx_mean, self.W_memory_init) + self.b_memory_init
           
             
        # proj: list of LSTM hidden states
        proj = self.lstm_layer(emb, mask=x_mask, context=ctx, init_state=init_state, init_memory=init_memory)
        proj_h = proj[0]
        
        # average of the hidden states
        proj_h = (proj_h * x_mask[:, :, None]).sum(axis=0)
        proj_h = proj_h / x_mask.sum(axis=0)[:, None]  # sample * dim
        
        # last hidden state
        #proj_h = proj_h[-1]  # sample * dim
        

        if dropout is not None :
            proj_h = dropout_layer(proj_h, use_noise, trng, dropout)

        
        output = T.dot(proj_h, self.W_pred) + self.b_pred
        
        probs = T.nnet.softmax(output)
        prediction = probs.argmax(axis=1)
        
        ## avoid NaN
        epsilon = 1.0e-9
        probs = T.clip(probs, epsilon, 1.0 - epsilon)
        probs /= probs.sum(axis=-1, keepdims=True)
        ## avoid NaN
    
    
        cost = T.nnet.categorical_crossentropy(probs, y)
        cost = T.mean(cost)
        
        updates = optimizer.adam(cost=cost, params=self.params, lr=lr)

        return trng, use_noise, x, x_mask, ctx, y, cost, updates, prediction
Example No. 11
    def build_model(self, lr=0.001, dropout=None):
    
        trng = RandomStreams(1234)
        use_noise = theano.shared(np.float32(0.))
    
        # description string: #words x #samples


        x = T.tensor3('x', dtype = 'float32') # step * sample * 5555
        y = T.matrix('y', dtype = 'int32')
        img = T.tensor3('img', dtype = 'float32') #  1*sample * 4096
        



        # T.set_subtensor(img_t3[0], img)
        # emb = theano.tensor.concatenate([img_t3, x])
        emb = x
        embr = x[::-1]

        # proj: list of GRU hidden states
        proj = self.gru_layer(emb, img)
        projr = self.gru_cond_layer(embr, img)
        
        proj = concatenate([proj, projr[::-1]], axis=proj.ndim-1)

        # average of the hidden states
        proj = proj.mean(axis=0)
        
        # last hidden state
        #proj = proj[-1]  # sample * dim
        

        if dropout is not None :
            proj = dropout_layer(proj, use_noise, trng, dropout)
            
            
        output = T.dot(proj, self.W_pred) + self.b_pred
        
        probs = T.nnet.softmax(output)
        prediction = probs.argmax(axis=1)
        
        ## avoid NaN
        epsilon = 1.0e-9
        probs = T.clip(probs, epsilon, 1.0 - epsilon)
        probs /= probs.sum(axis=-1, keepdims=True)
        ## avoid NaN
        
        cost = T.nnet.categorical_crossentropy(probs, y)
        cost = T.mean(cost)

        '''
        decay_c = 0.000001
        # add L2 regularization costs
        if decay_c > 0.:
            decay_c = theano.shared(np.float32(decay_c), name='decay_c')
            weight_decay = 0.
            for vv in self.params:
                weight_decay += (vv ** 2).sum()
            weight_decay *= decay_c
            cost += weight_decay
    
        '''
        
        updates = optimizer.adam(cost=cost, params=self.params, lr=lr)

        return trng, use_noise, x, img, y, cost, updates, prediction, probs
Example No. 12
    def build_model(self, lr=0.001, dropout=None):
    
        trng = RandomStreams(1234)
        use_noise = theano.shared(np.float32(0.))
    
        # description string: #words x #samples


        x = T.tensor3('x', dtype = 'float32')
        y = T.matrix('y', dtype = 'int32')
        img = T.matrix('img', dtype = 'float32')
        
        n_timesteps = x.shape[0]
        n_samples = x.shape[1]

        
        init_state = T.dot(img, self.W_hidden_init) + self.b_hidden_init
        init_memory = T.dot(img, self.W_memory_init) + self.b_memory_init
        
        
        emb = x
        embr = x.swapaxes(0,1)[::-1].swapaxes(0,1)



        # proj: list of LSTM hidden states
        proj = self.lstm_layer(emb, init_state=init_state, init_memory=init_memory)[0]
        projr = self.lstm_layer(embr, init_state=init_state, init_memory=init_memory)[0]
        
        proj = concatenate([proj, projr[::-1]], axis=proj.ndim-1)

        # average of the hidden states
        proj = proj.mean(axis=0)
        
        # last hidden state
        #proj = proj[-1]  # sample * dim
        

        if dropout is not None :
            proj = dropout_layer(proj, use_noise, trng, dropout)
            
            
        output = T.dot(proj, self.W_pred) + self.b_pred
        
        probs = T.nnet.softmax(output)
        prediction = probs.argmax(axis=1)
        
        ## avoid NaN
        epsilon = 1.0e-8
        probs = T.clip(probs, epsilon, 1.0 - epsilon)
        probs /= probs.sum(axis=-1, keepdims=True)
        ## avoid NaN
        
        cost = T.nnet.categorical_crossentropy(probs, y)
        cost = T.mean(cost)

        '''
        decay_c = 0.000001
        # add L2 regularization costs
        if decay_c > 0.:
            decay_c = theano.shared(np.float32(decay_c), name='decay_c')
            weight_decay = 0.
            for vv in self.params:
                weight_decay += (vv ** 2).sum()
            weight_decay *= decay_c
            cost += weight_decay
    
        '''
        
        updates = optimizer.adam(cost=cost, params=self.params, lr=lr)

        return trng, use_noise, x, img, y, cost, updates, prediction
Example No. 13
b_h3 = shared_normal((2, n_h3), sigma = 0)
b_h4 = shared_normal((2, n_h4), sigma = 0)

X = binomial(X) #internal binarization
#model calls	
[dout_prob_y, dout_dual_recon_err] = model_NG_ACE(X, batch_size, gaussian_err, 0.2, 0.5)  #with dropout
[prob_y, dual_recon_err] = model_NG_ACE(X, batch_size, gaussian_err, 0., 0.) #without dropout
	
y_model = T.argmax(prob_y, axis=1) #model labels
#dropout classification err
dout_class_err =  T.nnet.categorical_crossentropy(dout_prob_y, Y).sum() 

#optimizer	call
cost = dout_class_err + dout_dual_recon_err 
params = [W_h, W_h2, W_h3, W_h4, W_o, b_h, b_h2, b_h3, b_h4] 
updates, norm_grad = adam(cost, params, lr = learning_rate, data_part = float(batch_size) / P)

#givens
s_trX, s_teX, s_trY, s_teY = shared(trX), shared(teX), shared(trY), shared(teY)   
tr_batch_X = s_trX[start : end]
tr_batch_Y = s_trY[start : end]
te_batch_X = s_teX[start : end]
te_batch_Y = s_teY[start : end] 
	
#train & test functions
mode = theano.compile.get_default_mode()	
train = theano.function(inputs=[start, end, learning_rate],  outputs= [dout_class_err, dual_recon_err,   y_model, norm_grad], updates=updates,  givens = {X : tr_batch_X, Y : tr_batch_Y}, allow_input_downcast=True, mode = mode) 
test = theano.function(inputs=[start, end], outputs=  [dual_recon_err,    y_model], givens = {X : te_batch_X}, allow_input_downcast=True,  mode = mode) 

#main loop over epochs	
tr_len = len(trY)
Example No. 14
def main():
    logging.info("️start loading setting data.")
    settings = LearningDataSettings(args.train_setting_file)
    logging.info("☑ loading setting data complete.")

    vector_size = settings.input_unit
    hidden_unit_num = settings.hidden_unit
    class_num = settings.class_unit
    hidden_layer_value_num = args.division_num

    logging.info(
        "input_vector(n):%d, hidden_unit(m):%d, class_num(K):%d, div_num:%d" %
        (vector_size, hidden_unit_num, class_num, hidden_layer_value_num))

    drbm = DRBM.load_from_json(settings.initial_model,
                               args.division_num,
                               args.sparse,
                               sparse_learning_rate=args.sparse_learning_rate,
                               sparse_adamax=args.sparse_adamax)
    logging.info("initial model: {}".format(str(drbm)))

    if args.datasize_limit != 0 and not args.generative_model:
        settings.training_data = settings.training_data.restore_minibatch(
            args.datasize_limit, random=False)

    gen_drbm = None
    if args.kl_divergence:
        gen_drbm = DRBM.load_from_json(args.kl_divergence)
        logging.info("generative model: {}".format(str(gen_drbm)))
    elif args.generative_model:
        gen_drbm = DRBM(settings.gen_input,
                        settings.gen_hidden,
                        settings.gen_class,
                        0,
                        random_bias=True)
        logging.info("generated generative model: {}".format(str(gen_drbm)))
        value, target = gen_drbm.stick_break(args.datasize_limit)
        settings.training_data = Categorical(value, target, class_num)
        settings.test_data = Categorical(np.array([]), np.array([]), class_num)

    opt = None
    if args.optimizer == "momentum":
        logging.info("optimize method: momentum")
        opt = optimizer.momentum(vector_size, hidden_unit_num, class_num)
    elif args.optimizer == "adam":
        logging.info("optimize method: adam")
        opt = optimizer.adam(vector_size, hidden_unit_num, class_num)
    else:
        logging.info("optimize method: adamax")
        opt = optimizer.adamax(vector_size, hidden_unit_num, class_num)

    logging.info("train started.")
    start_time = time.time()

    learning_result = drbm.train(
        settings.training_data,
        settings.test_data,
        args.learning_num,
        args.minibatch_size,
        opt,
        test_interval=args.test_interval,
        correct_rate=args.correct_rate,
        gen_drbm=gen_drbm,
    )

    end_time = time.time()
    logging.info("☑ train complete. time: {} sec".format(end_time -
                                                         start_time))

    now = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    hidden_layer = "s" if args.sparse else "d"
    filename_template = "{}_{}_{}{}_v{}h{}c{}_%s.json".format(
        now, args.filename_prefix, hidden_layer, drbm.div_num,
        drbm.num_visible, drbm.num_hidden, drbm.num_class)
    learning_result.save(
        os.path.join(args.result_directory, filename_template % "log"))

    drbm.save(os.path.join(args.result_directory,
                           filename_template % "params"))
    logging.info("☑ parameters dumped.")
Example No. 15
def main(method, LR_start, Binarize_weight_only):

    # BN parameters
    name = "mnist"
    print("dataset = " + str(name))

    print("Binarize_weight_only=" + str(Binarize_weight_only))

    print("Method = " + str(method))

    # alpha is the exponential moving average factor
    alpha = .1
    print("alpha = " + str(alpha))
    epsilon = 1e-4
    print("epsilon = " + str(epsilon))

    batch_size = 100
    print("batch_size = " + str(batch_size))

    num_epochs = 50
    print("num_epochs = " + str(num_epochs))

    # network structure
    num_units = 2048
    print("num_units = " + str(num_units))
    n_hidden_layers = 3
    print("n_hidden_layers = " + str(n_hidden_layers))

    print("LR_start = " + str(LR_start))
    LR_decay = 0.1
    print("LR_decay=" + str(LR_decay))

    if Binarize_weight_only == "w":
        activation = lasagne.nonlinearities.rectify
    else:
        activation = lab.binary_tanh_unit
    print("activation = " + str(activation))

    print('Loading MNIST dataset...')

    train_set = MNIST(which_set='train', start=0, stop=50000, center=True)
    valid_set = MNIST(which_set='train', start=50000, stop=60000, center=True)
    test_set = MNIST(which_set='test', center=True)

    # bc01 format
    train_set.X = train_set.X.reshape(-1, 1, 28, 28)
    valid_set.X = valid_set.X.reshape(-1, 1, 28, 28)
    test_set.X = test_set.X.reshape(-1, 1, 28, 28)

    # flatten targets
    train_set.y = np.hstack(train_set.y)
    valid_set.y = np.hstack(valid_set.y)
    test_set.y = np.hstack(test_set.y)

    # Onehot the targets
    train_set.y = np.float32(np.eye(10)[train_set.y])
    valid_set.y = np.float32(np.eye(10)[valid_set.y])
    test_set.y = np.float32(np.eye(10)[test_set.y])

    # for hinge loss
    train_set.y = 2 * train_set.y - 1.
    valid_set.y = 2 * valid_set.y - 1.
    test_set.y = 2 * test_set.y - 1.

    print('Building the MLP...')

    # Prepare Theano variables for inputs and targets
    input = T.tensor4('inputs')
    target = T.matrix('targets')
    LR = T.scalar('LR', dtype=theano.config.floatX)

    mlp = lasagne.layers.InputLayer(shape=(None, 1, 28, 28), input_var=input)

    for k in range(n_hidden_layers):
        mlp = lab.DenseLayer(mlp,
                             nonlinearity=lasagne.nonlinearities.identity,
                             num_units=num_units,
                             method=method)
        mlp = batch_norm.BatchNormLayer(mlp, epsilon=epsilon, alpha=alpha)
        mlp = lasagne.layers.NonlinearityLayer(mlp, nonlinearity=activation)

    mlp = lab.DenseLayer(mlp,
                         nonlinearity=lasagne.nonlinearities.identity,
                         num_units=10,
                         method=method)

    mlp = batch_norm.BatchNormLayer(mlp, epsilon=epsilon, alpha=alpha)

    train_output = lasagne.layers.get_output(mlp, deterministic=False)

    # squared hinge loss
    loss = T.mean(T.sqr(T.maximum(0., 1. - target * train_output)))

    if method != "FPN":

        # W updates
        W = lasagne.layers.get_all_params(mlp, binary=True)
        W_grads = lab.compute_grads(loss, mlp)
        updates = optimizer.adam(loss_or_grads=W_grads,
                                 params=W,
                                 learning_rate=LR)
        updates = lab.clipping_scaling(updates, mlp)

        # other parameters updates
        params = lasagne.layers.get_all_params(mlp,
                                               trainable=True,
                                               binary=False)
        updates = OrderedDict(updates.items() + optimizer.adam(
            loss_or_grads=loss, params=params, learning_rate=LR).items())

        ## update 2nd moment, can get from the adam optimizer also
        updates3 = OrderedDict()
        acc_tag = lasagne.layers.get_all_params(mlp, acc=True)
        idx = 0
        beta2 = 0.999
        for acc_tag_temp in acc_tag:
            updates3[acc_tag_temp] = acc_tag_temp * beta2 + W_grads[
                idx] * W_grads[idx] * (1 - beta2)
            idx = idx + 1

        updates = OrderedDict(updates.items() + updates3.items())

    else:
        params = lasagne.layers.get_all_params(mlp, trainable=True)
        updates = optimizer.adam(loss_or_grads=loss,
                                 params=params,
                                 learning_rate=LR)

    test_output = lasagne.layers.get_output(mlp, deterministic=True)
    test_loss = T.mean(T.sqr(T.maximum(0., 1. - target * test_output)))
    test_err = T.mean(T.neq(T.argmax(test_output, axis=1),
                            T.argmax(target, axis=1)),
                      dtype=theano.config.floatX)

    # Compile a function performing a training step on a mini-batch (by giving the updates dictionary)
    # and returning the corresponding training loss:
    train_fn = theano.function([input, target, LR], loss, updates=updates)
    val_fn = theano.function([input, target], [test_loss, test_err])

    print('Training...')

    lab.train(name, method, train_fn, val_fn, batch_size, LR_start, LR_decay,
              num_epochs, train_set.X, train_set.y, valid_set.X, valid_set.y,
              test_set.X, test_set.y)
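
`optimizer.adam(loss_or_grads=..., params=..., learning_rate=LR)` mirrors the `lasagne.updates.adam` signature: it accepts either a scalar loss or a precomputed list of gradients and returns an OrderedDict of Theano updates, which the code above then merges with further update dictionaries. A minimal sketch under that assumption:

from collections import OrderedDict
import numpy as np
import theano
import theano.tensor as T

def adam(loss_or_grads, params, learning_rate=0.001,
         beta1=0.9, beta2=0.999, epsilon=1e-8):
    # Accept either a scalar loss (differentiate it) or a precomputed grad list,
    # then apply the same per-parameter Adam rule as in the earlier sketch,
    # collected into an OrderedDict so dictionaries can be merged afterwards.
    grads = loss_or_grads if isinstance(loss_or_grads, (list, tuple)) \
        else T.grad(loss_or_grads, params)
    t_prev = theano.shared(np.float32(0.))
    t = t_prev + 1
    a_t = learning_rate * T.sqrt(1 - beta2 ** t) / (1 - beta1 ** t)
    updates = OrderedDict([(t_prev, t)])
    for p, g in zip(params, grads):
        m = theano.shared(np.zeros(p.get_value().shape, dtype=p.dtype))
        v = theano.shared(np.zeros(p.get_value().shape, dtype=p.dtype))
        updates[m] = beta1 * m + (1 - beta1) * g
        updates[v] = beta2 * v + (1 - beta2) * g ** 2
        updates[p] = p - a_t * updates[m] / (T.sqrt(updates[v]) + epsilon)
    return updates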
Example No. 16
def main(method, LR_start, SEQ_LENGTH):

	lasagne.random.set_rng(np.random.RandomState(1))

	name = "linux"
	print("dataset = "+str(name))

	print("Method = "+str(method))

	# Sequence Length
	SEQ_LENGTH = SEQ_LENGTH
	# SEQ_LENGTH = 100  #can have diffvalues 50, 100, 200
	print("SEQ_LENGTH = "+str(SEQ_LENGTH))

	# Number of units in the two hidden (LSTM) layers
	N_HIDDEN = 512
	print("N_HIDDEN = "+str(N_HIDDEN))

	# All gradients above this will be clipped
	GRAD_CLIP = 5.  #### this clips the gradients at every time step, while T.clip clips the sum of gradients as a whole
	print("GRAD_CLIP ="+str(GRAD_CLIP))

	# Number of epochs to train the net
	num_epochs = 200
	print("num_epochs = "+str(num_epochs))

	# Batch Size
	batch_size = 100
	print("batch_size = "+str(batch_size))
	 
	print("LR_start = "+str(LR_start))
	LR_decay = 0.98
	print("LR_decay="+str(LR_decay))

	activation = lasagne.nonlinearities.tanh


	## load data, change data file dir
	with open('data/linux_input.txt', 'r') as f:
		in_text = f.read()

	generation_phrase = "Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar\n *\n * This file contains the interrupt probing code and driver APIs.\n */\n\n#include"
	#This snippet loads the text file and creates dictionaries to 
	#encode characters into a vector-space representation and vice-versa. 
	chars = list(set(in_text))
	data_size, vocab_size = len(in_text), len(chars)
	char_to_ix = { ch:i for i,ch in enumerate(chars) }
	ix_to_char = { i:ch for i,ch in enumerate(chars) }

	num_splits = [0.9, 0.05, 0.05]
	num_splits_all = np.floor(data_size/batch_size/SEQ_LENGTH)
	num_train = np.floor(num_splits_all*num_splits[0])
	num_val   = np.floor(num_splits_all*num_splits[1])
	num_test  = num_splits_all - num_train - num_val

	train_X = in_text[0:(num_train*batch_size*SEQ_LENGTH+1).astype('int32')]
	val_X = in_text[(num_train*batch_size*SEQ_LENGTH).astype('int32'):((num_train+num_val)*batch_size*SEQ_LENGTH+1).astype('int32')]
	test_X = in_text[((num_train+num_val)*batch_size*SEQ_LENGTH).astype('int32'):(num_splits_all*batch_size*SEQ_LENGTH+1).astype('int32')]


	## build model
	print('Building the model...') 
		
	# input = T.tensor3('inputs')
	target = T.imatrix('target')
	LR = T.scalar('LR', dtype=theano.config.floatX)

	# (batch size, SEQ_LENGTH, num_features)
	l_in = lasagne.layers.InputLayer(shape=(None, None, vocab_size))
	l_forward_2 = laq.LSTMLayer(
				l_in, 
				num_units=N_HIDDEN,
				grad_clipping=GRAD_CLIP,
				peepholes=False,
				nonlinearity=activation, ### change this activation can change the hidden layer to binary
				method=method)   ### batch_size*SEQ_LENGTH*N_HIDDEN

	l_shp = lasagne.layers.ReshapeLayer(l_forward_2, (-1, N_HIDDEN))  ## (batch_size*SEQ_LENGTH, N_HIDDEN)
	l_out = lasagne.layers.DenseLayer(l_shp, num_units=vocab_size, W = lasagne.init.Normal(), nonlinearity=lasagne.nonlinearities.softmax)
	batchsize, seqlen, _ = l_in.input_var.shape

	train_output = lasagne.layers.get_output(l_out, deterministic=False)
	loss = T.nnet.categorical_crossentropy(train_output,target.flatten()).mean()


	if method!= "FPN": 
		# W updates
		W = lasagne.layers.get_all_params(l_out, quantized=True)
		W_grads = laq.compute_grads(loss,l_out) 
		updates = optimizer.adam(loss_or_grads=W_grads, params=W, learning_rate=LR, epsilon=1e-8) 
		updates = laq.clipping_scaling(updates,l_out)

		# other parameters updates
		params = lasagne.layers.get_all_params(l_out, trainable=True, quantized=False)
		updates = OrderedDict(updates.items() + optimizer.adam(loss_or_grads=loss, params=params, learning_rate=LR, epsilon = 1e-8).items())


		## update the ternary matrix
		ternary_weights = laq.get_quantized_weights(loss, l_out)
		updates2 = OrderedDict()
		idx = 0
		tt_tag = lasagne.layers.get_all_params(l_out, tt=True)	
		for tt_tag_temp in tt_tag:
			updates2[tt_tag_temp]= ternary_weights[idx]
			idx = idx+1
		updates = OrderedDict(updates.items() + updates2.items())

		## update 2nd momentum
		updates3 = OrderedDict()
		acc_tag = lasagne.layers.get_all_params(l_out, acc=True)	
		idx = 0
		beta2 = 0.999
		for acc_tag_temp in acc_tag:
			updates3[acc_tag_temp]= acc_tag_temp*beta2 + W_grads[idx]*W_grads[idx]*(1-beta2)
			idx = idx+1

		updates = OrderedDict(updates.items() + updates3.items())


	else:
		params_other = lasagne.layers.get_all_params(l_out, trainable=True)
		
		W_grads = [theano.grad(loss, wrt=l_forward_2.W_in_to_ingate), theano.grad(loss, wrt=l_forward_2.W_hid_to_ingate),
		theano.grad(loss, wrt=l_forward_2.W_in_to_forgetgate),theano.grad(loss, wrt=l_forward_2.W_hid_to_forgetgate),
		theano.grad(loss, wrt=l_forward_2.W_in_to_cell),theano.grad(loss, wrt=l_forward_2.W_hid_to_cell),
		theano.grad(loss, wrt=l_forward_2.W_in_to_outgate),theano.grad(loss, wrt=l_forward_2.W_hid_to_outgate)]
		
		updates = optimizer.adam(loss_or_grads=loss, params=params_other, learning_rate=LR)

	test_output = lasagne.layers.get_output(l_out, deterministic=True)
	test_loss = T.nnet.categorical_crossentropy(test_output,target.flatten()).mean()
			
	train_fn = theano.function([l_in.input_var, target, LR], loss, updates=updates, allow_input_downcast=True)
	val_fn = theano.function([l_in.input_var, target], test_loss, allow_input_downcast=True)

	
	print('Training...')
	
	X_train = train_X
	X_val = val_X
	X_test = test_X

	def gen_data(pp, batch_size,SEQ_LENGTH, data, return_target=True):

		x = np.zeros((batch_size,SEQ_LENGTH,vocab_size))   ###### 128*100*85
		y = np.zeros((batch_size, SEQ_LENGTH))

		for n in range(batch_size):
			# ptr = n
			for i in range(SEQ_LENGTH):
				x[n,i,char_to_ix[data[pp[n]*SEQ_LENGTH+i]]] = 1.
				y[n,i] = char_to_ix[data[pp[n]*SEQ_LENGTH+i+1]]
		return x, np.array(y,dtype='int32')    

	in_text = X_train+X_val+X_test
	chars = list(set(in_text))
	data_size, vocab_size = len(in_text), len(chars)
	char_to_ix = { ch:i for i,ch in enumerate(chars) }
	ix_to_char = { i:ch for i,ch in enumerate(chars) }
	
	def train_epoch(X,LR):
		
		loss = 0        
		batches = len(X)/batch_size/SEQ_LENGTH
		num_seq = len(X)/SEQ_LENGTH
		shuffled_ind = range(num_seq)

		np.random.shuffle(shuffled_ind)
		
		for i in range(batches):
			tmp_ind = shuffled_ind[i*batch_size:(i+1)*batch_size]
			xx,yy = gen_data(tmp_ind,batch_size,SEQ_LENGTH, X)
			new_loss = train_fn(xx,yy,LR)
			loss+=new_loss

		loss=loss/batches
		
		return loss
	
	# This function tests the model a full epoch (on the whole dataset)
	def val_epoch(X):
		
		# err = 0
		loss = 0
		batches = len(X)/batch_size/SEQ_LENGTH

		num_seq = len(X)/SEQ_LENGTH
		ind = range(num_seq)
		for i in range(batches):
			tmp_ind = ind[i*batch_size:(i+1)*batch_size]
			xx, yy = gen_data(tmp_ind, batch_size, SEQ_LENGTH, X)
			new_loss = val_fn(xx,yy)
			loss += new_loss
		
		loss = loss/batches

		return loss
	
	best_val_loss=100
	best_epoch = 1
	LR = LR_start

	# iterate over epochs:
	for epoch in range(1,num_epochs+1):		
		start_time = time.time()
		train_loss = train_epoch(X_train, LR)

		val_loss = val_epoch(X_val)
		
		# test if validation error went down
		if val_loss <= best_val_loss:
			
			best_val_loss = val_loss
			best_epoch = epoch
			
			test_loss = val_epoch(X_test)

			# all_params = lasagne.layers.get_all_params(l_out)
			# np.savez("{0}/{1}_seq{2}_lr{3}_hid{4}_{5}.npz".format(method, name, SEQ_LENGTH, LR_start, N_HIDDEN, method), *all_params)		

		epoch_duration = time.time() - start_time
		# Then we print the results for this epoch:
		print("  Epoch "+str(epoch)+" of "+str(num_epochs)+" took "+str(epoch_duration)+"s")
		print("  LR:                            "+str(LR))
		print("  training loss:                 "+str(train_loss))
		print("  validation loss:               "+str(val_loss))
		print("  best epoch:                    "+str(best_epoch))
		print("  test loss:                     "+str(test_loss))
		
		with open("{0}/{1}_seq{2}_lr{3}_hid{4}_{5}.txt".format(method, name, SEQ_LENGTH, LR_start, N_HIDDEN, method), "a") as myfile:
			myfile.write("{0}  {1:.3f} {2:.3f} {3:.3f} {4:.3f}\n".format(epoch, train_loss, val_loss, 
				test_loss, epoch_duration))

		# learning rate update scheme
		if epoch>10:
			LR *= LR_decay
Example No. 17
def main(method, LR_start, Binarize_weight_only, SEQ_LENGTH):

	lasagne.random.set_rng(np.random.RandomState(1))

	name = "linux"
	print("dataset = "+str(name))

	print("Binarize_weight_only="+str(Binarize_weight_only))

	print("Method = "+str(method))

	# Sequence Length
	SEQ_LENGTH = SEQ_LENGTH
	# SEQ_LENGTH = 100  #can have diffvalues 50, 100, 200
	print("SEQ_LENGTH = "+str(SEQ_LENGTH))

	# Number of units in the two hidden (LSTM) layers
	N_HIDDEN = 512
	print("N_HIDDEN = "+str(N_HIDDEN))

	# All gradients above this will be clipped
	GRAD_CLIP = 5.  #### this clips the gradients at every time step, while T.clip clips the sum of gradients as a whole
	print("GRAD_CLIP ="+str(GRAD_CLIP))

	# Number of epochs to train the net
	num_epochs = 200
	print("num_epochs = "+str(num_epochs))

	# Batch Size
	batch_size = 100
	print("batch_size = "+str(batch_size))
	 
	print("LR_start = "+str(LR_start))
	LR_decay = 0.98
	print("LR_decay="+str(LR_decay))

	if Binarize_weight_only =="w":
		activation = lasagne.nonlinearities.tanh
	else:
		activation = lab.binary_tanh_unit
	print("activation = "+ str(activation))

	name = name+"_"+Binarize_weight_only

	## load data, change data file dir
	with open('data/linux_input.txt', 'r') as f:
		in_text = f.read()

	generation_phrase = "Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar\n *\n * This file contains the interrupt probing code and driver APIs.\n */\n\n#include"
	#This snippet loads the text file and creates dictionaries to 
	#encode characters into a vector-space representation and vice-versa. 
	chars = list(set(in_text))
	data_size, vocab_size = len(in_text), len(chars)
	char_to_ix = { ch:i for i,ch in enumerate(chars) }
	ix_to_char = { i:ch for i,ch in enumerate(chars) }

	num_splits = [0.9, 0.05, 0.05]
	num_splits_all = np.floor(data_size/batch_size/SEQ_LENGTH)
	num_train = np.floor(num_splits_all*num_splits[0])
	num_val   = np.floor(num_splits_all*num_splits[1])
	num_test  = num_splits_all - num_train - num_val

	train_X = in_text[0:(num_train*batch_size*SEQ_LENGTH+1).astype('int32')]
	val_X = in_text[(num_train*batch_size*SEQ_LENGTH).astype('int32'):((num_train+num_val)*batch_size*SEQ_LENGTH+1).astype('int32')]
	test_X = in_text[((num_train+num_val)*batch_size*SEQ_LENGTH).astype('int32'):(num_splits_all*batch_size*SEQ_LENGTH+1).astype('int32')]


	## build model
	print('Building the model...') 
		
	# input = T.tensor3('inputs')
	target = T.imatrix('target')
	LR = T.scalar('LR', dtype=theano.config.floatX)

	# (batch size, SEQ_LENGTH, num_features)
	l_in = lasagne.layers.InputLayer(shape=(None, None, vocab_size))
	l_forward_2 = lab.LSTMLayer(
				l_in, 
				num_units=N_HIDDEN,
				grad_clipping=GRAD_CLIP,
				peepholes=False,
				nonlinearity=activation, ### change this activation can change the hidden layer to binary
				method=method)   ### batch_size*SEQ_LENGTH*N_HIDDEN

	l_shp = lasagne.layers.ReshapeLayer(l_forward_2, (-1, N_HIDDEN))  ## (batch_size*SEQ_LENGTH, N_HIDDEN)
	l_out = lasagne.layers.DenseLayer(l_shp, num_units=vocab_size, W = lasagne.init.Normal(), nonlinearity=lasagne.nonlinearities.softmax)
	batchsize, seqlen, _ = l_in.input_var.shape
	l_shp1 = lasagne.layers.ReshapeLayer(l_out, (batchsize, seqlen, vocab_size))
	l_out1 = lasagne.layers.SliceLayer(l_shp1, -1, 1)

	train_output = lasagne.layers.get_output(l_out, deterministic=False)
	loss = T.nnet.categorical_crossentropy(train_output,target.flatten()).mean()


	if method!= "FPN": 
		# W updates
		W = lasagne.layers.get_all_params(l_out, binary=True)
		W_grads = lab.compute_grads(loss,l_out) 
		updates = optimizer.adam(loss_or_grads=W_grads, params=W, learning_rate=LR, epsilon = 1e-8)   ### can choose different methods to update
		updates = lab.clipping_scaling(updates,l_out)

		# other parameters updates
		params = lasagne.layers.get_all_params(l_out, trainable=True, binary=False)
		updates = OrderedDict(updates.items() + optimizer.adam(loss_or_grads=loss, params=params, learning_rate=LR, epsilon = 1e-8).items())

		## update 2 momentum
		updates3 = OrderedDict()
		acc_tag = lasagne.layers.get_all_params(l_out, acc=True)	
		idx = 0
		beta2 = 0.999
		for acc_tag_temp in acc_tag:
			# updates3[acc_tag_temp]=updates.keys()[idx]
			updates3[acc_tag_temp]= acc_tag_temp*beta2 + W_grads[idx]*W_grads[idx]*(1-beta2)
			idx = idx+1

		updates = OrderedDict(updates.items() + updates3.items())

	else:
		params_other = lasagne.layers.get_all_params(l_out, trainable=True)
		
		W_grads = [theano.grad(loss, wrt=l_forward_2.W_in_to_ingate), theano.grad(loss, wrt=l_forward_2.W_hid_to_ingate),
		theano.grad(loss, wrt=l_forward_2.W_in_to_forgetgate),theano.grad(loss, wrt=l_forward_2.W_hid_to_forgetgate),
		theano.grad(loss, wrt=l_forward_2.W_in_to_cell),theano.grad(loss, wrt=l_forward_2.W_hid_to_cell),
		theano.grad(loss, wrt=l_forward_2.W_in_to_outgate),theano.grad(loss, wrt=l_forward_2.W_hid_to_outgate)]
		
		updates = optimizer.adam(loss_or_grads=loss, params=params_other, learning_rate=LR)

	test_output = lasagne.layers.get_output(l_out, deterministic=True)
	test_loss = T.nnet.categorical_crossentropy(test_output,target.flatten()).mean()
			


	train_fn = theano.function([l_in.input_var, target, LR], [loss, W_grads[5]], updates=updates, allow_input_downcast=True)
	val_fn = theano.function([l_in.input_var, target], test_loss, allow_input_downcast=True)
	probs = theano.function([l_in.input_var],lasagne.layers.get_output(l_out1), allow_input_downcast=True)

	
	print('Training...')
	
	lab.train(
			name, method,
			train_fn,val_fn,
			batch_size,
			SEQ_LENGTH,
			N_HIDDEN,
			LR_start,LR_decay,
			num_epochs,
			train_X,
			val_X,
			test_X)
Example No. 18
    def build_model(self, lr=0.001, dropout=None):
        def concatenate(tensor_list, axis=0):
            concat_size = sum(tt.shape[axis] for tt in tensor_list)
            output_shape = ()
            for k in range(axis):
                output_shape += (tensor_list[0].shape[k],)
            output_shape += (concat_size,)
            for k in range(axis + 1, tensor_list[0].ndim):
                output_shape += (tensor_list[0].shape[k],)
            out = T.zeros(output_shape)
            offset = 0
            for tt in tensor_list:
                indices = ()
                for k in range(axis):
                    indices += (slice(None),)
                indices += (slice(offset, offset + tt.shape[axis]),)
                for k in range(axis + 1, tensor_list[0].ndim):
                    indices += (slice(None),)
        
                out = T.set_subtensor(out[indices], tt)
                offset += tt.shape[axis]
        
            return out
            
            
        trng = RandomStreams(1234)
        use_noise = theano.shared(np.float32(0.))
    
        # description string: #words x #samples


        x = T.matrix('x', dtype = 'int32')
        x_mask = T.matrix('x_mask', dtype='float32')
        y = T.matrix('y', dtype = 'int32')
        img = T.matrix('img', dtype = 'float32')
        
        n_timesteps = x.shape[0]
        n_samples = x.shape[1]


        init_state = T.dot(img, self.W_img_emb) + self.b_img_emb
        emb = self.W_emb[x.flatten()]
        emb = emb.reshape([n_timesteps, n_samples, self.dim_word])
        
        xr = x[::-1]
        xr_mask = x_mask[::-1]
        
        embr = self.W_emb[xr.flatten()]
        embr = embr.reshape([n_timesteps, n_samples, self.dim_word])       

        proj = self.gru_layer(emb, init_state, mask=x_mask)
        projr = self.gru_layer(embr, init_state, mask=xr_mask)
    
        proj = concatenate([proj, projr[::-1]], axis=proj.ndim-1)
       
        # average of the hidden states
        proj = (proj * x_mask[:, :, None]).sum(axis=0)
        proj = proj / x_mask.sum(axis=0)[:, None]  # sample * dim
        
        # last hidden state
        #proj = proj[-1]  # sample * dim
        

        if dropout is not None :
            proj = dropout_layer(proj, use_noise, trng, dropout)
        
        output = T.dot(proj, self.W_pred) + self.b_pred
        
        probs = T.nnet.softmax(output)
        prediction = probs.argmax(axis=1)
        
        ## avoid NaN
        epsilon = 1.0e-9
        probs = T.clip(probs, epsilon, 1.0 - epsilon)
        probs /= probs.sum(axis=-1, keepdims=True)
        ## avoid NaN
        
        
        cost = T.nnet.categorical_crossentropy(probs, y)
        cost = T.mean(cost)
        
        updates = optimizer.adam(cost=cost, params=self.params, lr=lr)

        return trng, use_noise, x, x_mask, img, y, cost, updates, prediction
Example No. 19
def main(method, LR_start, Binarize_weight_only):

    name = "svhn"
    print("dataset = " + str(name))

    print("Binarize_weight_only=" + str(Binarize_weight_only))

    print("Method = " + str(method))

    # alpha is the exponential moving average factor
    alpha = .1
    print("alpha = " + str(alpha))
    epsilon = 1e-4
    print("epsilon = " + str(epsilon))

    # Training parameters
    batch_size = 50
    print("batch_size = " + str(batch_size))

    num_epochs = 50
    print("num_epochs = " + str(num_epochs))

    print("LR_start = " + str(LR_start))
    LR_decay = 0.1
    print("LR_decay=" + str(LR_decay))
    # BTW, LR decay might be good for the BN moving average...

    if Binarize_weight_only == "w":
        activation = lasagne.nonlinearities.rectify
    else:
        activation = lab.binary_tanh_unit
    print("activation = " + str(activation))

    ## number of filters in the first convolutional layer
    K = 64
    print("K=" + str(K))

    print('Building the CNN...')

    # Prepare Theano variables for inputs and targets
    input = T.tensor4('inputs')
    target = T.matrix('targets')
    LR = T.scalar('LR', dtype=theano.config.floatX)

    l_in = lasagne.layers.InputLayer(shape=(None, 3, 32, 32), input_var=input)

    # 128C3-128C3-P2
    l_cnn1 = lab.Conv2DLayer(l_in,
                             num_filters=K,
                             filter_size=(3, 3),
                             pad=1,
                             nonlinearity=lasagne.nonlinearities.identity,
                             method=method)

    l_bn1 = batch_norm.BatchNormLayer(l_cnn1, epsilon=epsilon, alpha=alpha)

    l_nl1 = lasagne.layers.NonlinearityLayer(l_bn1, nonlinearity=activation)

    l_cnn2 = lab.Conv2DLayer(l_nl1,
                             num_filters=K,
                             filter_size=(3, 3),
                             pad=1,
                             nonlinearity=lasagne.nonlinearities.identity,
                             method=method)

    l_mp1 = lasagne.layers.MaxPool2DLayer(l_cnn2, pool_size=(2, 2))

    l_bn2 = batch_norm.BatchNormLayer(l_mp1, epsilon=epsilon, alpha=alpha)

    l_nl2 = lasagne.layers.NonlinearityLayer(l_bn2, nonlinearity=activation)
    # 256C3-256C3-P2
    l_cnn3 = lab.Conv2DLayer(l_nl2,
                             num_filters=2 * K,
                             filter_size=(3, 3),
                             pad=1,
                             nonlinearity=lasagne.nonlinearities.identity,
                             method=method)

    l_bn3 = batch_norm.BatchNormLayer(l_cnn3, epsilon=epsilon, alpha=alpha)

    l_nl3 = lasagne.layers.NonlinearityLayer(l_bn3, nonlinearity=activation)

    l_cnn4 = lab.Conv2DLayer(l_nl3,
                             num_filters=2 * K,
                             filter_size=(3, 3),
                             pad=1,
                             nonlinearity=lasagne.nonlinearities.identity,
                             method=method)

    l_mp2 = lasagne.layers.MaxPool2DLayer(l_cnn4, pool_size=(2, 2))

    l_bn4 = batch_norm.BatchNormLayer(l_mp2, epsilon=epsilon, alpha=alpha)

    l_nl4 = lasagne.layers.NonlinearityLayer(l_bn4, nonlinearity=activation)

    # 512C3-512C3-P2
    l_cnn5 = lab.Conv2DLayer(l_nl4,
                             num_filters=4 * K,
                             filter_size=(3, 3),
                             pad=1,
                             nonlinearity=lasagne.nonlinearities.identity,
                             method=method)

    l_bn5 = batch_norm.BatchNormLayer(l_cnn5, epsilon=epsilon, alpha=alpha)

    l_nl5 = lasagne.layers.NonlinearityLayer(l_bn5, nonlinearity=activation)

    l_cnn6 = lab.Conv2DLayer(l_nl5,
                             num_filters=4 * K,
                             filter_size=(3, 3),
                             pad=1,
                             nonlinearity=lasagne.nonlinearities.identity,
                             method=method)

    l_mp3 = lasagne.layers.MaxPool2DLayer(l_cnn6, pool_size=(2, 2))

    l_bn6 = batch_norm.BatchNormLayer(l_mp3, epsilon=epsilon, alpha=alpha)

    l_nl6 = lasagne.layers.NonlinearityLayer(l_bn6, nonlinearity=activation)

    # print(cnn.output_shape)

    # 1024FP-1024FP-10FP
    l_dn1 = lab.DenseLayer(l_nl6,
                           nonlinearity=lasagne.nonlinearities.identity,
                           num_units=1024,
                           method=method)

    l_bn7 = batch_norm.BatchNormLayer(l_dn1, epsilon=epsilon, alpha=alpha)

    l_nl7 = lasagne.layers.NonlinearityLayer(l_bn7, nonlinearity=activation)

    l_dn2 = lab.DenseLayer(l_nl7,
                           nonlinearity=lasagne.nonlinearities.identity,
                           num_units=1024,
                           method=method)

    l_bn8 = batch_norm.BatchNormLayer(l_dn2, epsilon=epsilon, alpha=alpha)

    l_nl8 = lasagne.layers.NonlinearityLayer(l_bn8, nonlinearity=activation)

    l_dn3 = lab.DenseLayer(l_nl8,
                           nonlinearity=lasagne.nonlinearities.identity,
                           num_units=10,
                           method=method)

    l_out = batch_norm.BatchNormLayer(l_dn3, epsilon=epsilon, alpha=alpha)

    train_output = lasagne.layers.get_output(l_out, deterministic=False)

    # squared hinge loss
    loss = T.mean(T.sqr(T.maximum(0., 1. - target * train_output)))

    if method != "FPN":
        # W updates
        W = lasagne.layers.get_all_params(l_out, binary=True)
        W_grads = lab.compute_grads(loss, l_out)
        updates = optimizer.adam(loss_or_grads=W_grads,
                                 params=W,
                                 learning_rate=LR)
        updates = lab.clipping_scaling(updates, l_out)

        # other parameters updates
        params = lasagne.layers.get_all_params(l_out,
                                               trainable=True,
                                               binary=False)
        updates = OrderedDict(updates.items() + optimizer.adam(
            loss_or_grads=loss, params=params, learning_rate=LR).items())

        ## update 2nd moment, can get from the adam optimizer also
        updates3 = OrderedDict()
        acc_tag = lasagne.layers.get_all_params(l_out, acc=True)
        idx = 0
        beta2 = 0.999
        for acc_tag_temp in acc_tag:
            updates3[acc_tag_temp] = acc_tag_temp * beta2 + W_grads[
                idx] * W_grads[idx] * (1 - beta2)
            idx = idx + 1

        updates = OrderedDict(updates.items() + updates3.items())
    else:
        params = lasagne.layers.get_all_params(l_out, trainable=True)
        updates = optimizer.adam(loss_or_grads=loss,
                                 params=params,
                                 learning_rate=LR)

    test_output = lasagne.layers.get_output(l_out, deterministic=True)
    test_loss = T.mean(T.sqr(T.maximum(0., 1. - target * test_output)))
    test_err = T.mean(T.neq(T.argmax(test_output, axis=1),
                            T.argmax(target, axis=1)),
                      dtype=theano.config.floatX)

    # Compile a function performing a training step on a mini-batch (by giving the updates dictionary)
    # and returning the corresponding training loss:
    train_fn = theano.function([input, target, LR], loss, updates=updates)
    val_fn = theano.function([input, target], [test_loss, test_err])

    ## load data
    print('Loading SVHN dataset')

    train_set = SVHN(
        which_set='splitted_train',
        # which_set= 'valid',
        path="${SVHN_LOCAL_PATH}",
        axes=['b', 'c', 0, 1])

    valid_set = SVHN(which_set='valid',
                     path="${SVHN_LOCAL_PATH}",
                     axes=['b', 'c', 0, 1])

    test_set = SVHN(which_set='test',
                    path="${SVHN_LOCAL_PATH}",
                    axes=['b', 'c', 0, 1])

    # bc01 format
    # print train_set.X.shape
    train_set.X = np.reshape(train_set.X, (-1, 3, 32, 32))
    valid_set.X = np.reshape(valid_set.X, (-1, 3, 32, 32))
    test_set.X = np.reshape(test_set.X, (-1, 3, 32, 32))

    train_set.y = np.array(train_set.y).flatten()
    valid_set.y = np.array(valid_set.y).flatten()
    test_set.y = np.array(test_set.y).flatten()

    # Onehot the targets
    train_set.y = np.float32(np.eye(10)[train_set.y])
    valid_set.y = np.float32(np.eye(10)[valid_set.y])
    test_set.y = np.float32(np.eye(10)[test_set.y])

    # for hinge loss
    train_set.y = 2 * train_set.y - 1.
    valid_set.y = 2 * valid_set.y - 1.
    test_set.y = 2 * test_set.y - 1.

    print('Training...')

    # ipdb.set_trace()
    lab.train(name, method, train_fn, val_fn, batch_size, LR_start, LR_decay,
              num_epochs, train_set.X, train_set.y, valid_set.X, valid_set.y,
              test_set.X, test_set.y)
Example No. 20
def main(method, LR_start):

    name = "svhn"
    print("dataset = " + str(name))
    print("Method = " + str(method))

    # alpha is the exponential moving average factor
    alpha = .1
    print("alpha = " + str(alpha))
    epsilon = 1e-4
    print("epsilon = " + str(epsilon))

    # Training parameters
    batch_size = 50
    print("batch_size = " + str(batch_size))

    num_epochs = 50
    print("num_epochs = " + str(num_epochs))

    print("LR_start = " + str(LR_start))
    LR_decay = 0.1
    print("LR_decay=" + str(LR_decay))
    # BTW, LR decay might be good for the BN moving average...

    activation = lasagne.nonlinearities.rectify

    # number of filters in the first convolutional layer
    K = 64
    print("K=" + str(K))

    print('Building the CNN...')

    # Prepare Theano variables for inputs and targets
    input = T.tensor4('inputs')
    target = T.matrix('targets')
    LR = T.scalar('LR', dtype=theano.config.floatX)

    l_in = lasagne.layers.InputLayer(shape=(None, 3, 32, 32), input_var=input)

    # 64C3-64C3-P2 (K = 64)
    l_cnn1 = laq.Conv2DLayer(l_in,
                             num_filters=K,
                             filter_size=(3, 3),
                             pad=1,
                             nonlinearity=lasagne.nonlinearities.identity,
                             method=method)

    l_bn1 = batch_norm.BatchNormLayer(l_cnn1, epsilon=epsilon, alpha=alpha)

    l_nl1 = lasagne.layers.NonlinearityLayer(l_bn1, nonlinearity=activation)

    l_cnn2 = laq.Conv2DLayer(l_nl1,
                             num_filters=K,
                             filter_size=(3, 3),
                             pad=1,
                             nonlinearity=lasagne.nonlinearities.identity,
                             method=method)

    l_mp1 = lasagne.layers.MaxPool2DLayer(l_cnn2, pool_size=(2, 2))

    l_bn2 = batch_norm.BatchNormLayer(l_mp1, epsilon=epsilon, alpha=alpha)

    l_nl2 = lasagne.layers.NonlinearityLayer(l_bn2, nonlinearity=activation)
    # 128C3-128C3-P2
    l_cnn3 = laq.Conv2DLayer(l_nl2,
                             num_filters=2 * K,
                             filter_size=(3, 3),
                             pad=1,
                             nonlinearity=lasagne.nonlinearities.identity,
                             method=method)

    l_bn3 = batch_norm.BatchNormLayer(l_cnn3, epsilon=epsilon, alpha=alpha)

    l_nl3 = lasagne.layers.NonlinearityLayer(l_bn3, nonlinearity=activation)

    l_cnn4 = laq.Conv2DLayer(l_nl3,
                             num_filters=2 * K,
                             filter_size=(3, 3),
                             pad=1,
                             nonlinearity=lasagne.nonlinearities.identity,
                             method=method)

    l_mp2 = lasagne.layers.MaxPool2DLayer(l_cnn4, pool_size=(2, 2))

    l_bn4 = batch_norm.BatchNormLayer(l_mp2, epsilon=epsilon, alpha=alpha)

    l_nl4 = lasagne.layers.NonlinearityLayer(l_bn4, nonlinearity=activation)

    # 256C3-256C3-P2
    l_cnn5 = laq.Conv2DLayer(l_nl4,
                             num_filters=4 * K,
                             filter_size=(3, 3),
                             pad=1,
                             nonlinearity=lasagne.nonlinearities.identity,
                             method=method)

    l_bn5 = batch_norm.BatchNormLayer(l_cnn5, epsilon=epsilon, alpha=alpha)

    l_nl5 = lasagne.layers.NonlinearityLayer(l_bn5, nonlinearity=activation)

    l_cnn6 = laq.Conv2DLayer(l_nl5,
                             num_filters=4 * K,
                             filter_size=(3, 3),
                             pad=1,
                             nonlinearity=lasagne.nonlinearities.identity,
                             method=method)

    l_mp3 = lasagne.layers.MaxPool2DLayer(l_cnn6, pool_size=(2, 2))

    l_bn6 = batch_norm.BatchNormLayer(l_mp3, epsilon=epsilon, alpha=alpha)

    l_nl6 = lasagne.layers.NonlinearityLayer(l_bn6, nonlinearity=activation)

    # print(cnn.output_shape)

    # 1024FP-1024FP-10FP
    l_dn1 = laq.DenseLayer(l_nl6,
                           nonlinearity=lasagne.nonlinearities.identity,
                           num_units=1024,
                           method=method)

    l_bn7 = batch_norm.BatchNormLayer(l_dn1, epsilon=epsilon, alpha=alpha)

    l_nl7 = lasagne.layers.NonlinearityLayer(l_bn7, nonlinearity=activation)

    l_dn2 = laq.DenseLayer(l_nl7,
                           nonlinearity=lasagne.nonlinearities.identity,
                           num_units=1024,
                           method=method)

    l_bn8 = batch_norm.BatchNormLayer(l_dn2, epsilon=epsilon, alpha=alpha)

    l_nl8 = lasagne.layers.NonlinearityLayer(l_bn8, nonlinearity=activation)

    l_dn3 = laq.DenseLayer(l_nl8,
                           nonlinearity=lasagne.nonlinearities.identity,
                           num_units=10,
                           method=method)

    l_out = batch_norm.BatchNormLayer(l_dn3, epsilon=epsilon, alpha=alpha)

    train_output = lasagne.layers.get_output(l_out, deterministic=False)

    # squared hinge loss
    loss = T.mean(T.sqr(T.maximum(0., 1. - target * train_output)))

    if method != "FPN":
        # W updates
        W = lasagne.layers.get_all_params(l_out, quantized=True)
        W_grads = laq.compute_grads(loss, l_out)
        updates = optimizer.adam(loss_or_grads=W_grads,
                                 params=W,
                                 learning_rate=LR)
        updates = laq.clipping_scaling(updates, l_out)

        # other parameters updates
        params = lasagne.layers.get_all_params(l_out,
                                               trainable=True,
                                               quantized=False)
        updates = OrderedDict(updates.items() + optimizer.adam(
            loss_or_grads=loss, params=params, learning_rate=LR).items())

        ## update the ternary (quantized) weight copies
        ternary_weights = laq.get_quantized_weights(loss, l_out)
        updates2 = OrderedDict()
        idx = 0
        tt_tag = lasagne.layers.get_all_params(l_out, tt=True)
        for tt_tag_temp in tt_tag:
            updates2[tt_tag_temp] = ternary_weights[idx]
            idx = idx + 1
        updates = OrderedDict(updates.items() + updates2.items())

        ## update the 2nd moment estimate (could also be obtained from the Adam optimizer)
        updates3 = OrderedDict()
        acc_tag = lasagne.layers.get_all_params(l_out, acc=True)
        idx = 0
        beta2 = 0.999
        for acc_tag_temp in acc_tag:
            updates3[acc_tag_temp] = (acc_tag_temp * beta2 +
                                      W_grads[idx] * W_grads[idx] * (1 - beta2))
            idx += 1

        updates = OrderedDict(updates.items() + updates3.items())

    else:
        params = lasagne.layers.get_all_params(l_out, trainable=True)
        updates = optimizer.adam(loss_or_grads=loss,
                                 params=params,
                                 learning_rate=LR)

    test_output = lasagne.layers.get_output(l_out, deterministic=True)

    test_loss = T.mean(T.sqr(T.maximum(0., 1. - target * test_output)))
    test_err = T.mean(T.neq(T.argmax(test_output, axis=1),
                            T.argmax(target, axis=1)),
                      dtype=theano.config.floatX)

    train_fn = theano.function([input, target, LR], loss, updates=updates)

    val_fn = theano.function([input, target], [test_loss, test_err])

    ## load data
    print('Loading SVHN dataset')

    train_set = SVHN(
        which_set='splitted_train',
        # which_set= 'valid',
        path="${SVHN_LOCAL_PATH}",
        axes=['b', 'c', 0, 1])

    valid_set = SVHN(which_set='valid',
                     path="${SVHN_LOCAL_PATH}",
                     axes=['b', 'c', 0, 1])

    test_set = SVHN(which_set='test',
                    path="${SVHN_LOCAL_PATH}",
                    axes=['b', 'c', 0, 1])

    # bc01 format
    # print train_set.X.shape
    train_set.X = np.reshape(train_set.X, (-1, 3, 32, 32))
    valid_set.X = np.reshape(valid_set.X, (-1, 3, 32, 32))
    test_set.X = np.reshape(test_set.X, (-1, 3, 32, 32))

    train_set.y = np.array(train_set.y).flatten()
    valid_set.y = np.array(valid_set.y).flatten()
    test_set.y = np.array(test_set.y).flatten()

    # Onehot the targets
    train_set.y = np.float32(np.eye(10)[train_set.y])
    valid_set.y = np.float32(np.eye(10)[valid_set.y])
    test_set.y = np.float32(np.eye(10)[test_set.y])

    # for hinge loss
    train_set.y = 2 * train_set.y - 1.
    valid_set.y = 2 * valid_set.y - 1.
    test_set.y = 2 * test_set.y - 1.

    print('Training...')

    X_train = train_set.X
    y_train = train_set.y
    X_val = valid_set.X
    y_val = valid_set.y
    X_test = test_set.X
    y_test = test_set.y

    # This function trains the model a full epoch (on the whole dataset)
    def train_epoch(X, y, LR):

        loss = 0
        batches = len(X) / batch_size
        # move shuffle here to save memory
        # k = 5
        # batches = int(batches/k)*k
        shuffled_range = range(len(X))
        np.random.shuffle(shuffled_range)

        for i in range(batches):
            tmp_ind = shuffled_range[i * batch_size:(i + 1) * batch_size]
            newloss = train_fn(X[tmp_ind], y[tmp_ind], LR)
            loss += newloss
        loss /= batches
        return loss

    # This function tests the model a full epoch (on the whole dataset)
    def val_epoch(X, y):

        err = 0
        loss = 0
        batches = len(X) / batch_size

        for i in range(batches):
            new_loss, new_err = val_fn(X[i * batch_size:(i + 1) * batch_size],
                                       y[i * batch_size:(i + 1) * batch_size])
            err += new_err
            loss += new_loss

        err = err / batches * 100
        loss /= batches

        return err, loss

    best_val_err = 100
    best_epoch = 1
    LR = LR_start
    # We iterate over epochs:
    for epoch in range(1, num_epochs + 1):

        start_time = time.time()
        train_loss = train_epoch(X_train, y_train, LR)

        val_err, val_loss = val_epoch(X_val, y_val)

        # test if validation error went down
        if val_err <= best_val_err:

            best_val_err = val_err
            best_epoch = epoch

            test_err, test_loss = val_epoch(X_test, y_test)

        epoch_duration = time.time() - start_time

        # Then we print the results for this epoch:
        print("Epoch " + str(epoch) + " of " + str(num_epochs) + " took " +
              str(epoch_duration) + "s")
        print("  LR:                            " + str(LR))
        print("  training loss:                 " + str(train_loss))
        print("  validation loss:               " + str(val_loss))
        print("  validation error rate:         " + str(val_err) + "%")
        print("  best epoch:                    " + str(best_epoch))
        print("  best validation error rate:    " + str(best_val_err) + "%")
        print("  test loss:                     " + str(test_loss))
        print("  test error rate:               " + str(test_err) + "%")

        with open(
                "{0}/{1}_lr{2}_{3}.txt".format(method, name, LR_start, method),
                "a") as myfile:
            myfile.write(
                "{0}  {1:.5f} {2:.5f} {3:.5f} {4:.5f} {5:.5f} {6:.5f} {7:.5f}\n"
                .format(epoch, train_loss, val_loss, test_loss, val_err,
                        test_err, epoch_duration, LR))

        ## Learning rate update scheme
        if epoch == 15 or epoch == 25:
            LR *= LR_decay
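Both SVHN examples one-hot encode the labels and then map them to {-1, +1}, so the squared hinge loss T.mean(T.sqr(T.maximum(0., 1. - target * output))) asks the correct class for a margin of at least +1 and every other class for at most -1. A small NumPy illustration of the same loss on made-up outputs:

import numpy as np

def squared_hinge_loss(outputs, targets_pm1):
    """Mean squared hinge loss with per-class targets in {-1, +1}."""
    return np.mean(np.square(np.maximum(0.0, 1.0 - targets_pm1 * outputs)))

labels = np.array([2, 0])                    # integer class labels
targets = 2.0 * np.eye(3)[labels] - 1.0      # one-hot -> {-1, +1}, as in the scripts
outputs = np.array([[-0.9, -1.2, 0.8],
                    [1.1, -0.7, -1.0]])      # hypothetical network outputs
print(squared_hinge_loss(outputs, targets))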
Exemplo n.º 21
0
def main(method,LR_start):
	
	name = "cifar100"
	print("dataset = "+str(name))

	print("Method = "+str(method))

	# alpha is the exponential moving average factor
	alpha = .1
	print("alpha = "+str(alpha))
	epsilon = 1e-4
	print("epsilon = "+str(epsilon))
	
	# Training parameters
	batch_size = 100
	print("batch_size = "+str(batch_size))
	
	num_epochs = 200
	print("num_epochs = "+str(num_epochs))

	print("LR_start = "+str(LR_start))
	LR_decay = 0.5
	print("LR_decay="+str(LR_decay))

	activation = lasagne.nonlinearities.rectify
	

	train_set_size = 45000
	print("train_set_size = "+str(train_set_size))
	
	print('Loading CIFAR-100 dataset...')
	
	preprocessor = serial.load("${PYLEARN2_DATA_PATH}/cifar100/pylearn2_gcn_whitened/preprocessor.pkl")
	train_set = ZCA_Dataset(
		preprocessed_dataset=serial.load("${PYLEARN2_DATA_PATH}/cifar100/pylearn2_gcn_whitened/train.pkl"), 
		preprocessor = preprocessor,
		start=0, stop = train_set_size)
	valid_set = ZCA_Dataset(
		preprocessed_dataset= serial.load("${PYLEARN2_DATA_PATH}/cifar100/pylearn2_gcn_whitened/train.pkl"), 
		preprocessor = preprocessor,
		start=45000, stop = 50000)  
	test_set = ZCA_Dataset(
		preprocessed_dataset= serial.load("${PYLEARN2_DATA_PATH}/cifar100/pylearn2_gcn_whitened/test.pkl"), 
		preprocessor = preprocessor)
		
	# bc01 format
	train_set.X = train_set.X.reshape(-1,3,32,32)
	valid_set.X = valid_set.X.reshape(-1,3,32,32)
	test_set.X = test_set.X.reshape(-1,3,32,32)
	
	# flatten targets
	train_set.y = np.int32(np.hstack(train_set.y))
	valid_set.y = np.int32(np.hstack(valid_set.y))
	test_set.y = np.int32(np.hstack(test_set.y))
   

	print('Building the CNN...') 
	
	# Prepare Theano variables for inputs and targets
	input = T.tensor4('inputs')
	target = T.ivector('targets')
	LR = T.scalar('LR', dtype=theano.config.floatX)

	l_in = lasagne.layers.InputLayer(
			shape=(None, 3, 32, 32),
			input_var=input)
	
	# 128C3-128C3-P2             
	l_cnn1 = laq.Conv2DLayer(
			l_in, 
			num_filters=128, 
			filter_size=(3, 3),
			pad=1,
			nonlinearity=lasagne.nonlinearities.identity,
			method = method)

	l_bn1 = batch_norm.BatchNormLayer(
			l_cnn1,
			epsilon=epsilon, 
			alpha=alpha)

	l_nl1 = lasagne.layers.NonlinearityLayer(
			l_bn1,
			nonlinearity = activation)

	l_cnn2 = laq.Conv2DLayer(
			l_nl1, 
			num_filters=128, 
			filter_size=(3, 3),
			pad=1,
			nonlinearity=lasagne.nonlinearities.identity,
			method = method)
	
	l_mp1 = lasagne.layers.MaxPool2DLayer(l_cnn2, pool_size=(2, 2))
	
	l_bn2 = batch_norm.BatchNormLayer(
			l_mp1,
			epsilon=epsilon, 
			alpha=alpha)

	l_nl2 = lasagne.layers.NonlinearityLayer(
			l_bn2,
			nonlinearity = activation)			
	# 256C3-256C3-P2             
	l_cnn3 = laq.Conv2DLayer(
			l_nl2, 
			num_filters=256, 
			filter_size=(3, 3),
			pad=1,
			nonlinearity=lasagne.nonlinearities.identity,
			method = method)
	
	l_bn3 = batch_norm.BatchNormLayer(
			l_cnn3,
			epsilon=epsilon, 
			alpha=alpha)

	l_nl3 = lasagne.layers.NonlinearityLayer(
			l_bn3,
			nonlinearity = activation)
			
	l_cnn4 = laq.Conv2DLayer(
			l_nl3, 
			num_filters=256, 
			filter_size=(3, 3),
			pad=1,
			nonlinearity=lasagne.nonlinearities.identity,
			method = method)
	
	l_mp2 = lasagne.layers.MaxPool2DLayer(l_cnn4, pool_size=(2, 2))
	
	l_bn4 = batch_norm.BatchNormLayer(
			l_mp2,
			epsilon=epsilon, 
			alpha=alpha)
	
	l_nl4 = lasagne.layers.NonlinearityLayer(
			l_bn4,
			nonlinearity = activation)

	# 512C3-512C3-P2              
	l_cnn5 = laq.Conv2DLayer(
			l_nl4, 
			num_filters=512, 
			filter_size=(3, 3),
			pad=1,
			nonlinearity=lasagne.nonlinearities.identity,
			method = method)
	
	l_bn5 = batch_norm.BatchNormLayer(
			l_cnn5,
			epsilon=epsilon, 
			alpha=alpha)

	l_nl5 = lasagne.layers.NonlinearityLayer(
			l_bn5,
			nonlinearity = activation)
				  
	l_cnn6 = laq.Conv2DLayer(
			l_nl5, 
			num_filters=512, 
			filter_size=(3, 3),
			pad=1,
			nonlinearity=lasagne.nonlinearities.identity,
			method = method)
	
	l_mp3 = lasagne.layers.MaxPool2DLayer(l_cnn6, pool_size=(2, 2))
	
	l_bn6 = batch_norm.BatchNormLayer(
			l_mp3,
			epsilon=epsilon, 
			alpha=alpha)

	l_nl6 = lasagne.layers.NonlinearityLayer(
			l_bn6,
			nonlinearity = activation)

	# print(cnn.output_shape)
	
	# 1024FP-1024FP-100FP
	l_dn1 = laq.DenseLayer(
				l_nl6, 
				nonlinearity=lasagne.nonlinearities.identity,
				num_units=1024,
				method = method)      
				  
	l_bn7 = batch_norm.BatchNormLayer(
			l_dn1,
			epsilon=epsilon, 
			alpha=alpha)

	l_nl7 = lasagne.layers.NonlinearityLayer(
			l_bn7,
			nonlinearity = activation)

	l_dn2 = laq.DenseLayer(
				l_nl7, 
				nonlinearity=lasagne.nonlinearities.identity,
				num_units=1024,
				method = method)      
				  
	l_bn8 = batch_norm.BatchNormLayer(
			l_dn2,
			epsilon=epsilon, 
			alpha=alpha)

	l_nl8 = lasagne.layers.NonlinearityLayer(
			l_bn8,
			nonlinearity = activation)

	l_dn3 = laq.DenseLayer(
				l_nl8, 
				nonlinearity=lasagne.nonlinearities.identity,
				num_units=100,
				method = method)      

	l_out = lasagne.layers.NonlinearityLayer(l_dn3, nonlinearity=lasagne.nonlinearities.softmax) 



	train_output = lasagne.layers.get_output(l_out, deterministic=False)
	loss = categorical_crossentropy(train_output, target).mean()


	if method!="FPN":
		# W updates
		W = lasagne.layers.get_all_params(l_out, quantized=True)
		W_grads = laq.compute_grads(loss,l_out)
		updates = optimizer.adam(loss_or_grads=W_grads, params=W, learning_rate=LR)
		updates = laq.clipping_scaling(updates,l_out)
		
		# other parameters updates
		params = lasagne.layers.get_all_params(l_out, trainable=True, quantized=False)
		updates = OrderedDict(updates.items() + optimizer.adam(loss_or_grads=loss, params=params, learning_rate=LR).items())

		## update the ternary (quantized) weight copies
		ternary_weights = laq.get_quantized_weights(loss, l_out)
		updates2 = OrderedDict()
		idx = 0
		tt_tag = lasagne.layers.get_all_params(l_out, tt=True)	
		for tt_tag_temp in tt_tag:
			updates2[tt_tag_temp]= ternary_weights[idx]
			idx = idx+1
		updates = OrderedDict(updates.items() + updates2.items())

		## update the 2nd moment estimate (could also be obtained from the Adam optimizer)
		updates3 = OrderedDict()
		acc_tag = lasagne.layers.get_all_params(l_out, acc=True)	
		idx = 0
		beta2 = 0.999   
		for acc_tag_temp in acc_tag:
			updates3[acc_tag_temp]= acc_tag_temp*beta2 + W_grads[idx]*W_grads[idx]*(1-beta2)
			idx = idx+1

		updates = OrderedDict(updates.items() + updates3.items())	


	else:
		params = lasagne.layers.get_all_params(l_out, trainable=True)
		updates = optimizer.adam(loss_or_grads=loss, params=params, learning_rate=LR)

	test_output = lasagne.layers.get_output(l_out, deterministic=True)
	test_loss = categorical_crossentropy(test_output, target).mean()
	test_err = T.mean(T.neq(T.argmax(test_output, axis=1), target),dtype=theano.config.floatX)

	train_fn = theano.function([input, target, LR], loss, updates=updates)
	val_fn = theano.function([input, target], [test_loss, test_err])

	print('Training...')
	

	X_train = train_set.X
	y_train = train_set.y
	X_val = valid_set.X
	y_val = valid_set.y
	X_test = test_set.X
	y_test = test_set.y
	# This function trains the model a full epoch (on the whole dataset)
	def train_epoch(X,y,LR):
		
		loss = 0
		batches = len(X)/batch_size
		shuffled_range = range(len(X))
		np.random.shuffle(shuffled_range)

		for i in range(batches):
			tmp_ind = shuffled_range[i*batch_size:(i+1)*batch_size] 
			newloss = train_fn(X[tmp_ind],y[tmp_ind],LR) 
			loss +=newloss				

		loss/=batches		
		return loss
	
	# This function tests the model a full epoch (on the whole dataset)
	def val_epoch(X,y):
		
		err = 0
		loss = 0
		batches = len(X)/batch_size
		
		for i in range(batches):
			new_loss, new_err = val_fn(X[i*batch_size:(i+1)*batch_size], y[i*batch_size:(i+1)*batch_size])
			err += new_err
			loss += new_loss
		
		err = err / batches * 100
		loss /= batches

		return err, loss
	

	best_val_err = 100
	best_epoch = 1
	LR = LR_start
	# We iterate over epochs:
	for epoch in range(1, num_epochs+1):
		
		start_time = time.time()
		train_loss = train_epoch(X_train,y_train,LR)
		
		val_err, val_loss = val_epoch(X_val,y_val)
		
		# test if validation error went down
		if val_err <= best_val_err:
			
			best_val_err = val_err
			best_epoch = epoch
			test_err, test_loss = val_epoch(X_test,y_test)

		epoch_duration = time.time() - start_time
		
		# Then we print the results for this epoch:
		print("Epoch "+str(epoch)+" of "+str(num_epochs)+" took "+str(epoch_duration)+"s")
		print("  LR:                            "+str(LR))
		print("  training loss:                 "+str(train_loss))
		print("  validation loss:               "+str(val_loss))
		print("  validation error rate:         "+str(val_err)+"%")
		print("  best epoch:                    "+str(best_epoch))
		print("  best validation error rate:    "+str(best_val_err)+"%")
		print("  test loss:                     "+str(test_loss))
		print("  test error rate:               "+str(test_err)+"%") 
		

		with open("{0}/{1}_lr{2}_{3}.txt".format(method, name,  LR_start, method), "a") as myfile:
			myfile.write("{0}  {1:.5f} {2:.5f} {3:.5f} {4:.5f} {5:.5f} {6:.5f} {7:.5f}\n".format(epoch, 
				train_loss, val_loss, test_loss, val_err, test_err, epoch_duration, LR))


		if epoch % 15 ==0:
			LR*=LR_decay
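This CIFAR-100 run halves the learning rate after every 15th epoch, while the SVHN run above decays it only after epochs 15 and 25. A tiny helper that reproduces the periodic schedule, included purely for illustration (the 0.001 starting rate below is an arbitrary example):

def lr_at_epoch(epoch, lr_start, lr_decay=0.5, period=15):
    """LR used during `epoch` when `LR *= lr_decay` runs after every `period`-th epoch."""
    return lr_start * lr_decay ** ((epoch - 1) // period)

print([lr_at_epoch(e, 0.001) for e in (1, 16, 31)])   # [0.001, 0.0005, 0.00025]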
Exemplo n.º 22
0
    def __init__(self,
                 x,
                 y,
                 l,
                 window,
                 opt,
                 lr,
                 init_emb,
                 dim_emb,
                 dim_hidden,
                 n_vocab,
                 L2_reg,
                 unit,
                 sim='cos',
                 n_layers=1,
                 activation=tanh):
        self.tr_inputs = [x, y, l]
        self.pr_inputs = [x, y, l]

        self.x = x  # 1D: batch_size * l * 2, 2D: window; elem=word_id
        self.y = y  # 1D: batch_size; elem=label
        self.l = l  # scalar: elem=sentence length

        batch_size = y.shape[0]
        n_cands = x.shape[0] / batch_size / l

        self.pad = build_shared_zeros((1, dim_emb))
        if init_emb is None:
            self.emb = theano.shared(sample_weights(n_vocab - 1, dim_emb))
        else:
            self.emb = theano.shared(init_emb)
        self.E = T.concatenate([self.pad, self.emb], 0)
        self.W_out = theano.shared(sample_weights(dim_hidden, dim_hidden))
        self.params = [self.emb, self.W_out]
        """ Input Layer """
        e = self.E[x]  # e: 1D: batch_size * l * 2, 2D: window, 3D: dim_emb
        x_in = e.reshape((batch_size * n_cands, l, -1))
        """ Intermediate Layer """
        # h: 1D: n_batch * n_cands, 2D: dim_emb
        h, params = cnn.layers(x_in, window, dim_emb, dim_hidden, n_layers,
                               activation)
        self.params.extend(params)
        """ Output Layer """
        h = h.reshape((batch_size, n_cands, -1))
        h_1 = h[T.arange(batch_size), 0]
        h_2 = h[T.arange(batch_size), 1:]
        if sim == 'cos':
            y_score = cosign_similarity(h_1, h_2)
        else:
            y_score = T.batched_dot(T.dot(h_1, self.W_out),
                                    h_2.dimshuffle(0, 2, 1))
        y_score_hat = T.max(y_score, 1)
        """ Objective Function """
        self.nll = max_margin_loss(y_score_hat, y_score[T.arange(batch_size),
                                                        y])
        self.L2_sqr = regularization(self.params)
        self.cost = self.nll + L2_reg * self.L2_sqr / 2.
        """ Optimization """
        if opt == 'adagrad':
            self.update = ada_grad(cost=self.cost, params=self.params, lr=lr)
        elif opt == 'ada_delta':
            self.update = ada_delta(cost=self.cost, params=self.params)
        elif opt == 'adam':
            self.update = adam(cost=self.cost, params=self.params, lr=lr)
        else:
            self.update = sgd(cost=self.cost, params=self.params, lr=lr)
        """ Predicts """
        y_hat = T.argmax(y_score, 1)
        """ Check Accuracies """
        self.correct = T.eq(y_hat, y)
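In the 'cos' branch above, the first candidate representation is scored against all the others and the highest-scoring candidate is predicted. A hedged NumPy sketch of that scoring and prediction step; cosine_scores is an illustrative stand-in for the repository's cosign_similarity, whose definition is not shown here:

import numpy as np

def cosine_scores(h1, h2, eps=1e-8):
    """Cosine similarity between h1 (batch, dim) and each candidate in h2 (batch, n_cands, dim)."""
    h1n = h1 / (np.linalg.norm(h1, axis=-1, keepdims=True) + eps)
    h2n = h2 / (np.linalg.norm(h2, axis=-1, keepdims=True) + eps)
    return np.einsum('bd,bnd->bn', h1n, h2n)

h1 = np.random.randn(2, 4)       # anchor representation per example (like h_1)
h2 = np.random.randn(2, 3, 4)    # three candidate representations per example (like h_2)
scores = cosine_scores(h1, h2)   # (batch, n_cands), analogous to y_score
y_hat = scores.argmax(axis=1)    # predicted candidate, analogous to T.argmax(y_score, 1)
print(scores.shape, y_hat)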
Exemplo n.º 23
0
def main(method,LR_start):
	
	# BN parameters
	name = "mnist"
	print("dataset = "+str(name))
	print("Method = "+str(method))
	# alpha is the exponential moving average factor
	alpha = .1
	print("alpha = "+str(alpha))
	epsilon = 1e-4
	print("epsilon = "+str(epsilon))
	
	batch_size = 100
	print("batch_size = "+str(batch_size))

	num_epochs = 50
	print("num_epochs = "+str(num_epochs))

	# network structure
	num_units = 2048
	print("num_units = "+str(num_units))
	n_hidden_layers = 3
	print("n_hidden_layers = "+str(n_hidden_layers))

	print("LR_start = "+str(LR_start))
	LR_decay = 0.1
	print("LR_decay="+str(LR_decay))
	
	activation = lasagne.nonlinearities.rectify


	print('Loading MNIST dataset...')
	
	train_set = MNIST(which_set= 'train', start=0, stop = 50000, center = True)
	valid_set = MNIST(which_set= 'train', start=50000, stop = 60000, center = True)
	test_set = MNIST(which_set= 'test', center = True)
	
	# bc01 format
	train_set.X = train_set.X.reshape(-1, 1, 28, 28)
	valid_set.X = valid_set.X.reshape(-1, 1, 28, 28)
	test_set.X = test_set.X.reshape(-1, 1, 28, 28)
	
	# flatten targets
	train_set.y = np.hstack(train_set.y)
	valid_set.y = np.hstack(valid_set.y)
	test_set.y = np.hstack(test_set.y)
	
	# Onehot the targets
	train_set.y = np.float32(np.eye(10)[train_set.y])    
	valid_set.y = np.float32(np.eye(10)[valid_set.y])
	test_set.y = np.float32(np.eye(10)[test_set.y])
	
	# for hinge loss
	train_set.y = 2* train_set.y - 1.
	valid_set.y = 2* valid_set.y - 1.
	test_set.y = 2* test_set.y - 1.

	print('Building the MLP...') 
	
	# Prepare Theano variables for inputs and targets
	input = T.tensor4('inputs')
	target = T.matrix('targets')
	LR = T.scalar('LR', dtype=theano.config.floatX)

	mlp = lasagne.layers.InputLayer(
			shape=(None, 1, 28, 28),
			input_var=input)
	
	for k in range(n_hidden_layers):
		mlp = laq.DenseLayer(
				mlp, 
				nonlinearity=lasagne.nonlinearities.identity,
				num_units=num_units,
				method = method)                  	
		mlp = batch_norm.BatchNormLayer(
				mlp,
				epsilon=epsilon, 
				alpha=alpha)
		mlp = lasagne.layers.NonlinearityLayer(
				mlp,
				nonlinearity = activation)

	mlp = laq.DenseLayer(
				mlp, 
				nonlinearity=lasagne.nonlinearities.identity,
				num_units=10,
				method = method)      
				  
	mlp = batch_norm.BatchNormLayer(
			mlp,
			epsilon=epsilon, 
			alpha=alpha)

	train_output = lasagne.layers.get_output(mlp, deterministic=False)
	# squared hinge loss
	loss = T.mean(T.sqr(T.maximum(0.,1.-target*train_output)))
	

	if method!="FPN":
		
		# W updates
		W = lasagne.layers.get_all_params(mlp, quantized=True)
		W_grads = laq.compute_grads(loss,mlp)
		updates = optimizer.adam(loss_or_grads=W_grads, params=W, learning_rate=LR)
		updates = laq.clipping_scaling(updates,mlp)
		
		# other parameters updates
		params = lasagne.layers.get_all_params(mlp, trainable=True, quantized=False)
		updates = OrderedDict(updates.items() + optimizer.adam(loss_or_grads=loss, params=params, 
			learning_rate=LR, epsilon = 1e-8).items())


		## update the ternary matrix
		ternary_weights = laq.get_quantized_weights(loss, mlp)
		updates2 = OrderedDict()
		idx = 0
		tt_tag = lasagne.layers.get_all_params(mlp, tt=True)	
		for tt_tag_temp in tt_tag:
			updates2[tt_tag_temp]= ternary_weights[idx]
			idx = idx+1
		updates = OrderedDict(updates.items() + updates2.items())

		## update the 2nd moment estimate (could also be obtained from the Adam optimizer)
		updates3 = OrderedDict()
		acc_tag = lasagne.layers.get_all_params(mlp, acc=True)	
		idx = 0
		beta2 = 0.999
		for acc_tag_temp in acc_tag:
			updates3[acc_tag_temp]= acc_tag_temp*beta2 + W_grads[idx]*W_grads[idx]*(1-beta2)
			idx = idx+1

		updates = OrderedDict(updates.items() + updates3.items())

	else:
		params = lasagne.layers.get_all_params(mlp, trainable=True)
		updates = optimizer.adam(loss_or_grads=loss, params=params, learning_rate=LR)

	test_output = lasagne.layers.get_output(mlp, deterministic=True)
		
	test_loss = T.mean(T.sqr(T.maximum(0.,1.-target*test_output)))
	test_err = T.mean(T.neq(T.argmax(test_output, axis=1), T.argmax(target, axis=1)),dtype=theano.config.floatX)
	

	train_fn = theano.function([input, target, LR], loss, updates=updates)

	val_fn = theano.function([input, target], [test_loss, test_err])

	print('Training...')
	
	

	X_train = train_set.X
	y_train = train_set.y
	X_val = valid_set.X
	y_val = valid_set.y
	X_test = test_set.X
	y_test = test_set.y
	# This function trains the model a full epoch (on the whole dataset)
	def train_epoch(X,y,LR):
		
		loss = 0
		batches = len(X)/batch_size
		shuffled_range = range(len(X))
		np.random.shuffle(shuffled_range)
		for i in range(batches):
			tmp_ind = shuffled_range[i*batch_size:(i+1)*batch_size]  
			newloss = train_fn(X[tmp_ind],y[tmp_ind],LR) 
			loss +=newloss	

		loss/=batches		
		return loss
	
	# This function tests the model a full epoch (on the whole dataset)
	def val_epoch(X,y):
		
		err = 0
		loss = 0
		batches = len(X)/batch_size
		
		for i in range(batches):
			new_loss, new_err = val_fn(X[i*batch_size:(i+1)*batch_size], y[i*batch_size:(i+1)*batch_size])
			err += new_err
			loss += new_loss
		
		err = err / batches * 100
		loss /= batches

		return err, loss
	

	best_val_err = 100
	best_epoch = 1
	LR = LR_start
	# We iterate over epochs:
	for epoch in range(1, num_epochs+1):
		start_time = time.time()
		train_loss = train_epoch(X_train,y_train,LR)
		val_err, val_loss = val_epoch(X_val,y_val)

		# test if validation error went down
		if val_err <= best_val_err:
			best_val_err = val_err
			best_epoch = epoch
			test_err, test_loss = val_epoch(X_test,y_test)
			all_params = lasagne.layers.get_all_params(mlp)
			np.savez('{0}/{1}_lr{2}_{3}.npz'.format(method, name,  LR_start, method), *all_params)

		epoch_duration = time.time() - start_time
		
		# Then we print the results for this epoch:
		print("Epoch "+str(epoch)+" of "+str(num_epochs)+" took "+str(epoch_duration)+"s")
		print("  LR:                            "+str(LR))
		print("  training loss:                 "+str(train_loss))
		print("  validation loss:               "+str(val_loss))
		print("  validation error rate:         "+str(val_err)+"%")
		print("  best epoch:                    "+str(best_epoch))
		print("  best validation error rate:    "+str(best_val_err)+"%")
		print("  test loss:                     "+str(test_loss))
		print("  test error rate:               "+str(test_err)+"%") 
		

		with open("{0}/{1}_lr{2}_{3}.txt".format(method,name,  LR_start, method), "a") as myfile:
			myfile.write("{0}  {1:.5f} {2:.5f} {3:.5f} {4:.5f} {5:.5f} {6:.5f} {7:.5f}\n".format(epoch, 
				train_loss, val_loss, test_loss, val_err, test_err, epoch_duration, LR))

		# Learning rate update scheme
		if epoch == 15 or epoch==25:
			LR*=LR_decay
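The MNIST script snapshots the network with np.savez whenever the validation error improves. Checkpoints like this are usually written from lasagne.layers.get_all_param_values and restored with lasagne.layers.set_all_param_values; below is a minimal restore sketch, assuming the archive stores the parameter value arrays in order (the file name is only the script's naming pattern, not a real path):

import numpy as np
import lasagne

def load_checkpoint(network, path):
    """Restore parameter values saved with np.savez(path, *param_value_arrays)."""
    with np.load(path) as archive:
        values = [archive['arr_%d' % i] for i in range(len(archive.files))]
    lasagne.layers.set_all_param_values(network, values)

# usage, following the script's '<method>/<name>_lr<LR_start>_<method>.npz' pattern:
# load_checkpoint(mlp, '<method>/mnist_lr<LR_start>_<method>.npz')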
Exemplo n.º 24
0
def main(method,LR_start,Binarize_weight_only):
	
	name = "cifar"
	print("dataset = "+str(name))

	print("Binarize_weight_only="+str(Binarize_weight_only))

	print("Method = "+str(method))

	# alpha is the exponential moving average factor
	alpha = .1
	print("alpha = "+str(alpha))
	epsilon = 1e-4
	print("epsilon = "+str(epsilon))
	
	# Training parameters
	batch_size = 50
	print("batch_size = "+str(batch_size))
	
	num_epochs = 200
	print("num_epochs = "+str(num_epochs))

	print("LR_start = "+str(LR_start))
	LR_decay = 0.5
	print("LR_decay="+str(LR_decay))

	if Binarize_weight_only =="w":
		activation = lasagne.nonlinearities.rectify
	else:
		activation = lab.binary_tanh_unit
	print("activation = "+ str(activation))
	

	train_set_size = 45000
	print("train_set_size = "+str(train_set_size))
	
	print('Loading CIFAR-10 dataset...')
	
	preprocessor = serial.load("${PYLEARN2_DATA_PATH}/cifar10/pylearn2_gcn_whitened/preprocessor.pkl")
	train_set = ZCA_Dataset(
		preprocessed_dataset=serial.load("${PYLEARN2_DATA_PATH}/cifar10/pylearn2_gcn_whitened/train.pkl"), 
		preprocessor = preprocessor,
		start=0, stop = train_set_size)
	valid_set = ZCA_Dataset(
		preprocessed_dataset= serial.load("${PYLEARN2_DATA_PATH}/cifar10/pylearn2_gcn_whitened/train.pkl"), 
		preprocessor = preprocessor,
		start=45000, stop = 50000)  
	test_set = ZCA_Dataset(
		preprocessed_dataset= serial.load("${PYLEARN2_DATA_PATH}/cifar10/pylearn2_gcn_whitened/test.pkl"), 
		preprocessor = preprocessor)
		
	# bc01 format
	train_set.X = train_set.X.reshape(-1,3,32,32)
	valid_set.X = valid_set.X.reshape(-1,3,32,32)
	test_set.X = test_set.X.reshape(-1,3,32,32)
	
	# flatten targets
	train_set.y = np.hstack(train_set.y)
	valid_set.y = np.hstack(valid_set.y)
	test_set.y = np.hstack(test_set.y)

   
	# Onehot the targets
	train_set.y = np.float32(np.eye(10)[train_set.y])    
	valid_set.y = np.float32(np.eye(10)[valid_set.y])
	test_set.y = np.float32(np.eye(10)[test_set.y])
	
	# for hinge loss
	train_set.y = 2* train_set.y - 1.
	valid_set.y = 2* valid_set.y - 1.
	test_set.y = 2* test_set.y - 1.

	print('Building the CNN...') 
	
	# Prepare Theano variables for inputs and targets
	input = T.tensor4('inputs')
	target = T.matrix('targets')
	LR = T.scalar('LR', dtype=theano.config.floatX)

	l_in = lasagne.layers.InputLayer(
			shape=(None, 3, 32, 32),
			input_var=input)
	
	# 128C3-128C3-P2             
	l_cnn1 = lab.Conv2DLayer(
			l_in, 
			num_filters=128, 
			filter_size=(3, 3),
			pad=1,
			nonlinearity=lasagne.nonlinearities.identity,
			method = method)

	l_bn1 = batch_norm.BatchNormLayer(
			l_cnn1,
			epsilon=epsilon, 
			alpha=alpha)

	l_nl1 = lasagne.layers.NonlinearityLayer(
			l_bn1,
			nonlinearity = activation)

	l_cnn2 = lab.Conv2DLayer(
			l_nl1, 
			num_filters=128, 
			filter_size=(3, 3),
			pad=1,
			nonlinearity=lasagne.nonlinearities.identity,
			method = method)
	
	l_mp1 = lasagne.layers.MaxPool2DLayer(l_cnn2, pool_size=(2, 2))
	
	l_bn2 = batch_norm.BatchNormLayer(
			l_mp1,
			epsilon=epsilon, 
			alpha=alpha)

	l_nl2 = lasagne.layers.NonlinearityLayer(
			l_bn2,
			nonlinearity = activation)			
	# 256C3-256C3-P2             
	l_cnn3 = lab.Conv2DLayer(
			l_nl2, 
			num_filters=256, 
			filter_size=(3, 3),
			pad=1,
			nonlinearity=lasagne.nonlinearities.identity,
			method = method)
	
	l_bn3 = batch_norm.BatchNormLayer(
			l_cnn3,
			epsilon=epsilon, 
			alpha=alpha)

	l_nl3 = lasagne.layers.NonlinearityLayer(
			l_bn3,
			nonlinearity = activation)
			
	l_cnn4 = lab.Conv2DLayer(
			l_nl3, 
			num_filters=256, 
			filter_size=(3, 3),
			pad=1,
			nonlinearity=lasagne.nonlinearities.identity,
			method = method)
	
	l_mp2 = lasagne.layers.MaxPool2DLayer(l_cnn4, pool_size=(2, 2))
	
	l_bn4 = batch_norm.BatchNormLayer(
			l_mp2,
			epsilon=epsilon, 
			alpha=alpha)
	
	l_nl4 = lasagne.layers.NonlinearityLayer(
			l_bn4,
			nonlinearity = activation)

	# 512C3-512C3-P2              
	l_cnn5 = lab.Conv2DLayer(
			l_nl4, 
			num_filters=512, 
			filter_size=(3, 3),
			pad=1,
			nonlinearity=lasagne.nonlinearities.identity,
			method = method)
	
	l_bn5 = batch_norm.BatchNormLayer(
			l_cnn5,
			epsilon=epsilon, 
			alpha=alpha)

	l_nl5 = lasagne.layers.NonlinearityLayer(
			l_bn5,
			nonlinearity = activation)
				  
	l_cnn6 = lab.Conv2DLayer(
			l_nl5, 
			num_filters=512, 
			filter_size=(3, 3),
			pad=1,
			nonlinearity=lasagne.nonlinearities.identity,
			method = method)
	
	l_mp3 = lasagne.layers.MaxPool2DLayer(l_cnn6, pool_size=(2, 2))
	
	l_bn6 = batch_norm.BatchNormLayer(
			l_mp3,
			epsilon=epsilon, 
			alpha=alpha)

	l_nl6 = lasagne.layers.NonlinearityLayer(
			l_bn6,
			nonlinearity = activation)

	# print(cnn.output_shape)
	
	# 1024FP-1024FP-10FP            
	l_dn1 = lab.DenseLayer(
				l_nl6, 
				nonlinearity=lasagne.nonlinearities.identity,
				num_units=1024,
				method = method)      
				  
	l_bn7 = batch_norm.BatchNormLayer(
			l_dn1,
			epsilon=epsilon, 
			alpha=alpha)

	l_nl7 = lasagne.layers.NonlinearityLayer(
			l_bn7,
			nonlinearity = activation)

	l_dn2 = lab.DenseLayer(
				l_nl7, 
				nonlinearity=lasagne.nonlinearities.identity,
				num_units=1024,
				method = method)      
				  
	l_bn8 = batch_norm.BatchNormLayer(
			l_dn2,
			epsilon=epsilon, 
			alpha=alpha)

	l_nl8 = lasagne.layers.NonlinearityLayer(
			l_bn8,
			nonlinearity = activation)

	l_dn3 = lab.DenseLayer(
				l_nl8, 
				nonlinearity=lasagne.nonlinearities.identity,
				num_units=10,
				method = method)      
				  
	l_out = batch_norm.BatchNormLayer(
			l_dn3,
			epsilon=epsilon, 
			alpha=alpha)

	train_output = lasagne.layers.get_output(l_out, deterministic=False)
	
	# squared hinge loss
	loss = T.mean(T.sqr(T.maximum(0.,1.-target*train_output)))
	
	if method!="FPN":
		# W updates
		W = lasagne.layers.get_all_params(l_out, binary=True)
		W_grads = lab.compute_grads(loss,l_out)
		updates = optimizer.adam(loss_or_grads=W_grads, params=W, learning_rate=LR)
		updates = lab.clipping_scaling(updates,l_out)
		
		# other parameters updates
		params = lasagne.layers.get_all_params(l_out, trainable=True, binary=False)
		updates = OrderedDict(updates.items() + optimizer.adam(loss_or_grads=loss, params=params, learning_rate=LR).items())

		## update the 2nd moment estimate (could also be obtained from the Adam optimizer)
		updates3 = OrderedDict()
		acc_tag = lasagne.layers.get_all_params(l_out, acc=True)	
		idx = 0
		beta2 = 0.999   
		for acc_tag_temp in acc_tag:
			updates3[acc_tag_temp]= acc_tag_temp*beta2 + W_grads[idx]*W_grads[idx]*(1-beta2)
			idx = idx+1

		updates = OrderedDict(updates.items() + updates3.items())	
	else:
		params = lasagne.layers.get_all_params(l_out, trainable=True)
		updates = optimizer.adam(loss_or_grads=loss, params=params, learning_rate=LR)

	test_output = lasagne.layers.get_output(l_out, deterministic=True)
	test_loss = T.mean(T.sqr(T.maximum(0.,1.-target*test_output)))
	test_err = T.mean(T.neq(T.argmax(test_output, axis=1), T.argmax(target, axis=1)),dtype=theano.config.floatX)
	
	# Compile a function performing a training step on a mini-batch (by giving the updates dictionary) 
	# and returning the corresponding training loss:
	train_fn = theano.function([input, target, LR], loss, updates=updates)
	val_fn = theano.function([input, target], [test_loss, test_err])

	print('Training...')
	
	lab.train(
			name, method,
			train_fn,val_fn,
			batch_size,
			LR_start,LR_decay,
			num_epochs,
			train_set.X,train_set.y,
			valid_set.X,valid_set.y,
			test_set.X,test_set.y)
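lab.clipping_scaling post-processes the Adam updates of the binary-tagged weights; in BinaryConnect-style training the real-valued shadow weights are typically clipped back into [-1, 1] after each step so the subsequent binarization stays well defined. The sketch below shows only that assumed clipping step, not the actual implementation of clipping_scaling:

import numpy as np

def clip_shadow_weights(updated_w, w_min=-1.0, w_max=1.0):
    """Clip real-valued shadow weights back into [w_min, w_max] after an optimizer step."""
    return np.clip(updated_w, w_min, w_max)

w = np.array([-1.7, -0.3, 0.2, 2.5])   # shadow weights after a hypothetical Adam step
print(clip_shadow_weights(w))          # -> [-1.  -0.3  0.2  1. ]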
Exemplo n.º 25
0
    def build_model(self, lr=0.001, dropout=None):
        def concatenate(tensor_list, axis=0):
            concat_size = sum(tt.shape[axis] for tt in tensor_list)
            output_shape = ()
            for k in range(axis):
                output_shape += (tensor_list[0].shape[k], )
            output_shape += (concat_size, )
            for k in range(axis + 1, tensor_list[0].ndim):
                output_shape += (tensor_list[0].shape[k], )
            out = T.zeros(output_shape)
            offset = 0
            for tt in tensor_list:
                indices = ()
                for k in range(axis):
                    indices += (slice(None), )
                indices += (slice(offset, offset + tt.shape[axis]), )
                for k in range(axis + 1, tensor_list[0].ndim):
                    indices += (slice(None), )

                out = T.set_subtensor(out[indices], tt)
                offset += tt.shape[axis]

            return out

        trng = RandomStreams(1234)
        use_noise = theano.shared(np.float32(0.))

        # description string: #words x #samples

        x = T.matrix('x', dtype='int32')  # step * samples
        x_mask = T.matrix('x_mask', dtype='float32')  # step * samples
        y = T.matrix('y', dtype='int32')  # sample * emb
        ctx = T.tensor3('ctx', dtype='float32')  # sample * annotation * dim

        n_timesteps = x.shape[0]
        n_samples = x.shape[1]

        xr = x[::-1]
        xr_mask = x_mask[::-1]

        emb = self.W_emb[x.flatten()]
        emb = emb.reshape([n_timesteps, n_samples, self.dim_word])

        embr = self.W_emb[xr.flatten()]
        embr = embr.reshape([n_timesteps, n_samples, self.dim_word])

        ctx0 = ctx
        ctx_mean = ctx0.mean(1)

        init_state = T.dot(ctx_mean, self.W_ctx_init) + self.b_ctx_init

        # proj : list of GRU hidden states
        proj = self.gru_layer(emb,
                              mask=x_mask,
                              context=ctx,
                              init_state=init_state)
        proj_h = proj[0]

        projr = self.gru_layer(embr,
                               mask=xr_mask,
                               context=ctx,
                               init_state=init_state)
        projr_h = projr[0]

        concat_proj_h = concatenate([proj_h, projr_h[::-1]],
                                    axis=proj_h.ndim - 1)
        # concat_proj_h : step * samples * (dim*2)
        concat_proj_h = (concat_proj_h *
                         x_mask[:, :, None]).sum(0) / x_mask.sum(0)[:, None]
        # concat_proj_h after the mask-weighted mean : samples * (dim*2)

        if dropout is not None:
            concat_proj_h = dropout_layer(concat_proj_h, use_noise, trng,
                                          dropout)

        output = T.dot(concat_proj_h, self.W_pred) + self.b_pred

        probs = T.nnet.softmax(output)
        prediction = probs.argmax(axis=1)

        ## avoid NaN
        epsilon = 1.0e-9
        probs = T.clip(probs, epsilon, 1.0 - epsilon)
        probs /= probs.sum(axis=-1, keepdims=True)
        ## avoid NaN

        cost = T.nnet.categorical_crossentropy(probs, y)
        cost = T.mean(cost)

        updates = optimizer.adam(cost=cost, params=self.params, lr=lr)

        return trng, use_noise, x, x_mask, ctx, y, cost, updates, prediction
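Clipping the softmax output away from 0 and 1 before the cross-entropy keeps log(0) out of the cost, and the renormalisation restores a proper distribution. A small NumPy illustration of the same guard:

import numpy as np

def stabilise_probs(probs, eps=1.0e-9):
    """Clip probabilities into [eps, 1 - eps] and renormalise each row."""
    probs = np.clip(probs, eps, 1.0 - eps)
    return probs / probs.sum(axis=-1, keepdims=True)

p = np.array([[1.0, 0.0, 0.0]])        # a degenerate softmax output
safe_p = stabilise_probs(p)
print(-np.log(safe_p[0, 1]))           # finite instead of inf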