Exemplo n.º 1
0
def cmmva_6layer_svhn(learning_rate=0.01,
            n_epochs=600,
            dataset='svhngcn_var',
            batch_size=500,
            dropout_flag=1,
            seed=0,
            predir=None,
            activation=None,
            n_batch=625,
            weight_decay=1e-4,
            super_predir=None,
            super_preepoch=None):

    """
    Implementation of convolutional MMVA
    """    
    '''
    svhn
    '''
    n_channels = 3
    colorImg = True
    dim_w = 32
    dim_h = 32
    dim_input=(dim_h, dim_w)
    n_classes = 10

    D = 1.0
    C = 1.0
    if os.environ.has_key('C'):
        C = np.cast['float32'](float((os.environ['C'])))
    if os.environ.has_key('D'):
        D = np.cast['float32'](float((os.environ['D'])))
    color.printRed('D '+str(D)+' C '+str(C))
    
    first_drop=0.5
    if os.environ.has_key('first_drop'):
        first_drop = float(os.environ['first_drop'])
    last_drop=1
    if os.environ.has_key('last_drop'):
        last_drop = float(os.environ['last_drop'])
    nkerns_1=96
    if os.environ.has_key('nkerns_1'):
        nkerns_1 = int(os.environ['nkerns_1'])
    nkerns_2=96
    if os.environ.has_key('nkerns_2'):
        nkerns_2 = int(os.environ['nkerns_2'])
    n_z=512
    if os.environ.has_key('n_z'):
        n_z = int(os.environ['n_z'])
    opt_med='adam'
    if os.environ.has_key('opt_med'):
        opt_med = os.environ['opt_med']
    train_logvar=True
    if os.environ.has_key('train_logvar'):
        train_logvar = bool(int(os.environ['train_logvar']))
    std = 2e-2
    if os.environ.has_key('std'):
        std = os.environ['std']
    Loss_L = 1
    if os.environ.has_key('Loss_L'):
        Loss_L = int(os.environ['Loss_L'])
    pattern = 'hinge'
    if os.environ.has_key('pattern'):
        pattern = os.environ['pattern']


    #cp->cd->cpd->cd->c
    nkerns=[nkerns_1, nkerns_1, nkerns_1, nkerns_2, nkerns_2]
    drops=[0, 1, 1, 1, 0, 1]
    drop_p=[1, first_drop, first_drop, first_drop, 1, last_drop]
    n_hidden=[n_z]
    
    logdir = 'results/supervised/cmmva/svhn/cmmva_6layer_'+dataset+pattern+'_D_'+str(D)+'_C_'+str(C)+'_'#+str(nkerns)+str(n_hidden)+'_'+str(weight_decay)+'_'+str(learning_rate)+'_'
    #if predir is not None:
    #    logdir +='pre_'
    #if dropout_flag == 1:
    #    logdir += ('dropout_'+str(drops)+'_')
    #    logdir += ('drop_p_'+str(drop_p)+'_')
    #logdir += ('trainvar_'+str(train_logvar)+'_')
    #logdir += (opt_med+'_')
    #logdir += (str(Loss_L)+'_')
    #if super_predir is not None:
    #    logdir += (str(super_preepoch)+'_')
    logdir += str(int(time.time()))+'/'

    if not os.path.exists(logdir): os.makedirs(logdir)

    print 'logdir:', logdir, 'predir', predir
    print 'cmmva_6layer_svhn_fix', nkerns, n_hidden, seed, dropout_flag, drops, drop_p
    with open(logdir+'hook.txt', 'a') as f:
        print >>f, 'logdir:', logdir, 'predir', predir
        print >>f, 'cmmva_6layer_svhn_fix', nkerns, n_hidden, seed, dropout_flag, drops, drop_p

    color.printRed('dataset '+dataset)

    datasets = datapy.load_data_svhn(dataset, have_matrix=True)
    train_set_x, train_set_y, train_y_matrix = datasets[0]
    test_set_x, test_set_y, test_y_matrix = datasets[1]
    valid_set_x, valid_set_y, valid_y_matrix = datasets[2]

    #datasets = datapy.load_data_svhn(dataset, have_matrix=False)
    #train_set_x, train_set_y = datasets[0]
    #test_set_x, test_set_y = datasets[1]
    #valid_set_x, valid_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
                        # [int] labels
    random_z = T.matrix('random_z')
    y_matrix = T.imatrix('y_matrix')
    drop = T.iscalar('drop')
    
    activation = nonlinearity.relu

    rng = np.random.RandomState(seed)
    rng_share = theano.tensor.shared_randomstreams.RandomStreams(0)

    input_x = x.reshape((batch_size, n_channels, dim_h, dim_w))
    
    recg_layer = []
    cnn_output = []
    l = []
    d = []

    #1
    recg_layer.append(ConvMaxPool_GauInit_DNN.ConvMaxPool_GauInit_DNN(
        rng,
        image_shape=(batch_size, n_channels, dim_h, dim_w),
        filter_shape=(nkerns[0], n_channels, 5, 5),
        poolsize=(2, 2),
        border_mode='same', 
        activation=activation,
        std=std
    ))
    if drops[0]==1:
        cnn_output.append(recg_layer[-1].drop_output(input=input_x, drop=drop, rng=rng_share, p=drop_p[0]))
    else:
        cnn_output.append(recg_layer[-1].output(input=input_x))
    l+=[1, 2]
    d+=[1, 0]

    #2
    recg_layer.append(ConvMaxPool_GauInit_DNN.ConvMaxPool_GauInit_DNN(
        rng,
        image_shape=(batch_size, nkerns[0], 16, 16),
        filter_shape=(nkerns[1], nkerns[0], 3, 3),
        poolsize=(1, 1),
        border_mode='same', 
        activation=activation,
        std=std
    ))
    if drops[1]==1:
        cnn_output.append(recg_layer[-1].drop_output(cnn_output[-1], drop=drop, rng=rng_share, p=drop_p[1]))
    else:
        cnn_output.append(recg_layer[-1].output(cnn_output[-1]))
    l+=[1, 2]
    d+=[1, 0]
    
    #3
    recg_layer.append(ConvMaxPool_GauInit_DNN.ConvMaxPool_GauInit_DNN(
        rng,
        image_shape=(batch_size, nkerns[1], 16, 16),
        filter_shape=(nkerns[2], nkerns[1], 3, 3),
        poolsize=(2, 2),
        border_mode='same', 
        activation=activation,
        std=std
    ))
    if drops[2]==1:
        cnn_output.append(recg_layer[-1].drop_output(cnn_output[-1], drop=drop, rng=rng_share, p=drop_p[2]))
    else:
        cnn_output.append(recg_layer[-1].output(cnn_output[-1]))
    l+=[1, 2]
    d+=[1, 0]

    #4
    recg_layer.append(ConvMaxPool_GauInit_DNN.ConvMaxPool_GauInit_DNN(
        rng,
        image_shape=(batch_size, nkerns[2], 8, 8),
        filter_shape=(nkerns[3], nkerns[2], 3, 3),
        poolsize=(1, 1),
        border_mode='same', 
        activation=activation,
        std=std
    ))
    if drops[3]==1:
        cnn_output.append(recg_layer[-1].drop_output(cnn_output[-1], drop=drop, rng=rng_share, p=drop_p[3]))
    else:
        cnn_output.append(recg_layer[-1].output(cnn_output[-1]))
    
    l+=[1, 2]
    d+=[1, 0]

    #5
    recg_layer.append(ConvMaxPool_GauInit_DNN.ConvMaxPool_GauInit_DNN(
        rng,
        image_shape=(batch_size, nkerns[3], 8, 8),
        filter_shape=(nkerns[4], nkerns[3], 3, 3),
        poolsize=(2, 2),
        border_mode='same', 
        activation=activation,
        std=std
    ))
    if drops[4]==1:
        cnn_output.append(recg_layer[-1].drop_output(cnn_output[-1], drop=drop, rng=rng_share, p=drop_p[4]))
    else:
        cnn_output.append(recg_layer[-1].output(cnn_output[-1]))
    
    l+=[1, 2]
    d+=[1, 0]


    mlp_input_x = cnn_output[-1].flatten(2)

    activations = []
    
    activations.append(mlp_input_x)

    classifier = Pegasos.Pegasos(
            input= activations[-1],
            rng=rng,
            n_in=nkerns[-1]*4*4,
            n_out=n_classes,
            weight_decay=0,
            loss=Loss_L,
            pattern=pattern
        )
    l+=[1, 2]
    d+=[1, 0]


    #stochastic layer
    recg_layer.append(GaussianHidden.GaussianHidden(
            rng=rng,
            input=mlp_input_x,
            n_in=4*4*nkerns[-1],
            n_out=n_hidden[0],
            activation=None
        ))
    l+=[1, 2]
    d+=[1, 0]
    l+=[1, 2]
    d+=[1, 0]

    z = recg_layer[-1].sample_z(rng_share)

    gene_layer = []
    z_output = []
    random_z_output = []

    #1
    gene_layer.append(FullyConnected.FullyConnected(
            rng=rng,
            n_in=n_hidden[-1],
            n_out=4*4*nkerns[-1],
            activation=activation
        ))
    
    z_output.append(gene_layer[-1].output(input=z))
    random_z_output.append(gene_layer[-1].output(input=random_z))
    l+=[1, 2]
    d+=[1, 0]
    
    input_z = z_output[-1].reshape((batch_size, nkerns[-1], 4, 4))
    input_random_z = random_z_output[-1].reshape((n_batch, nkerns[-1], 4, 4))
    
    #1
    gene_layer.append(UnpoolConvNon_GauInit_DNN.UnpoolConvNon_GauInit_DNN(
            rng,
            image_shape=(batch_size, nkerns[-1], 4, 4),
            filter_shape=(nkerns[-2], nkerns[-1], 3, 3),
            poolsize=(2, 2),
            border_mode='same', 
            activation=activation
        ))
    l+=[1, 2]
    d+=[1, 0]
    z_output.append(gene_layer[-1].output(input=input_z))
    random_z_output.append(gene_layer[-1].output_random_generation(input=input_random_z, n_batch=n_batch))
    
    #2
    gene_layer.append(UnpoolConvNon_GauInit_DNN.UnpoolConvNon_GauInit_DNN(
            rng,
            image_shape=(batch_size, nkerns[-2], 8, 8),
            filter_shape=(nkerns[-3], nkerns[-2], 3, 3),
            poolsize=(1, 1),
            border_mode='same', 
            activation=activation
        ))
    l+=[1, 2]
    d+=[1, 0]
    z_output.append(gene_layer[-1].output(input=z_output[-1]))
    random_z_output.append(gene_layer[-1].output_random_generation(input=random_z_output[-1], n_batch=n_batch))

    #3
    gene_layer.append(UnpoolConvNon_GauInit_DNN.UnpoolConvNon_GauInit_DNN(
            rng,
            image_shape=(batch_size, nkerns[-3], 8, 8),
            filter_shape=(nkerns[-4], nkerns[-3], 3, 3),
            poolsize=(2, 2),
            border_mode='same', 
            activation=activation
        ))
    l+=[1, 2]
    d+=[1, 0]
    z_output.append(gene_layer[-1].output(input=z_output[-1]))
    random_z_output.append(gene_layer[-1].output_random_generation(input=random_z_output[-1], n_batch=n_batch))

    
    #4
    gene_layer.append(UnpoolConvNon_GauInit_DNN.UnpoolConvNon_GauInit_DNN(
            rng,
            image_shape=(batch_size, nkerns[-4], 16, 16),
            filter_shape=(nkerns[-5], nkerns[-4], 3, 3),
            poolsize=(1, 1),
            border_mode='same', 
            activation=activation
        ))
    l+=[1, 2]
    d+=[1, 0]
    z_output.append(gene_layer[-1].output(input=z_output[-1]))
    random_z_output.append(gene_layer[-1].output_random_generation(input=random_z_output[-1], n_batch=n_batch))


    #5-1 stochastic layer 
    # for this layer, the activation is None to get a Guassian mean
    gene_layer.append(UnpoolConvNon_GauInit_DNN.UnpoolConvNon_GauInit_DNN(
            rng,
            image_shape=(batch_size, nkerns[-5], 16, 16),
            filter_shape=(n_channels, nkerns[-5], 5, 5),
            poolsize=(2, 2),
            border_mode='same', 
            activation=None
        ))
    l+=[1, 2]
    d+=[1, 0]
    x_mean=gene_layer[-1].output(input=z_output[-1])
    random_x_mean=gene_layer[-1].output_random_generation(input=random_z_output[-1], n_batch=n_batch)


    #5-2 stochastic layer 
    # for this layer, the activation is None to get logvar
    if train_logvar:
        gene_layer.append(UnpoolConvNon_GauInit_DNN.UnpoolConvNon_GauInit_DNN(
                rng,
                image_shape=(batch_size, nkerns[-5], 16, 16),
                filter_shape=(n_channels, nkerns[-5], 5, 5),
                poolsize=(2, 2),
                border_mode='same', 
                activation=None
            ))
        l+=[1, 2]
        d+=[1, 0]
        x_logvar=gene_layer[-1].output(input=z_output[-1])
        random_x_logvar=gene_layer[-1].output_random_generation(input=random_z_output[-1], n_batch=n_batch)
    else:
        x_logvar = theano.shared(np.ones((batch_size, n_channels, dim_h, dim_w), dtype='float32'))
        random_x_logvar = theano.shared(np.ones((n_batch, n_channels, dim_h, dim_w), dtype='float32'))

    gene_layer.append(NoParamsGaussianVisiable.NoParamsGaussianVisiable(
            #rng=rng,
            #mean=z_output[-1],
            #data=input_x,
        ))
    logpx = gene_layer[-1].logpx(mean=x_mean, logvar=x_logvar, data=input_x)
    random_x = gene_layer[-1].sample_x(rng_share=rng_share, mean=random_x_mean, logvar=random_x_logvar)

    #L = (logpx + logpz - logqz).sum()
    lowerbound = (
        (logpx + recg_layer[-1].logpz - recg_layer[-1].logqz).mean()
    )
    hinge_loss = classifier.hinge_loss(10, y, y_matrix)
    
    cost = D * lowerbound - C * hinge_loss

    px = (logpx.mean())
    pz = (recg_layer[-1].logpz.mean())
    qz = (- recg_layer[-1].logqz.mean())

    super_params=[]
    for r in recg_layer[:-1]:
        super_params+=r.params
    super_params+=classifier.params

    params=[]
    for g in gene_layer:
        params+=g.params
    for r in recg_layer:
        params+=r.params
    params+=classifier.params
    grads = [T.grad(cost, param) for param in params]

    l_r = theano.shared(np.asarray(learning_rate, dtype=np.float32))
    #get_optimizer = optimizer.get_adam_optimizer(learning_rate=learning_rate)
    if opt_med=='adam':
        get_optimizer = optimizer_separated.get_adam_optimizer_max(learning_rate=l_r, decay1 = 0.1, decay2 = 0.001, weight_decay=weight_decay)
    elif opt_med=='mom':
        get_optimizer = optimizer_separated.get_momentum_optimizer_max(learning_rate=l_r, weight_decay=weight_decay)
    updates = get_optimizer(w=params,g=grads, l=l, d=d)

    # compiling a Theano function that computes the mistakes that are made
    # by the model on a minibatch
    test_model = theano.function(
        inputs=[index],
        outputs=[classifier.errors(y), lowerbound, hinge_loss, cost],
        #outputs=layer[-1].errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size],
            y_matrix: test_y_matrix[index * batch_size: (index + 1) * batch_size],
            drop: np.cast['int32'](0)
        }
    )

    valid_model = theano.function(
        inputs=[index],
        outputs=[classifier.errors(y), lowerbound, hinge_loss, cost],
        #outputs=layer[-1].errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size],
            y_matrix: valid_y_matrix[index * batch_size: (index + 1) * batch_size],
            drop: np.cast['int32'](0)
        }
    )
    

    valid_error = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        #outputs=layer[-1].errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size],
            #y_matrix: valid_y_matrix[index * batch_size: (index + 1) * batch_size],
            drop: np.cast['int32'](0)
        }
    )




    '''
    Save parameters and activations
    '''

    pog = []
    for (p,g) in zip(params, grads):
        pog.append(p.max())
        pog.append((p**2).mean())
        pog.append((g**2).mean())
        pog.append((T.sqrt(pog[-2] / pog[-1]))/ 1e3)

    paramovergrad = theano.function(
        inputs=[index],
        outputs=pog,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size],
            y_matrix: train_y_matrix[index * batch_size: (index + 1) * batch_size],
            drop: np.cast['int32'](dropout_flag)
        }
    )

    parameters = theano.function(
        inputs=[],
        outputs=params,
    )

    generation_check = theano.function(
        inputs=[index],
        outputs=[x, x_mean.flatten(2), x_logvar.flatten(2)],
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            #y: train_set_y[index * batch_size: (index + 1) * batch_size],
            #y_matrix: train_y_matrix[index * batch_size: (index + 1) * batch_size],
            drop: np.cast['int32'](0)
        }
    )

    train_activations = theano.function(
        inputs=[index],
        outputs=T.concatenate(activations, axis=1),
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            drop: np.cast['int32'](0),
            #y: train_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )
    
    valid_activations = theano.function(
        inputs=[index],
        outputs=T.concatenate(activations, axis=1),
        givens={
            x: valid_set_x[index * batch_size: (index + 1) * batch_size],
            drop: np.cast['int32'](0),
            #y: valid_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    test_activations = theano.function(
        inputs=[index],
        outputs=T.concatenate(activations, axis=1),
        givens={
            x: test_set_x[index * batch_size: (index + 1) * batch_size],
            drop: np.cast['int32'](0),
            #y: test_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    # compiling a Theano function `train_model` that returns the cost, but
    # in the same time updates the parameter of the model based on the rules
    # defined in `updates`

    debug_model = theano.function(
        inputs=[index],
        outputs=[classifier.errors(y), lowerbound, px, pz, qz, hinge_loss, cost],
        #updates=updates,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size],
            y_matrix: train_y_matrix[index * batch_size: (index + 1) * batch_size],
            drop: np.cast['int32'](dropout_flag)
        }
    )


    random_generation = theano.function(
        inputs=[random_z],
        outputs=[random_x_mean.flatten(2), random_x.flatten(2)],
        givens={
            #drop: np.cast['int32'](0)
        }
    )

    train_bound_without_dropout = theano.function(
        inputs=[index],
        outputs=[classifier.errors(y), lowerbound, hinge_loss, cost],
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size],
            y_matrix: train_y_matrix[index * batch_size: (index + 1) * batch_size],
            drop: np.cast['int32'](0)
        }
    )

    train_model = theano.function(
        inputs=[index],
        outputs=[classifier.errors(y), lowerbound, hinge_loss, cost, px, pz, qz, z],
        updates=updates,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size],
            y_matrix: train_y_matrix[index * batch_size: (index + 1) * batch_size],
            drop: np.cast['int32'](dropout_flag),
        }
    )

    ##################
    # Pretrain MODEL #
    ##################
    if predir is not None:
        color.printBlue('... setting parameters')
        color.printBlue(predir)
        pre_train = np.load(predir+'model.npz')
        pre_train = pre_train['model']
        for (para, pre) in zip(params, pre_train):
            para.set_value(pre)
        tmp =  [debug_model(i) for i in xrange(n_train_batches)]
        tmp = (np.asarray(tmp)).mean(axis=0)
        print '------------------', tmp

    if super_predir is not None:
        color.printBlue('... setting parameters')
        color.printBlue(super_predir)
        pre_train = np.load(super_predir+'svhn_model-'+str(super_preepoch)+'.npz')
        pre_train = pre_train['model']
        for (para, pre) in zip(super_params, pre_train):
            para.set_value(pre)
        this_test_losses = [test_model(i) for i in xrange(n_test_batches)]
        this_test_score = np.mean(this_test_losses, axis=0)
        #print predir
        print 'preepoch', super_preepoch, 'pre_test_score', this_test_score
        with open(logdir+'hook.txt', 'a') as f:
            print >>f, predir
            print >>f, 'preepoch', super_preepoch, 'pre_test_score', this_test_score


    ###############
    # TRAIN MODEL #
    ###############

    print '... training'
    validation_frequency = n_train_batches

    predy_valid_stats = [1, 1, 0]
    start_time = time.clock()
    NaN_count = 0
    epoch = 0
    threshold = 0
    generatition_frequency = 1
    if predir is not None:
        threshold = 0
    color.printRed('threshold, '+str(threshold) + 
        ' generatition_frequency, '+str(generatition_frequency)
        +' validation_frequency, '+str(validation_frequency))
    done_looping = False
    n_epochs = 80
    decay_epochs = 40
    record = 0

    '''
    print 'test initialization...'
    pre_model = parameters()
    for i in xrange(len(pre_model)):
        pre_model[i] = np.asarray(pre_model[i])
        print pre_model[i].shape, np.mean(pre_model[i]), np.var(pre_model[i])
    print 'end test...'
    '''
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        minibatch_avg_cost = 0
        train_error = 0
        train_lowerbound = 0
        train_hinge_loss = 0
        _____z = 0
        pxx = 0
        pzz = 0
        qzz = 0
        preW = None
        currentW = None
        
        tmp_start1 = time.clock()
        if epoch == 30:
            validation_frequency = n_train_batches/5
        if epoch == 50:
            validation_frequency = n_train_batches/10

        if epoch == 30 or epoch == 50 or epoch == 70 or epoch == 90:
            record = epoch
            l_r.set_value(np.cast['float32'](l_r.get_value()/3.0))
            print '---------', epoch, l_r.get_value()
            with open(logdir+'hook.txt', 'a') as f:
                print >>f,'---------', epoch, l_r.get_value()
        '''
        test_epoch = epoch - decay_epochs
        if test_epoch > 0 and test_epoch % 5 == 0:
            l_r.set_value(np.cast['float32'](l_r.get_value()/3.0))
            print '---------------', l_r.get_value()
            with open(logdir+'hook.txt', 'a') as f:
                print >>f, '---------------', l_r.get_value()
        '''

        for minibatch_index in xrange(n_train_batches):            
            e, l, h, ttt, tpx, tpz, tqz, _z = train_model(minibatch_index)
            pxx+=tpx
            pzz+=tpz
            qzz+=tqz
            #_____z += (np.asarray(_z)**2).sum() / (n_hidden[-1] * batch_size)
            train_error += e
            train_lowerbound += l
            train_hinge_loss += h
            minibatch_avg_cost += ttt
            
            '''
            llll = debug_model(minibatch_index)
            with open(logdir+'hook.txt', 'a') as f:
                print >>f,'[]', llll
            '''
            if math.isnan(ttt):
                color.printRed('--------'+str(epoch)+'--------'+str(minibatch_index))
                exit()
            

            # iteration number
            iter = (epoch - 1) * n_train_batches + minibatch_index
            '''
            if (minibatch_index <11):
                preW = currentW
                currentW = parameters()
                for i in xrange(len(currentW)):
                    currentW[i] = np.asarray(currentW[i]).astype(np.float32)

                if preW is not None:
                    for (c,p) in zip(currentW, preW):
                        #print minibatch_index, (c**2).mean(), ((c-p)**2).mean(), np.sqrt((c**2).mean()/((c-p)**2).mean())
                        with open(logdir+'delta_w.txt', 'a') as f:
                            print >>f,minibatch_index, (c**2).mean(), ((c-p)**2).mean(), np.sqrt((c**2).mean()/((c-p)**2).mean())
            ''' 
            # check valid error only, to speed up
            '''
            if (iter + 1) % validation_frequency != 0 and (iter + 1) %(validation_frequency/10) == 0:
                vt = [valid_error(i) for i in xrange(n_valid_batches)]
                vt = np.mean(vt)
                print 'quick valid error', vt
                with open(logdir+'hook.txt', 'a') as f:
                    print >>f, 'quick valid error', vt
                print 'So far best model', predy_valid_stats
                with open(logdir+'hook.txt', 'a') as f:
                    print >>f, 'So far best model', predy_valid_stats
            '''
            

            if (iter + 1) % validation_frequency == 0:
                print minibatch_index, 'stochastic training error', train_error/float(minibatch_index), train_lowerbound/float(minibatch_index), train_hinge_loss/float(minibatch_index), minibatch_avg_cost /float(minibatch_index), pxx/float(minibatch_index), pzz/float(minibatch_index), qzz/float(minibatch_index)#, 'z_norm', _____z/float(minibatch_index)
                with open(logdir+'hook.txt', 'a') as f:
                    print >>f, minibatch_index, 'stochastic training error', train_error/float(minibatch_index), train_lowerbound/float(minibatch_index), train_hinge_loss/float(minibatch_index), minibatch_avg_cost /float(minibatch_index), pxx/float(minibatch_index), pzz/float(minibatch_index), qzz/float(minibatch_index)#, 'z_norm', _____z/float(minibatch_index)
                
                valid_stats = [valid_model(i) for i in xrange(n_valid_batches)]
                this_valid_stats = np.mean(valid_stats, axis=0)

                print epoch, minibatch_index, 'validation stats', this_valid_stats
                #print tmp
                with open(logdir+'hook.txt', 'a') as f:
                    print >>f, epoch, minibatch_index, 'validation stats', this_valid_stats
                print 'So far best model', predy_valid_stats
                with open(logdir+'hook.txt', 'a') as f:
                    print >>f, 'So far best model', predy_valid_stats

                if this_valid_stats[0] < predy_valid_stats[0]:
                    test_stats = [test_model(i) for i in xrange(n_test_batches)]
                    this_test_stats = np.mean(test_stats, axis=0)
                    predy_valid_stats[0] = this_valid_stats[0]
                    predy_valid_stats[1] = this_test_stats[0]
                    predy_valid_stats[2] = epoch
                    record = epoch
                    print 'Update best model', this_test_stats
                    with open(logdir+'hook.txt', 'a') as f:
                        print >>f,'Update best model', this_test_stats
                    model = parameters()
                    for i in xrange(len(model)):
                        model[i] = np.asarray(model[i]).astype(np.float32)
                        #print model[i].shape, np.mean(model[i]), np.var(model[i])
                    np.savez(logdir+'best-model', model=model)

        genezero = generation_check(0)
        with open(logdir+'gene_check.txt', 'a') as f:
            print >>f, 'epoch-----------------------', epoch
            print >>f, 'x', 'x_mean', 'x_logvar'
        '''
        for i in xrange(len(genezero)):
            genezero[i] = np.asarray(genezero[i])
            with open(logdir+'gene_check.txt', 'a') as f:
                print >>f, genezero[i].max(), genezero[i].min(), genezero[i].mean()
        with open(logdir+'gene_check.txt', 'a') as f:
            print >>f, 'norm', np.sqrt(((genezero[0]- genezero[1])**2).sum())
        '''
        if epoch==1:
            xxx = genezero[0]
            image = paramgraphics.mat_to_img(xxx.T, dim_input, colorImg=colorImg, scale=True)
            image.save(logdir+'data.png', 'PNG')
        if epoch%1==0:
            tail='-'+str(epoch)+'.png'
            xxx_now = genezero[1]
            image = paramgraphics.mat_to_img(xxx_now.T, dim_input, colorImg=colorImg, scale=True)
            image.save(logdir+'data_re'+tail, 'PNG')
        
        if math.isnan(minibatch_avg_cost):
            NaN_count+=1
            color.printRed("NaN detected. Reverting to saved best parameters")
            print '---------------NaN_count:', NaN_count
            with open(logdir+'hook.txt', 'a') as f:
                print >>f, '---------------NaN_count:', NaN_count
            
            tmp =  [debug_model(i) for i in xrange(n_train_batches)]
            tmp = (np.asarray(tmp)).mean(axis=0)
            print '------------------NaN check:', tmp
            with open(logdir+'hook.txt', 'a') as f:
               print >>f, '------------------NaN check:', tmp
               
            model = parameters()
            for i in xrange(len(model)):
                model[i] = np.asarray(model[i]).astype(np.float32)
                print model[i].shape, np.mean(model[i]), np.var(model[i])
                print np.max(model[i]), np.min(model[i])
                print np.all(np.isfinite(model[i])), np.any(np.isnan(model[i]))
                with open(logdir+'hook.txt', 'a') as f:
                    print >>f, model[i].shape, np.mean(model[i]), np.var(model[i])
                    print >>f, np.max(model[i]), np.min(model[i])
                    print >>f, np.all(np.isfinite(model[i])), np.any(np.isnan(model[i]))

            best_before = np.load(logdir+'model.npz')
            best_before = best_before['model']
            for (para, pre) in zip(params, best_before):
                para.set_value(pre)
            tmp =  [debug_model(i) for i in xrange(n_train_batches)]
            tmp = (np.asarray(tmp)).mean(axis=0)
            print '------------------', tmp
            return
            
        if epoch%1==0:    
            model = parameters()
            for i in xrange(len(model)):
                model[i] = np.asarray(model[i]).astype(np.float32)
            np.savez(logdir+'model-'+str(epoch), model=model)
        
        tmp_start4=time.clock()

        if epoch % generatition_frequency == 0:
            tail='-'+str(epoch)+'.png'
            random_z = np.random.standard_normal((n_batch, n_hidden[-1])).astype(np.float32)
            _x_mean, _x = random_generation(random_z)
            #print _x.shape
            #print _x_mean.shape
            image = paramgraphics.mat_to_img(_x.T, dim_input, colorImg=colorImg, scale=True)
            image.save(logdir+'samples'+tail, 'PNG')
            image = paramgraphics.mat_to_img(_x_mean.T, dim_input, colorImg=colorImg, scale=True)
            image.save(logdir+'mean_samples'+tail, 'PNG')
            
        #print 'generation_time', time.clock() - tmp_start4
        #print 'one epoch time', time.clock() - tmp_start1

    end_time = time.clock()
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
    if NaN_count > 0:
        print '---------------NaN_count:', NaN_count
        with open(logdir+'hook.txt', 'a') as f:
            print >>f, '---------------NaN_count:', NaN_count
Exemplo n.º 2
0
def cva_6layer_dropout_mnist_60000(seed=0,
                                   dropout_flag=1,
                                   drop_inverses_flag=0,
                                   learning_rate=3e-4,
                                   predir=None,
                                   n_batch=144,
                                   dataset='mnist.pkl.gz',
                                   batch_size=500,
                                   nkerns=[20, 50],
                                   n_hidden=[500, 50]):
    """
    Implementation of convolutional VA
    """
    #cp->cd->cpd->cd->c
    nkerns = [32, 32, 64, 64, 64]
    drops = [1, 0, 1, 0, 0]
    #skerns=[5, 3, 3, 3, 3]
    #pools=[2, 1, 1, 2, 1]
    #modes=['same']*5
    n_hidden = [500, 50]
    drop_inverses = [
        1,
    ]
    # 28->12->12->5->5/5*5*64->500->50->500->5*5*64/5->5->12->12->28

    if dataset == 'mnist.pkl.gz':
        dim_input = (28, 28)
        colorImg = False

    logdir = 'results/supervised/cva/mnist/cva_6layer_mnist_60000' + str(
        nkerns) + str(n_hidden) + '_' + str(learning_rate) + '_'
    if predir is not None:
        logdir += 'pre_'
    if dropout_flag == 1:
        logdir += ('dropout_' + str(drops) + '_')
    if drop_inverses_flag == 1:
        logdir += ('inversedropout_' + str(drop_inverses) + '_')
    logdir += str(int(time.time())) + '/'

    if not os.path.exists(logdir): os.makedirs(logdir)
    print 'logdir:', logdir, 'predir', predir
    print 'cva_6layer_mnist_60000', nkerns, n_hidden, seed, drops, drop_inverses, dropout_flag, drop_inverses_flag
    with open(logdir + 'hook.txt', 'a') as f:
        print >> f, 'logdir:', logdir, 'predir', predir
        print >> f, 'cva_6layer_mnist_60000', nkerns, n_hidden, seed, drops, drop_inverses, dropout_flag, drop_inverses_flag

    datasets = datapy.load_data_gpu_60000(dataset, have_matrix=True)

    train_set_x, train_set_y, train_y_matrix = datasets[0]
    valid_set_x, valid_set_y, valid_y_matrix = datasets[1]
    test_set_x, test_set_y, test_y_matrix = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
    # [int] labels
    random_z = T.matrix('random_z')

    drop = T.iscalar('drop')
    drop_inverse = T.iscalar('drop_inverse')

    activation = nonlinearity.relu

    rng = np.random.RandomState(seed)
    rng_share = theano.tensor.shared_randomstreams.RandomStreams(0)
    input_x = x.reshape((batch_size, 1, 28, 28))

    recg_layer = []
    cnn_output = []

    #1
    recg_layer.append(
        ConvMaxPool.ConvMaxPool(rng,
                                image_shape=(batch_size, 1, 28, 28),
                                filter_shape=(nkerns[0], 1, 5, 5),
                                poolsize=(2, 2),
                                border_mode='valid',
                                activation=activation))
    if drops[0] == 1:
        cnn_output.append(recg_layer[-1].drop_output(input=input_x,
                                                     drop=drop,
                                                     rng=rng_share))
    else:
        cnn_output.append(recg_layer[-1].output(input=input_x))

    #2
    recg_layer.append(
        ConvMaxPool.ConvMaxPool(rng,
                                image_shape=(batch_size, nkerns[0], 12, 12),
                                filter_shape=(nkerns[1], nkerns[0], 3, 3),
                                poolsize=(1, 1),
                                border_mode='same',
                                activation=activation))
    if drops[1] == 1:
        cnn_output.append(recg_layer[-1].drop_output(cnn_output[-1],
                                                     drop=drop,
                                                     rng=rng_share))
    else:
        cnn_output.append(recg_layer[-1].output(cnn_output[-1]))

    #3
    recg_layer.append(
        ConvMaxPool.ConvMaxPool(rng,
                                image_shape=(batch_size, nkerns[1], 12, 12),
                                filter_shape=(nkerns[2], nkerns[1], 3, 3),
                                poolsize=(2, 2),
                                border_mode='valid',
                                activation=activation))
    if drops[2] == 1:
        cnn_output.append(recg_layer[-1].drop_output(cnn_output[-1],
                                                     drop=drop,
                                                     rng=rng_share))
    else:
        cnn_output.append(recg_layer[-1].output(cnn_output[-1]))

    #4
    recg_layer.append(
        ConvMaxPool.ConvMaxPool(rng,
                                image_shape=(batch_size, nkerns[2], 5, 5),
                                filter_shape=(nkerns[3], nkerns[2], 3, 3),
                                poolsize=(1, 1),
                                border_mode='same',
                                activation=activation))
    if drops[3] == 1:
        cnn_output.append(recg_layer[-1].drop_output(cnn_output[-1],
                                                     drop=drop,
                                                     rng=rng_share))
    else:
        cnn_output.append(recg_layer[-1].output(cnn_output[-1]))
    #5
    recg_layer.append(
        ConvMaxPool.ConvMaxPool(rng,
                                image_shape=(batch_size, nkerns[3], 5, 5),
                                filter_shape=(nkerns[4], nkerns[3], 3, 3),
                                poolsize=(1, 1),
                                border_mode='same',
                                activation=activation))
    if drops[4] == 1:
        cnn_output.append(recg_layer[-1].drop_output(cnn_output[-1],
                                                     drop=drop,
                                                     rng=rng_share))
    else:
        cnn_output.append(recg_layer[-1].output(cnn_output[-1]))

    mlp_input_x = cnn_output[-1].flatten(2)

    activations = []

    #1
    recg_layer.append(
        FullyConnected.FullyConnected(rng=rng,
                                      n_in=5 * 5 * nkerns[-1],
                                      n_out=n_hidden[0],
                                      activation=activation))
    if drops[-1] == 1:
        activations.append(recg_layer[-1].drop_output(input=mlp_input_x,
                                                      drop=drop,
                                                      rng=rng_share))
    else:
        activations.append(recg_layer[-1].output(input=mlp_input_x))

    #stochastic layer
    recg_layer.append(
        GaussianHidden.GaussianHidden(rng=rng,
                                      input=activations[-1],
                                      n_in=n_hidden[0],
                                      n_out=n_hidden[1],
                                      activation=None))

    z = recg_layer[-1].sample_z(rng_share)

    gene_layer = []
    z_output = []
    random_z_output = []

    #1
    gene_layer.append(
        FullyConnected.FullyConnected(rng=rng,
                                      n_in=n_hidden[1],
                                      n_out=n_hidden[0],
                                      activation=activation))

    z_output.append(gene_layer[-1].output(input=z))
    random_z_output.append(gene_layer[-1].output(input=random_z))

    #2
    gene_layer.append(
        FullyConnected.FullyConnected(rng=rng,
                                      n_in=n_hidden[0],
                                      n_out=5 * 5 * nkerns[-1],
                                      activation=activation))

    if drop_inverses[0] == 1:
        z_output.append(gene_layer[-1].drop_output(input=z_output[-1],
                                                   drop=drop_inverse,
                                                   rng=rng_share))
        random_z_output.append(gene_layer[-1].drop_output(
            input=random_z_output[-1], drop=drop_inverse, rng=rng_share))
    else:
        z_output.append(gene_layer[-1].output(input=z_output[-1]))
        random_z_output.append(
            gene_layer[-1].output(input=random_z_output[-1]))

    input_z = z_output[-1].reshape((batch_size, nkerns[-1], 5, 5))
    input_random_z = random_z_output[-1].reshape((n_batch, nkerns[-1], 5, 5))

    #1
    gene_layer.append(
        UnpoolConvNon.UnpoolConvNon(rng,
                                    image_shape=(batch_size, nkerns[-1], 5, 5),
                                    filter_shape=(nkerns[-2], nkerns[-1], 3,
                                                  3),
                                    poolsize=(1, 1),
                                    border_mode='same',
                                    activation=activation))

    z_output.append(gene_layer[-1].output(input=input_z))
    random_z_output.append(gene_layer[-1].output_random_generation(
        input=input_random_z, n_batch=n_batch))

    #2
    gene_layer.append(
        UnpoolConvNon.UnpoolConvNon(rng,
                                    image_shape=(batch_size, nkerns[-2], 5, 5),
                                    filter_shape=(nkerns[-3], nkerns[-2], 3,
                                                  3),
                                    poolsize=(2, 2),
                                    border_mode='full',
                                    activation=activation))

    z_output.append(gene_layer[-1].output(input=z_output[-1]))
    random_z_output.append(gene_layer[-1].output_random_generation(
        input=random_z_output[-1], n_batch=n_batch))

    #3
    gene_layer.append(
        UnpoolConvNon.UnpoolConvNon(rng,
                                    image_shape=(batch_size, nkerns[-3], 12,
                                                 12),
                                    filter_shape=(nkerns[-4], nkerns[-3], 3,
                                                  3),
                                    poolsize=(1, 1),
                                    border_mode='same',
                                    activation=activation))

    z_output.append(gene_layer[-1].output(input=z_output[-1]))
    random_z_output.append(gene_layer[-1].output_random_generation(
        input=random_z_output[-1], n_batch=n_batch))

    #4
    gene_layer.append(
        UnpoolConvNon.UnpoolConvNon(rng,
                                    image_shape=(batch_size, nkerns[-4], 12,
                                                 12),
                                    filter_shape=(nkerns[-5], nkerns[-4], 3,
                                                  3),
                                    poolsize=(1, 1),
                                    border_mode='same',
                                    activation=activation))

    z_output.append(gene_layer[-1].output(input=z_output[-1]))
    random_z_output.append(gene_layer[-1].output_random_generation(
        input=random_z_output[-1], n_batch=n_batch))

    #5 stochastic layer
    # for the last layer, the nonliearity should be sigmoid to achieve mean of Bernoulli
    gene_layer.append(
        UnpoolConvNon.UnpoolConvNon(rng,
                                    image_shape=(batch_size, nkerns[-5], 12,
                                                 12),
                                    filter_shape=(1, nkerns[-5], 5, 5),
                                    poolsize=(2, 2),
                                    border_mode='full',
                                    activation=nonlinearity.sigmoid))

    z_output.append(gene_layer[-1].output(input=z_output[-1]))
    random_z_output.append(gene_layer[-1].output_random_generation(
        input=random_z_output[-1], n_batch=n_batch))

    gene_layer.append(
        NoParamsBernoulliVisiable.NoParamsBernoulliVisiable(
            #rng=rng,
            #mean=z_output[-1],
            #data=input_x,
        ))
    logpx = gene_layer[-1].logpx(mean=z_output[-1], data=input_x)

    # 4-D tensor of random generation
    random_x_mean = random_z_output[-1]
    random_x = gene_layer[-1].sample_x(rng_share, random_x_mean)

    #L = (logpx + logpz - logqz).sum()
    cost = ((logpx + recg_layer[-1].logpz - recg_layer[-1].logqz).sum())

    px = (logpx.sum())
    pz = (recg_layer[-1].logpz.sum())
    qz = (-recg_layer[-1].logqz.sum())

    params = []
    for g in gene_layer:
        params += g.params
    for r in recg_layer:
        params += r.params
    gparams = [T.grad(cost, param) for param in params]

    weight_decay = 1.0 / n_train_batches
    l_r = theano.shared(np.asarray(learning_rate, dtype=np.float32))
    #get_optimizer = optimizer.get_adam_optimizer(learning_rate=learning_rate)
    get_optimizer = optimizer.get_adam_optimizer_max(learning_rate=l_r,
                                                     decay1=0.1,
                                                     decay2=0.001,
                                                     weight_decay=weight_decay,
                                                     epsilon=1e-8)
    with open(logdir + 'hook.txt', 'a') as f:
        print >> f, 'AdaM', learning_rate, weight_decay
    updates = get_optimizer(params, gparams)

    # compiling a Theano function that computes the mistakes that are made
    # by the model on a minibatch
    test_model = theano.function(
        inputs=[index],
        outputs=cost,
        #outputs=layer[-1].errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            #y: test_set_y[index * batch_size:(index + 1) * batch_size],
            #y_matrix: test_y_matrix[index * batch_size: (index + 1) * batch_size],
            drop: np.cast['int32'](0),
            drop_inverse: np.cast['int32'](0)
        })

    validate_model = theano.function(
        inputs=[index],
        outputs=cost,
        #outputs=layer[-1].errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            #y: valid_set_y[index * batch_size:(index + 1) * batch_size],
            #y_matrix: valid_y_matrix[index * batch_size: (index + 1) * batch_size],
            drop: np.cast['int32'](0),
            drop_inverse: np.cast['int32'](0)
        })
    '''
    Save parameters and activations
    '''

    parameters = theano.function(
        inputs=[],
        outputs=params,
    )

    train_activations = theano.function(
        inputs=[index],
        outputs=T.concatenate(activations, axis=1),
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            drop: np.cast['int32'](0),
            #drop_inverse: np.cast['int32'](0)
            #y: train_set_y[index * batch_size: (index + 1) * batch_size]
        })

    valid_activations = theano.function(
        inputs=[index],
        outputs=T.concatenate(activations, axis=1),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            drop: np.cast['int32'](0),
            #drop_inverse: np.cast['int32'](0)
            #y: valid_set_y[index * batch_size: (index + 1) * batch_size]
        })

    test_activations = theano.function(
        inputs=[index],
        outputs=T.concatenate(activations, axis=1),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            drop: np.cast['int32'](0),
            #drop_inverse: np.cast['int32'](0)
            #y: test_set_y[index * batch_size: (index + 1) * batch_size]
        })

    # compiling a Theano function `train_model` that returns the cost, but
    # in the same time updates the parameter of the model based on the rules
    # defined in `updates`

    debug_model = theano.function(
        inputs=[index],
        outputs=[cost, px, pz, qz],
        #updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            #y: train_set_y[index * batch_size: (index + 1) * batch_size],
            #y_matrix: train_y_matrix[index * batch_size: (index + 1) * batch_size],
            drop: np.cast['int32'](dropout_flag),
            drop_inverse: np.cast['int32'](drop_inverses_flag)
        })

    random_generation = theano.function(
        inputs=[random_z],
        outputs=[random_x_mean.flatten(2),
                 random_x.flatten(2)],
        givens={
            #drop: np.cast['int32'](0),
            drop_inverse: np.cast['int32'](0)
        })

    train_bound_without_dropout = theano.function(
        inputs=[index],
        outputs=cost,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            #y: train_set_y[index * batch_size: (index + 1) * batch_size],
            #y_matrix: train_y_matrix[index * batch_size: (index + 1) * batch_size],
            drop: np.cast['int32'](0),
            drop_inverse: np.cast['int32'](0)
        })

    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            #y: train_set_y[index * batch_size: (index + 1) * batch_size],
            #y_matrix: train_y_matrix[index * batch_size: (index + 1) * batch_size],
            drop: np.cast['int32'](dropout_flag),
            drop_inverse: np.cast['int32'](drop_inverses_flag)
        })

    ##################
    # Pretrain MODEL #
    ##################
    if predir is not None:
        color.printBlue('... setting parameters')
        color.printBlue(predir)
        pre_train = np.load(predir + 'model.npz')
        pre_train = pre_train['model']
        for (para, pre) in zip(params, pre_train):
            para.set_value(pre)
        tmp = [debug_model(i) for i in xrange(n_train_batches)]
        tmp = (np.asarray(tmp)).mean(axis=0) / float(batch_size)
        print '------------------', tmp

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'

    # early-stopping parameters
    patience = 10000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many
    # minibatche before checking the network
    # on the validation set; in this case we
    # check every epoch

    best_validation_bound = -1000000.0
    best_iter = 0
    test_score = 0.
    start_time = time.clock()
    NaN_count = 0
    epoch = 0
    threshold = 0
    validation_frequency = 1
    generatition_frequency = 10
    if predir is not None:
        threshold = 0
    color.printRed('threshold, ' + str(threshold) +
                   ' generatition_frequency, ' + str(generatition_frequency) +
                   ' validation_frequency, ' + str(validation_frequency))
    done_looping = False
    n_epochs = 600
    decay_epochs = 500
    '''
    print 'test initialization...'
    pre_model = parameters()
    for i in xrange(len(pre_model)):
        pre_model[i] = np.asarray(pre_model[i])
        print pre_model[i].shape, np.mean(pre_model[i]), np.var(pre_model[i])
    print 'end test...'
    '''
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        minibatch_avg_cost = 0

        tmp_start1 = time.clock()

        test_epoch = epoch - decay_epochs
        if test_epoch > 0 and test_epoch % 10 == 0:
            print l_r.get_value()
            with open(logdir + 'hook.txt', 'a') as f:
                print >> f, l_r.get_value()
            l_r.set_value(np.cast['float32'](l_r.get_value() / 3.0))

        for minibatch_index in xrange(n_train_batches):
            #print minibatch_index
            '''
            color.printRed('lalala')
            xxx = dims(minibatch_index)
            print xxx.shape
            '''
            #print n_train_batches
            minibatch_avg_cost += train_model(minibatch_index)
            # iteration number
            iter = (epoch - 1) * n_train_batches + minibatch_index

        if math.isnan(minibatch_avg_cost):
            NaN_count += 1
            color.printRed("NaN detected. Reverting to saved best parameters")
            print '---------------NaN_count:', NaN_count
            with open(logdir + 'hook.txt', 'a') as f:
                print >> f, '---------------NaN_count:', NaN_count

            tmp = [debug_model(i) for i in xrange(n_train_batches)]
            tmp = (np.asarray(tmp)).mean(axis=0) / float(batch_size)
            print '------------------NaN check:', tmp
            with open(logdir + 'hook.txt', 'a') as f:
                print >> f, '------------------NaN check:', tmp

            model = parameters()
            for i in xrange(len(model)):
                model[i] = np.asarray(model[i]).astype(np.float32)
                print model[i].shape, np.mean(model[i]), np.var(model[i])
                print np.max(model[i]), np.min(model[i])
                print np.all(np.isfinite(model[i])), np.any(np.isnan(model[i]))
                with open(logdir + 'hook.txt', 'a') as f:
                    print >> f, model[i].shape, np.mean(model[i]), np.var(
                        model[i])
                    print >> f, np.max(model[i]), np.min(model[i])
                    print >> f, np.all(np.isfinite(model[i])), np.any(
                        np.isnan(model[i]))

            best_before = np.load(logdir + 'model.npz')
            best_before = best_before['model']
            for (para, pre) in zip(params, best_before):
                para.set_value(pre)
            tmp = [debug_model(i) for i in xrange(n_train_batches)]
            tmp = (np.asarray(tmp)).mean(axis=0) / float(batch_size)
            print '------------------', tmp
            return

        #print 'optimization_time', time.clock() - tmp_start1
        print epoch, 'stochastic training error', minibatch_avg_cost / float(
            n_train_batches * batch_size)
        with open(logdir + 'hook.txt', 'a') as f:
            print >> f, epoch, 'stochastic training error', minibatch_avg_cost / float(
                n_train_batches * batch_size)

        if epoch % validation_frequency == 0:
            tmp_start2 = time.clock()

            test_losses = [test_model(i) for i in xrange(n_test_batches)]
            this_test_bound = np.mean(test_losses) / float(batch_size)

            #tmp =  [debug_model(i) for i
            #                     in xrange(n_train_batches)]
            #tmp = (np.asarray(tmp)).mean(axis=0) / float(batch_size)

            print epoch, 'test bound', this_test_bound
            #print tmp
            with open(logdir + 'hook.txt', 'a') as f:
                print >> f, epoch, 'test bound', this_test_bound

        if epoch % 100 == 0:

            model = parameters()
            for i in xrange(len(model)):
                model[i] = np.asarray(model[i]).astype(np.float32)
            np.savez(logdir + 'model-' + str(epoch), model=model)

            for i in xrange(n_train_batches):
                if i == 0:
                    train_features = np.asarray(train_activations(i))
                else:
                    train_features = np.vstack(
                        (train_features, np.asarray(train_activations(i))))

            for i in xrange(n_valid_batches):
                if i == 0:
                    valid_features = np.asarray(valid_activations(i))
                else:
                    valid_features = np.vstack(
                        (valid_features, np.asarray(valid_activations(i))))

            for i in xrange(n_test_batches):
                if i == 0:
                    test_features = np.asarray(test_activations(i))
                else:
                    test_features = np.vstack(
                        (test_features, np.asarray(test_activations(i))))
            np.save(logdir + 'train_features', train_features)
            np.save(logdir + 'valid_features', valid_features)
            np.save(logdir + 'test_features', test_features)

        tmp_start4 = time.clock()
        if epoch % generatition_frequency == 0:
            tail = '-' + str(epoch) + '.png'
            random_z = np.random.standard_normal(
                (n_batch, n_hidden[-1])).astype(np.float32)
            _x_mean, _x = random_generation(random_z)
            #print _x.shape
            #print _x_mean.shape
            image = paramgraphics.mat_to_img(_x.T,
                                             dim_input,
                                             colorImg=colorImg)
            image.save(logdir + 'samples' + tail, 'PNG')
            image = paramgraphics.mat_to_img(_x_mean.T,
                                             dim_input,
                                             colorImg=colorImg)
            image.save(logdir + 'mean_samples' + tail, 'PNG')
        #print 'generation_time', time.clock() - tmp_start4

    end_time = time.clock()
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
    if NaN_count > 0:
        print '---------------NaN_count:', NaN_count
        with open(logdir + 'hook.txt', 'a') as f:
            print >> f, '---------------NaN_count:', NaN_count
Exemplo n.º 3
0
def c_6layer_svhn_features(learning_rate=0.01,
                           n_epochs=600,
                           dataset='svhngcn_var',
                           batch_size=1000,
                           dropout_flag=1,
                           seed=0,
                           predir=None,
                           activation=None,
                           n_batch=625,
                           weight_decay=1e-4,
                           super_predir=None,
                           super_preepoch=None):
    """
    Missing data imputation
    """
    '''
    svhn
    '''
    n_channels = 3
    colorImg = True
    dim_w = 32
    dim_h = 32
    dim_input = (dim_h, dim_w)
    n_classes = 10

    first_drop = 0.6
    if os.environ.has_key('first_drop'):
        first_drop = float(os.environ['first_drop'])
    last_drop = 1
    if os.environ.has_key('last_drop'):
        last_drop = float(os.environ['last_drop'])
    nkerns_1 = 96
    if os.environ.has_key('nkerns_1'):
        nkerns_1 = int(os.environ['nkerns_1'])
    nkerns_2 = 96
    if os.environ.has_key('nkerns_2'):
        nkerns_2 = int(os.environ['nkerns_2'])
    opt_med = 'mom'
    if os.environ.has_key('opt_med'):
        opt_med = os.environ['opt_med']
    train_logvar = True
    if os.environ.has_key('train_logvar'):
        train_logvar = bool(int(os.environ['train_logvar']))
    dataset = 'svhnlcn'
    if os.environ.has_key('dataset'):
        dataset = os.environ['dataset']
    n_z = 256
    if os.environ.has_key('n_z'):
        n_z = int(os.environ['n_z'])

    #cp->cd->cpd->cd->c
    nkerns = [nkerns_1, nkerns_1, nkerns_1, nkerns_2, nkerns_2]
    drops = [0, 1, 1, 1, 0, 1]
    drop_p = [1, first_drop, first_drop, first_drop, 1, last_drop]
    n_hidden = [n_z]

    logdir = 'results/supervised/cva/svhn_features/cva_6layer_svhn'
    if not os.path.exists(logdir): os.makedirs(logdir)
    print 'logdir:', logdir, 'predir', predir

    color.printRed('dataset ' + dataset)

    datasets = datapy.load_data_svhn(dataset, have_matrix=False)
    train_set_x, train_set_y = datasets[0]
    test_set_x, test_set_y = datasets[1]
    valid_set_x, valid_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
    # [int] labels
    random_z = T.matrix('random_z')

    p_label = T.matrix('p_label')

    drop = T.iscalar('drop')

    activation = nonlinearity.relu

    rng = np.random.RandomState(seed)
    rng_share = theano.tensor.shared_randomstreams.RandomStreams(0)

    input_x = x.reshape((batch_size, n_channels, dim_h, dim_w))

    recg_layer = []
    cnn_output = []
    l = []
    d = []

    #1
    recg_layer.append(
        ConvMaxPool_GauInit_DNN.ConvMaxPool_GauInit_DNN(
            rng,
            image_shape=(batch_size, n_channels, dim_h, dim_w),
            filter_shape=(nkerns[0], n_channels, 5, 5),
            poolsize=(2, 2),
            border_mode='same',
            activation=activation))
    if drops[0] == 1:
        cnn_output.append(recg_layer[-1].drop_output(input=input_x,
                                                     drop=drop,
                                                     rng=rng_share,
                                                     p=drop_p[0]))
    else:
        cnn_output.append(recg_layer[-1].output(input=input_x))
    l += [1, 2]
    d += [1, 1]

    #2
    recg_layer.append(
        ConvMaxPool_GauInit_DNN.ConvMaxPool_GauInit_DNN(
            rng,
            image_shape=(batch_size, nkerns[0], 16, 16),
            filter_shape=(nkerns[1], nkerns[0], 3, 3),
            poolsize=(1, 1),
            border_mode='same',
            activation=activation))
    if drops[1] == 1:
        cnn_output.append(recg_layer[-1].drop_output(cnn_output[-1],
                                                     drop=drop,
                                                     rng=rng_share,
                                                     p=drop_p[1]))
    else:
        cnn_output.append(recg_layer[-1].output(cnn_output[-1]))
    l += [1, 2]
    d += [1, 1]

    #3
    recg_layer.append(
        ConvMaxPool_GauInit_DNN.ConvMaxPool_GauInit_DNN(
            rng,
            image_shape=(batch_size, nkerns[1], 16, 16),
            filter_shape=(nkerns[2], nkerns[1], 3, 3),
            poolsize=(2, 2),
            border_mode='same',
            activation=activation))
    if drops[2] == 1:
        cnn_output.append(recg_layer[-1].drop_output(cnn_output[-1],
                                                     drop=drop,
                                                     rng=rng_share,
                                                     p=drop_p[2]))
    else:
        cnn_output.append(recg_layer[-1].output(cnn_output[-1]))
    l += [1, 2]
    d += [1, 1]

    #4
    recg_layer.append(
        ConvMaxPool_GauInit_DNN.ConvMaxPool_GauInit_DNN(
            rng,
            image_shape=(batch_size, nkerns[2], 8, 8),
            filter_shape=(nkerns[3], nkerns[2], 3, 3),
            poolsize=(1, 1),
            border_mode='same',
            activation=activation))
    if drops[3] == 1:
        cnn_output.append(recg_layer[-1].drop_output(cnn_output[-1],
                                                     drop=drop,
                                                     rng=rng_share,
                                                     p=drop_p[3]))
    else:
        cnn_output.append(recg_layer[-1].output(cnn_output[-1]))

    l += [1, 2]
    d += [1, 1]

    #5
    '''
    --------------------- (2,2) or (4,4)
    '''
    recg_layer.append(
        ConvMaxPool_GauInit_DNN.ConvMaxPool_GauInit_DNN(
            rng,
            image_shape=(batch_size, nkerns[3], 8, 8),
            filter_shape=(nkerns[4], nkerns[3], 3, 3),
            poolsize=(2, 2),
            border_mode='same',
            activation=activation))
    if drops[4] == 1:
        cnn_output.append(recg_layer[-1].drop_output(cnn_output[-1],
                                                     drop=drop,
                                                     rng=rng_share,
                                                     p=drop_p[4]))
    else:
        cnn_output.append(recg_layer[-1].output(cnn_output[-1]))
    l += [1, 2]
    d += [1, 1]

    mlp_input_x = cnn_output[-1].flatten(2)

    activations = []
    activations.append(mlp_input_x)
    #1
    '''
    ---------------------No MLP
    '''
    '''
    recg_layer.append(FullyConnected.FullyConnected(
            rng=rng,
            n_in= 4 * 4 * nkerns[-1],
            n_out=n_hidden[0],
            activation=activation
        ))
    if drops[-1]==1:
        activations.append(recg_layer[-1].drop_output(input=mlp_input_x, drop=drop, rng=rng_share, p=drop_p[-1]))
    else:
        activations.append(recg_layer[-1].output(input=mlp_input_x))
    '''

    #stochastic layer
    recg_layer.append(
        GaussianHidden.GaussianHidden(rng=rng,
                                      input=activations[-1],
                                      n_in=4 * 4 * nkerns[-1],
                                      n_out=n_hidden[0],
                                      activation=None))
    l += [1, 2]
    d += [1, 1]
    l += [1, 2]
    d += [1, 1]

    z = recg_layer[-1].sample_z(rng_share)

    gene_layer = []
    z_output = []
    random_z_output = []

    #1
    gene_layer.append(
        FullyConnected.FullyConnected(rng=rng,
                                      n_in=n_hidden[0],
                                      n_out=4 * 4 * nkerns[-1],
                                      activation=activation))

    z_output.append(gene_layer[-1].output(input=z))
    random_z_output.append(gene_layer[-1].output(input=random_z))
    l += [1, 2]
    d += [1, 1]

    #2
    '''
    gene_layer.append(FullyConnected.FullyConnected(
            rng=rng,
            n_in=n_hidden[0],
            n_out = 4*4*nkerns[-1],
            activation=activation
        ))
    if drop_inverses[0]==1:
        z_output.append(gene_layer[-1].drop_output(input=z_output[-1], drop=drop_inverse, rng=rng_share))
        random_z_output.append(gene_layer[-1].drop_output(input=random_z_output[-1], drop=drop_inverse, rng=rng_share))
    else:
        z_output.append(gene_layer[-1].output(input=z_output[-1]))
        random_z_output.append(gene_layer[-1].output(input=random_z_output[-1]))
    '''

    input_z = z_output[-1].reshape((batch_size, nkerns[-1], 4, 4))
    input_random_z = random_z_output[-1].reshape((n_batch, nkerns[-1], 4, 4))

    #1
    gene_layer.append(
        UnpoolConvNon_GauInit_DNN.UnpoolConvNon_GauInit_DNN(
            rng,
            image_shape=(batch_size, nkerns[-1], 4, 4),
            filter_shape=(nkerns[-2], nkerns[-1], 3, 3),
            poolsize=(2, 2),
            border_mode='same',
            activation=activation))
    l += [1, 2]
    d += [1, 1]
    z_output.append(gene_layer[-1].output(input=input_z))
    random_z_output.append(gene_layer[-1].output_random_generation(
        input=input_random_z, n_batch=n_batch))

    #2
    gene_layer.append(
        UnpoolConvNon_GauInit_DNN.UnpoolConvNon_GauInit_DNN(
            rng,
            image_shape=(batch_size, nkerns[-2], 8, 8),
            filter_shape=(nkerns[-3], nkerns[-2], 3, 3),
            poolsize=(1, 1),
            border_mode='same',
            activation=activation))
    l += [1, 2]
    d += [1, 1]
    z_output.append(gene_layer[-1].output(input=z_output[-1]))
    random_z_output.append(gene_layer[-1].output_random_generation(
        input=random_z_output[-1], n_batch=n_batch))

    #3
    gene_layer.append(
        UnpoolConvNon_GauInit_DNN.UnpoolConvNon_GauInit_DNN(
            rng,
            image_shape=(batch_size, nkerns[-3], 8, 8),
            filter_shape=(nkerns[-4], nkerns[-3], 3, 3),
            poolsize=(2, 2),
            border_mode='same',
            activation=activation))
    l += [1, 2]
    d += [1, 1]
    z_output.append(gene_layer[-1].output(input=z_output[-1]))
    random_z_output.append(gene_layer[-1].output_random_generation(
        input=random_z_output[-1], n_batch=n_batch))

    #4
    gene_layer.append(
        UnpoolConvNon_GauInit_DNN.UnpoolConvNon_GauInit_DNN(
            rng,
            image_shape=(batch_size, nkerns[-4], 16, 16),
            filter_shape=(nkerns[-5], nkerns[-4], 3, 3),
            poolsize=(1, 1),
            border_mode='same',
            activation=activation))
    l += [1, 2]
    d += [1, 1]
    z_output.append(gene_layer[-1].output(input=z_output[-1]))
    random_z_output.append(gene_layer[-1].output_random_generation(
        input=random_z_output[-1], n_batch=n_batch))

    #5-1 stochastic layer
    # for this layer, the activation is None to get a Gaussian mean
    gene_layer.append(
        UnpoolConvNon_GauInit_DNN.UnpoolConvNon_GauInit_DNN(
            rng,
            image_shape=(batch_size, nkerns[-5], 16, 16),
            filter_shape=(n_channels, nkerns[-5], 5, 5),
            poolsize=(2, 2),
            border_mode='same',
            activation=None))
    l += [1, 2]
    d += [1, 1]
    x_mean = gene_layer[-1].output(input=z_output[-1])
    random_x_mean = gene_layer[-1].output_random_generation(
        input=random_z_output[-1], n_batch=n_batch)

    #5-2 stochastic layer
    # for this layer, the activation is None to get logvar
    if train_logvar:
        gene_layer.append(
            UnpoolConvNon_GauInit_DNN.UnpoolConvNon_GauInit_DNN(
                rng,
                image_shape=(batch_size, nkerns[-5], 16, 16),
                filter_shape=(n_channels, nkerns[-5], 5, 5),
                poolsize=(2, 2),
                border_mode='same',
                activation=None))
        l += [1, 2]
        d += [1, 1]
        x_logvar = gene_layer[-1].output(input=z_output[-1])
        random_x_logvar = gene_layer[-1].output_random_generation(
            input=random_z_output[-1], n_batch=n_batch)
    else:
        x_logvar = theano.shared(
            np.ones((batch_size, n_channels, dim_h, dim_w), dtype='float32'))
        random_x_logvar = theano.shared(
            np.ones((n_batch, n_channels, dim_h, dim_w), dtype='float32'))

    gene_layer.append(
        NoParamsGaussianVisiable.NoParamsGaussianVisiable(
            #rng=rng,
            #mean=z_output[-1],
            #data=input_x,
        ))
    logpx = gene_layer[-1].logpx(mean=x_mean, logvar=x_logvar, data=input_x)
    random_x = gene_layer[-1].sample_x(rng_share=rng_share,
                                       mean=random_x_mean,
                                       logvar=random_x_logvar)

    params = []
    for g in gene_layer:
        params += g.params
    for r in recg_layer:
        params += r.params

    train_activations = theano.function(
        inputs=[index],
        outputs=T.concatenate(activations, axis=1),
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            drop: np.cast['int32'](0),
            #y: train_set_y[index * batch_size: (index + 1) * batch_size]
        })

    valid_activations = theano.function(
        inputs=[index],
        outputs=T.concatenate(activations, axis=1),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            drop: np.cast['int32'](0),
            #y: valid_set_y[index * batch_size: (index + 1) * batch_size]
        })

    test_activations = theano.function(
        inputs=[index],
        outputs=T.concatenate(activations, axis=1),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            drop: np.cast['int32'](0),
            #y: test_set_y[index * batch_size: (index + 1) * batch_size]
        })

    ##################
    # Pretrain MODEL #
    ##################
    model_epoch = 100
    ctype = 'cva'
    if os.environ.has_key('model_epoch'):
        model_epoch = int(os.environ['model_epoch'])
    if predir is not None:
        color.printBlue('... setting parameters')
        color.printBlue(predir)
        if model_epoch == -1:
            pre_train = np.load(predir + 'best-model.npz')
        else:
            pre_train = np.load(predir + 'model-' + str(model_epoch) + '.npz')
        pre_train = pre_train['model']
        if ctype == 'cva':
            for (para, pre) in zip(params, pre_train):
                para.set_value(pre)
        elif ctype == 'cmmva':
            for (para, pre) in zip(params, pre_train[:-2]):
                para.set_value(pre)
        else:
            exit()
    else:
        exit()

    ####################
    # EXTRACT FEATURES #
    ####################
    print 'extract features: valid'
    for i in xrange(n_valid_batches):
        if i == 0:
            valid_features = np.asarray(valid_activations(i))
        else:
            valid_features = np.vstack(
                (valid_features, np.asarray(valid_activations(i))))
    #print 'valid'
    print 'extract features: test'
    for i in xrange(n_test_batches):
        if i == 0:
            test_features = np.asarray(test_activations(i))
        else:
            test_features = np.vstack(
                (test_features, np.asarray(test_activations(i))))

    f = file(logdir + "svhn_features.bin", "wb")
    np.save(f, valid_features)
    np.save(f, test_features)
    f.close()
    #print 'test'

    print 'extract features: train'
    f = file(logdir + "svhn_train_features.bin", "wb")
    for i in xrange(n_train_batches):
        #print n_train_batches
        #print i
        train_features = np.asarray(train_activations(i))
        np.save(f, train_features)
    f.close()
def _imputation_env(name, default, cast=str):
    """
    Return ``cast(os.environ[name])`` when the variable is set, otherwise
    ``default`` exactly as given (the default is never passed through ``cast``).
    """
    if name in os.environ:
        return cast(os.environ[name])
    return default


def c_6layer_svhn_imputation(seed=0, ctype='cva',
             pertub_type=5, pertub_prob=0, pertub_prob1=16, visualization_times=20,
             denoise_times=200, predir=None, n_batch=900, batch_size=500):

    """
    Missing data imputation on SVHN with a pre-trained 6-layer convolutional model.

    The perturbed test images are pushed through the recognition and
    generative networks ``denoise_times`` times.  After every pass the pixels
    flagged by the perturbation mask keep their original value and the others
    are replaced by the reconstructed mean.  Each pass writes
    ``procedure-<pass>.png`` and ``procedure-<pass>.npz`` into a fresh,
    time-stamped directory under ``results/imputation/``.

    Parameters
    ----------
    seed : seed of the numpy ``RandomState`` handed to the layer constructors.
    ctype : 'cva' or 'cmmva'.  With 'cmmva' the last two entries of the stored
        parameter list are dropped before loading.
    pertub_type, pertub_prob, pertub_prob1 : forwarded unchanged to
        ``datapy.load_pertub_data_svhn`` (``pertub_prob`` also goes to
        ``datapy.load_max_min``).
    visualization_times : number of passes whose first 900 rows are kept in
        the in-memory ``output`` buffer.
    denoise_times : number of imputation passes.
    predir : directory prefix (including the trailing separator) holding
        ``best-model.npz`` / ``model-<epoch>.npz``.  Required.
    n_batch : batch size of the random-generation branch of the graph.
    batch_size : mini-batch size of the imputation function.

    Environment overrides: ``first_drop``, ``last_drop``, ``nkerns_1``,
    ``nkerns_2``, ``train_logvar``, ``dataset``, ``n_z`` and ``model_epoch``
    (``-1`` selects ``best-model.npz``).

    Raises
    ------
    SystemExit
        With a message (hence a non-zero exit status) when ``predir`` is
        missing or ``ctype`` is unsupported.  This is checked before any
        directory is created or any graph is built.
    """
    # Fail fast: these used to be detected only after the data had been loaded
    # and the Theano functions compiled, and then ended in a silent exit().
    if predir is None:
        raise SystemExit('c_6layer_svhn_imputation: predir (directory of the '
                         'pre-trained model) is required')
    if ctype not in ('cva', 'cmmva'):
        raise SystemExit('c_6layer_svhn_imputation: unsupported ctype %r '
                         '(expected \'cva\' or \'cmmva\')' % (ctype,))

    '''
    svhn
    '''
    n_channels = 3
    colorImg = True
    dim_w = 32
    dim_h = 32
    dim_input = (dim_h, dim_w)

    first_drop = _imputation_env('first_drop', 0.6, float)
    last_drop = _imputation_env('last_drop', 1, float)
    nkerns_1 = _imputation_env('nkerns_1', 96, int)
    nkerns_2 = _imputation_env('nkerns_2', 96, int)
    train_logvar = _imputation_env('train_logvar', True,
                                   lambda value: bool(int(value)))
    dataset = _imputation_env('dataset', 'svhnlcn')
    n_z = _imputation_env('n_z', 256, int)
    model_epoch = _imputation_env('model_epoch', 100, int)

    #cp->cd->cpd->cd->c
    nkerns = [nkerns_1, nkerns_1, nkerns_1, nkerns_2, nkerns_2]
    # One dropout flag / probability per convolutional layer.  The sixth entry
    # was read only by the fully connected layer that is disabled in this
    # architecture, so it is not indexed below.
    drops = [0, 1, 1, 1, 0, 1]
    drop_p = [1, first_drop, first_drop, first_drop, 1, last_drop]

    logdir = 'results/imputation/'+ctype+'/svhn/'+ctype+'_6layer_'+dataset+'_'
    logdir += str(int(time.time()))+'/'
    if not os.path.exists(logdir):
        os.makedirs(logdir)

    print(predir)
    with open(logdir+'hook.txt', 'a') as f:
        f.write('%s\n' % (predir,))

    color.printRed('dataset '+dataset)

    test_set_x, test_set_x_pertub, pertub_label, pertub_number = datapy.load_pertub_data_svhn(
        dirs='data_imputation/', dataset=dataset, pertub_type=pertub_type,
        pertub_prob=pertub_prob, pertub_prob1=pertub_prob1)
    pixel_max, pixel_min = datapy.load_max_min(
        dirs='data_imputation/', dataset=dataset, pertub_prob=pertub_prob)
    # number of complete mini-batches in the test set
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images
    random_z = T.matrix('random_z')

    x_pertub = T.matrix('x_pertub')  # the data is presented as rasterized images
    p_label = T.matrix('p_label')

    drop = T.iscalar('drop')

    activation = nonlinearity.relu

    rng = np.random.RandomState(seed)
    rng_share = theano.tensor.shared_randomstreams.RandomStreams(0)

    input_x = x_pertub.reshape((batch_size, n_channels, dim_h, dim_w))

    # ---- recognition network -------------------------------------------
    # (input channels, input (h, w), output channels, filter side, poolsize)
    # Layers are built strictly in this order: both ``rng`` and the order of
    # ``params`` (matched positionally against the checkpoint) depend on it.
    recg_specs = [
        (n_channels, (dim_h, dim_w), nkerns[0], 5, (2, 2)),
        (nkerns[0], (16, 16), nkerns[1], 3, (1, 1)),
        (nkerns[1], (16, 16), nkerns[2], 3, (2, 2)),
        (nkerns[2], (8, 8), nkerns[3], 3, (1, 1)),
        (nkerns[3], (8, 8), nkerns[4], 3, (2, 2)),
    ]
    recg_layer = []
    hidden = input_x
    for k, (c_in, in_size, c_out, f_side, pool) in enumerate(recg_specs):
        conv = ConvMaxPool_GauInit_DNN.ConvMaxPool_GauInit_DNN(
            rng,
            image_shape=(batch_size, c_in) + in_size,
            filter_shape=(c_out, c_in, f_side, f_side),
            poolsize=pool,
            border_mode='same',
            activation=activation
        )
        recg_layer.append(conv)
        if drops[k] == 1:
            hidden = conv.drop_output(hidden, drop=drop, rng=rng_share, p=drop_p[k])
        else:
            hidden = conv.output(hidden)

    mlp_input_x = hidden.flatten(2)

    #stochastic layer (no MLP in between for this architecture)
    recg_layer.append(GaussianHidden.GaussianHidden(
            rng=rng,
            input=mlp_input_x,
            n_in=4 * 4 * nkerns[-1],
            n_out=n_z,
            activation=None
        ))

    z = recg_layer[-1].sample_z(rng_share)

    # ---- generative network --------------------------------------------
    gene_layer = []
    gene_layer.append(FullyConnected.FullyConnected(
            rng=rng,
            n_in=n_z,
            n_out=4 * 4 * nkerns[-1],
            activation=activation
        ))
    decoded = gene_layer[-1].output(input=z)
    decoded_random = gene_layer[-1].output(input=random_z)

    decoded = decoded.reshape((batch_size, nkerns[-1], 4, 4))
    decoded_random = decoded_random.reshape((n_batch, nkerns[-1], 4, 4))

    # (input channels, input side, output channels, poolsize), all 3x3 filters
    gene_specs = [
        (nkerns[-1], 4, nkerns[-2], (2, 2)),
        (nkerns[-2], 8, nkerns[-3], (1, 1)),
        (nkerns[-3], 8, nkerns[-4], (2, 2)),
        (nkerns[-4], 16, nkerns[-5], (1, 1)),
    ]
    for c_in, side, c_out, pool in gene_specs:
        unpool = UnpoolConvNon_GauInit_DNN.UnpoolConvNon_GauInit_DNN(
            rng,
            image_shape=(batch_size, c_in, side, side),
            filter_shape=(c_out, c_in, 3, 3),
            poolsize=pool,
            border_mode='same',
            activation=activation
        )
        gene_layer.append(unpool)
        decoded = unpool.output(input=decoded)
        decoded_random = unpool.output_random_generation(input=decoded_random, n_batch=n_batch)

    def _visible_head():
        # Final unpool+conv back to image channels; activation is None so the
        # raw output can serve as a Gaussian mean or log-variance.
        return UnpoolConvNon_GauInit_DNN.UnpoolConvNon_GauInit_DNN(
            rng,
            image_shape=(batch_size, nkerns[-5], 16, 16),
            filter_shape=(n_channels, nkerns[-5], 5, 5),
            poolsize=(2, 2),
            border_mode='same',
            activation=None
        )

    #5-1 stochastic layer: Gaussian mean
    gene_layer.append(_visible_head())
    x_mean = gene_layer[-1].output(input=decoded)
    random_x_mean = gene_layer[-1].output_random_generation(input=decoded_random, n_batch=n_batch)

    #5-2 stochastic layer: logvar
    if train_logvar:
        gene_layer.append(_visible_head())
        x_logvar = gene_layer[-1].output(input=decoded)
        random_x_logvar = gene_layer[-1].output_random_generation(input=decoded_random, n_batch=n_batch)
    else:
        x_logvar = theano.shared(np.ones((batch_size, n_channels, dim_h, dim_w), dtype='float32'))
        random_x_logvar = theano.shared(np.ones((n_batch, n_channels, dim_h, dim_w), dtype='float32'))

    gene_layer.append(NoParamsGaussianVisiable.NoParamsGaussianVisiable())
    # logpx and random_x are not consumed by the imputation function below;
    # they are still built so the graph matches the original script.
    logpx = gene_layer[-1].logpx(mean=x_mean, logvar=x_logvar, data=input_x)
    random_x = gene_layer[-1].sample_x(rng_share=rng_share, mean=random_x_mean, logvar=random_x_logvar)

    # Where p_label is 1 the value from x is kept, where it is 0 the
    # reconstructed mean is used instead.
    x_denoised = p_label*x+(1-p_label)*x_mean.flatten(2)
    mse = ((x - x_denoised)**2).sum() / pertub_number

    # Generative parameters first, then recognition parameters: the checkpoint
    # is matched against this list by position.
    params = []
    for layer in gene_layer + recg_layer:
        params += layer.params

    imputation_model = theano.function(
        inputs=[index, x_pertub],
        outputs=[x_denoised, mse],
        givens={
            x: test_set_x[index * batch_size: (index + 1) * batch_size],
            p_label: pertub_label[index * batch_size: (index + 1) * batch_size],
            drop: np.cast['int32'](0),
        }
    )

    ##################
    # Pretrain MODEL #
    ##################
    color.printBlue('... setting parameters')
    color.printBlue(predir)
    if model_epoch == -1:
        pre_train = np.load(predir+'best-model.npz')
    else:
        pre_train = np.load(predir+'model-'+str(model_epoch)+'.npz')
    pre_train = pre_train['model']
    if ctype == 'cmmva':
        # the last two stored entries are not part of this graph
        pre_train = pre_train[:-2]
    # NOTE(review): zip() stops at the shorter sequence, so a checkpoint with
    # fewer entries than ``params`` is loaded partially without any warning.
    for (para, pre) in zip(params, pre_train):
        para.set_value(pre)

    ##############
    # IMPUTATION #
    ##############
    print('... training')
    scale = False
    n_visualization = 900
    pixel_max = pixel_max[:n_visualization]
    pixel_min = pixel_min[:n_visualization]
    # Slot 0: clean data, slot 1: perturbed data, slots 2..: first passes.
    output = np.ones((n_visualization, visualization_times+2, n_channels*dim_input[0]*dim_input[1]))
    output[:,0,:] = test_set_x.get_value()[:n_visualization,:]
    output[:,1,:] = test_set_x_pertub.get_value()[:n_visualization,:]

    image = paramgraphics.mat_to_img(paramgraphics.scale_max_min(output[:,0,:].T,pixel_max,pixel_min), dim_input, colorImg=colorImg, scale=scale)
    image.save(logdir+'data.png', 'PNG')
    image = paramgraphics.mat_to_img(paramgraphics.scale_max_min(output[:,1,:].T,pixel_max,pixel_min), dim_input, colorImg=colorImg, scale=scale)
    image.save(logdir+'data_pertub.png', 'PNG')

    # Working copy of the perturbed test set, refined in place on every pass.
    tmp = test_set_x_pertub.get_value()

    for epoch in range(1, denoise_times + 1):
        for i in range(n_test_batches):
            rows = slice(i * batch_size, (i + 1) * batch_size)
            denoised, _ = imputation_model(i, tmp[rows])
            tmp[rows] = np.asarray(denoised)
        if epoch <= visualization_times:
            output[:,epoch+1,:] = tmp[:n_visualization,:]

        image = paramgraphics.mat_to_img(paramgraphics.scale_max_min(tmp[:n_visualization,:].T,pixel_max,pixel_min), dim_input, colorImg=colorImg, scale=scale)
        image.save(logdir+'procedure-'+str(epoch)+'.png', 'PNG')
        np.savez(logdir+'procedure-'+str(epoch), tmp=tmp)

    # The ``output`` buffer is currently not written to disk; the final dump
    # (tiled 'output.png' plus np.savez(logdir+'output', output=output)) is
    # disabled in this script.

    '''
def c_6layer_svhn_imputation(seed=0,
                             ctype='cva',
                             pertub_type=5,
                             pertub_prob=0,
                             pertub_prob1=16,
                             visualization_times=20,
                             denoise_times=200,
                             predir=None,
                             n_batch=900,
                             batch_size=500):
    """
    Missing data imputation
    """
    '''
    svhn
    '''
    n_channels = 3
    colorImg = True
    dim_w = 32
    dim_h = 32
    dim_input = (dim_h, dim_w)
    n_classes = 10

    first_drop = 0.6
    if os.environ.has_key('first_drop'):
        first_drop = float(os.environ['first_drop'])
    last_drop = 1
    if os.environ.has_key('last_drop'):
        last_drop = float(os.environ['last_drop'])
    nkerns_1 = 96
    if os.environ.has_key('nkerns_1'):
        nkerns_1 = int(os.environ['nkerns_1'])
    nkerns_2 = 96
    if os.environ.has_key('nkerns_2'):
        nkerns_2 = int(os.environ['nkerns_2'])
    opt_med = 'mom'
    if os.environ.has_key('opt_med'):
        opt_med = os.environ['opt_med']
    train_logvar = True
    if os.environ.has_key('train_logvar'):
        train_logvar = bool(int(os.environ['train_logvar']))
    dataset = 'svhnlcn'
    if os.environ.has_key('dataset'):
        dataset = os.environ['dataset']
    n_z = 256
    if os.environ.has_key('n_z'):
        n_z = int(os.environ['n_z'])

    #cp->cd->cpd->cd->c
    nkerns = [nkerns_1, nkerns_1, nkerns_1, nkerns_2, nkerns_2]
    drops = [0, 1, 1, 1, 0, 1]
    drop_p = [1, first_drop, first_drop, first_drop, 1, last_drop]
    n_hidden = [n_z]

    logdir = 'results/imputation/' + ctype + '/svhn/' + ctype + '_6layer_' + dataset + '_'
    logdir += str(int(time.time())) + '/'
    if not os.path.exists(logdir): os.makedirs(logdir)

    print predir
    with open(logdir + 'hook.txt', 'a') as f:
        print >> f, predir

    color.printRed('dataset ' + dataset)

    test_set_x, test_set_x_pertub, pertub_label, pertub_number = datapy.load_pertub_data_svhn(
        dirs='data_imputation/',
        dataset=dataset,
        pertub_type=pertub_type,
        pertub_prob=pertub_prob,
        pertub_prob1=pertub_prob1)
    pixel_max, pixel_min = datapy.load_max_min(dirs='data_imputation/',
                                               dataset=dataset,
                                               pertub_prob=pertub_prob)
    # compute number of minibatches for training, validation and testing
    #n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    #n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
    # [int] labels
    random_z = T.matrix('random_z')

    x_pertub = T.matrix(
        'x_pertub')  # the data is presented as rasterized images
    p_label = T.matrix('p_label')

    drop = T.iscalar('drop')

    activation = nonlinearity.relu

    rng = np.random.RandomState(seed)
    rng_share = theano.tensor.shared_randomstreams.RandomStreams(0)

    input_x = x_pertub.reshape((batch_size, n_channels, dim_h, dim_w))

    recg_layer = []
    cnn_output = []
    l = []
    d = []

    #1
    recg_layer.append(
        ConvMaxPool_GauInit_DNN.ConvMaxPool_GauInit_DNN(
            rng,
            image_shape=(batch_size, n_channels, dim_h, dim_w),
            filter_shape=(nkerns[0], n_channels, 5, 5),
            poolsize=(2, 2),
            border_mode='same',
            activation=activation))
    if drops[0] == 1:
        cnn_output.append(recg_layer[-1].drop_output(input=input_x,
                                                     drop=drop,
                                                     rng=rng_share,
                                                     p=drop_p[0]))
    else:
        cnn_output.append(recg_layer[-1].output(input=input_x))
    l += [1, 2]
    d += [1, 1]

    #2
    recg_layer.append(
        ConvMaxPool_GauInit_DNN.ConvMaxPool_GauInit_DNN(
            rng,
            image_shape=(batch_size, nkerns[0], 16, 16),
            filter_shape=(nkerns[1], nkerns[0], 3, 3),
            poolsize=(1, 1),
            border_mode='same',
            activation=activation))
    if drops[1] == 1:
        cnn_output.append(recg_layer[-1].drop_output(cnn_output[-1],
                                                     drop=drop,
                                                     rng=rng_share,
                                                     p=drop_p[1]))
    else:
        cnn_output.append(recg_layer[-1].output(cnn_output[-1]))
    l += [1, 2]
    d += [1, 1]

    #3
    recg_layer.append(
        ConvMaxPool_GauInit_DNN.ConvMaxPool_GauInit_DNN(
            rng,
            image_shape=(batch_size, nkerns[1], 16, 16),
            filter_shape=(nkerns[2], nkerns[1], 3, 3),
            poolsize=(2, 2),
            border_mode='same',
            activation=activation))
    if drops[2] == 1:
        cnn_output.append(recg_layer[-1].drop_output(cnn_output[-1],
                                                     drop=drop,
                                                     rng=rng_share,
                                                     p=drop_p[2]))
    else:
        cnn_output.append(recg_layer[-1].output(cnn_output[-1]))
    l += [1, 2]
    d += [1, 1]

    #4
    recg_layer.append(
        ConvMaxPool_GauInit_DNN.ConvMaxPool_GauInit_DNN(
            rng,
            image_shape=(batch_size, nkerns[2], 8, 8),
            filter_shape=(nkerns[3], nkerns[2], 3, 3),
            poolsize=(1, 1),
            border_mode='same',
            activation=activation))
    if drops[3] == 1:
        cnn_output.append(recg_layer[-1].drop_output(cnn_output[-1],
                                                     drop=drop,
                                                     rng=rng_share,
                                                     p=drop_p[3]))
    else:
        cnn_output.append(recg_layer[-1].output(cnn_output[-1]))

    l += [1, 2]
    d += [1, 1]

    #5
    '''
    --------------------- (2,2) or (4,4)
    '''
    recg_layer.append(
        ConvMaxPool_GauInit_DNN.ConvMaxPool_GauInit_DNN(
            rng,
            image_shape=(batch_size, nkerns[3], 8, 8),
            filter_shape=(nkerns[4], nkerns[3], 3, 3),
            poolsize=(2, 2),
            border_mode='same',
            activation=activation))
    if drops[4] == 1:
        cnn_output.append(recg_layer[-1].drop_output(cnn_output[-1],
                                                     drop=drop,
                                                     rng=rng_share,
                                                     p=drop_p[4]))
    else:
        cnn_output.append(recg_layer[-1].output(cnn_output[-1]))
    l += [1, 2]
    d += [1, 1]

    mlp_input_x = cnn_output[-1].flatten(2)

    activations = []
    activations.append(mlp_input_x)
    #1
    '''
    ---------------------No MLP
    '''
    '''
    recg_layer.append(FullyConnected.FullyConnected(
            rng=rng,
            n_in= 4 * 4 * nkerns[-1],
            n_out=n_hidden[0],
            activation=activation
        ))
    if drops[-1]==1:
        activations.append(recg_layer[-1].drop_output(input=mlp_input_x, drop=drop, rng=rng_share, p=drop_p[-1]))
    else:
        activations.append(recg_layer[-1].output(input=mlp_input_x))
    '''

    #stochastic layer
    recg_layer.append(
        GaussianHidden.GaussianHidden(rng=rng,
                                      input=activations[-1],
                                      n_in=4 * 4 * nkerns[-1],
                                      n_out=n_hidden[0],
                                      activation=None))
    l += [1, 2]
    d += [1, 1]
    l += [1, 2]
    d += [1, 1]

    z = recg_layer[-1].sample_z(rng_share)

    gene_layer = []
    z_output = []
    random_z_output = []

    #1
    gene_layer.append(
        FullyConnected.FullyConnected(rng=rng,
                                      n_in=n_hidden[0],
                                      n_out=4 * 4 * nkerns[-1],
                                      activation=activation))

    z_output.append(gene_layer[-1].output(input=z))
    random_z_output.append(gene_layer[-1].output(input=random_z))
    l += [1, 2]
    d += [1, 1]

    #2
    '''
    gene_layer.append(FullyConnected.FullyConnected(
            rng=rng,
            n_in=n_hidden[0],
            n_out = 4*4*nkerns[-1],
            activation=activation
        ))
    if drop_inverses[0]==1:
        z_output.append(gene_layer[-1].drop_output(input=z_output[-1], drop=drop_inverse, rng=rng_share))
        random_z_output.append(gene_layer[-1].drop_output(input=random_z_output[-1], drop=drop_inverse, rng=rng_share))
    else:
        z_output.append(gene_layer[-1].output(input=z_output[-1]))
        random_z_output.append(gene_layer[-1].output(input=random_z_output[-1]))
    '''

    input_z = z_output[-1].reshape((batch_size, nkerns[-1], 4, 4))
    input_random_z = random_z_output[-1].reshape((n_batch, nkerns[-1], 4, 4))

    #1
    gene_layer.append(
        UnpoolConvNon_GauInit_DNN.UnpoolConvNon_GauInit_DNN(
            rng,
            image_shape=(batch_size, nkerns[-1], 4, 4),
            filter_shape=(nkerns[-2], nkerns[-1], 3, 3),
            poolsize=(2, 2),
            border_mode='same',
            activation=activation))
    l += [1, 2]
    d += [1, 1]
    z_output.append(gene_layer[-1].output(input=input_z))
    random_z_output.append(gene_layer[-1].output_random_generation(
        input=input_random_z, n_batch=n_batch))

    #2
    gene_layer.append(
        UnpoolConvNon_GauInit_DNN.UnpoolConvNon_GauInit_DNN(
            rng,
            image_shape=(batch_size, nkerns[-2], 8, 8),
            filter_shape=(nkerns[-3], nkerns[-2], 3, 3),
            poolsize=(1, 1),
            border_mode='same',
            activation=activation))
    l += [1, 2]
    d += [1, 1]
    z_output.append(gene_layer[-1].output(input=z_output[-1]))
    random_z_output.append(gene_layer[-1].output_random_generation(
        input=random_z_output[-1], n_batch=n_batch))

    #3
    gene_layer.append(
        UnpoolConvNon_GauInit_DNN.UnpoolConvNon_GauInit_DNN(
            rng,
            image_shape=(batch_size, nkerns[-3], 8, 8),
            filter_shape=(nkerns[-4], nkerns[-3], 3, 3),
            poolsize=(2, 2),
            border_mode='same',
            activation=activation))
    l += [1, 2]
    d += [1, 1]
    z_output.append(gene_layer[-1].output(input=z_output[-1]))
    random_z_output.append(gene_layer[-1].output_random_generation(
        input=random_z_output[-1], n_batch=n_batch))

    #4
    gene_layer.append(
        UnpoolConvNon_GauInit_DNN.UnpoolConvNon_GauInit_DNN(
            rng,
            image_shape=(batch_size, nkerns[-4], 16, 16),
            filter_shape=(nkerns[-5], nkerns[-4], 3, 3),
            poolsize=(1, 1),
            border_mode='same',
            activation=activation))
    l += [1, 2]
    d += [1, 1]
    z_output.append(gene_layer[-1].output(input=z_output[-1]))
    random_z_output.append(gene_layer[-1].output_random_generation(
        input=random_z_output[-1], n_batch=n_batch))

    #5-1 stochastic layer
    # for this layer, the activation is None to get a Guassian mean
    gene_layer.append(
        UnpoolConvNon_GauInit_DNN.UnpoolConvNon_GauInit_DNN(
            rng,
            image_shape=(batch_size, nkerns[-5], 16, 16),
            filter_shape=(n_channels, nkerns[-5], 5, 5),
            poolsize=(2, 2),
            border_mode='same',
            activation=None))
    l += [1, 2]
    d += [1, 1]
    x_mean = gene_layer[-1].output(input=z_output[-1])
    random_x_mean = gene_layer[-1].output_random_generation(
        input=random_z_output[-1], n_batch=n_batch)

    #5-2 stochastic layer
    # for this layer, the activation is None to get logvar
    if train_logvar:
        gene_layer.append(
            UnpoolConvNon_GauInit_DNN.UnpoolConvNon_GauInit_DNN(
                rng,
                image_shape=(batch_size, nkerns[-5], 16, 16),
                filter_shape=(n_channels, nkerns[-5], 5, 5),
                poolsize=(2, 2),
                border_mode='same',
                activation=None))
        l += [1, 2]
        d += [1, 1]
        x_logvar = gene_layer[-1].output(input=z_output[-1])
        random_x_logvar = gene_layer[-1].output_random_generation(
            input=random_z_output[-1], n_batch=n_batch)
    else:
        x_logvar = theano.shared(
            np.ones((batch_size, n_channels, dim_h, dim_w), dtype='float32'))
        random_x_logvar = theano.shared(
            np.ones((n_batch, n_channels, dim_h, dim_w), dtype='float32'))

    gene_layer.append(
        NoParamsGaussianVisiable.NoParamsGaussianVisiable(
            #rng=rng,
            #mean=z_output[-1],
            #data=input_x,
        ))
    logpx = gene_layer[-1].logpx(mean=x_mean, logvar=x_logvar, data=input_x)
    random_x = gene_layer[-1].sample_x(rng_share=rng_share,
                                       mean=random_x_mean,
                                       logvar=random_x_logvar)

    x_denoised = p_label * x + (1 - p_label) * x_mean.flatten(2)
    mse = ((x - x_denoised)**2).sum() / pertub_number

    params = []
    for g in gene_layer:
        params += g.params
    for r in recg_layer:
        params += r.params
    '''
    train_activations = theano.function(
        inputs=[index],
        outputs=T.concatenate(activations, axis=1),
        givens={
            x_pertub: train_set_x[index * batch_size: (index + 1) * batch_size],
            drop: np.cast['int32'](0)
        }
    )
    '''
    '''
    valid_activations = theano.function(
        inputs=[index],
        outputs=T.concatenate(activations, axis=1),
        givens={
            x_pertub: valid_set_x[index * batch_size: (index + 1) * batch_size],
            drop: np.cast['int32'](0)
        }
    )
    '''
    test_activations = theano.function(inputs=[x_pertub],
                                       outputs=T.concatenate(activations,
                                                             axis=1),
                                       givens={drop: np.cast['int32'](0)})

    imputation_model = theano.function(
        inputs=[index, x_pertub],
        outputs=[x_denoised, mse],
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            p_label: pertub_label[index * batch_size:(index + 1) * batch_size],
            drop: np.cast['int32'](0),
            #drop_inverse: np.cast['int32'](0)
        })

    ##################
    # Pretrain MODEL #
    ##################
    model_epoch = 100
    if os.environ.has_key('model_epoch'):
        model_epoch = int(os.environ['model_epoch'])
    if predir is not None:
        color.printBlue('... setting parameters')
        color.printBlue(predir)
        if model_epoch == -1:
            pre_train = np.load(predir + 'best-model.npz')
        else:
            pre_train = np.load(predir + 'model-' + str(model_epoch) + '.npz')
        pre_train = pre_train['model']
        if ctype == 'cva':
            for (para, pre) in zip(params, pre_train):
                para.set_value(pre)
        elif ctype == 'cmmva':
            for (para, pre) in zip(params, pre_train[:-2]):
                para.set_value(pre)
        else:
            exit()
    else:
        exit()

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    scale = False
    epoch = 0
    n_visualization = 900
    pixel_max = pixel_max[:n_visualization]
    pixel_min = pixel_min[:n_visualization]
    output = np.ones((n_visualization, visualization_times + 2,
                      n_channels * dim_input[0] * dim_input[1]))
    output[:, 0, :] = test_set_x.get_value()[:n_visualization, :]
    output[:, 1, :] = test_set_x_pertub.get_value()[:n_visualization, :]

    image = paramgraphics.mat_to_img(paramgraphics.scale_max_min(
        output[:, 0, :].T, pixel_max, pixel_min),
                                     dim_input,
                                     colorImg=colorImg,
                                     scale=scale)
    image.save(logdir + 'data.png', 'PNG')
    image = paramgraphics.mat_to_img(paramgraphics.scale_max_min(
        output[:, 1, :].T, pixel_max, pixel_min),
                                     dim_input,
                                     colorImg=colorImg,
                                     scale=scale)
    image.save(logdir + 'data_pertub.png', 'PNG')

    tmp = test_set_x_pertub.get_value()

    while epoch < denoise_times:
        epoch = epoch + 1
        for i in xrange(n_test_batches):
            d, m = imputation_model(i,
                                    tmp[i * batch_size:(i + 1) * batch_size])
            tmp[i * batch_size:(i + 1) * batch_size] = np.asarray(d)
        if epoch <= visualization_times:
            output[:, epoch + 1, :] = tmp[:n_visualization, :]

        image = paramgraphics.mat_to_img(paramgraphics.scale_max_min(
            tmp[:n_visualization, :].T, pixel_max, pixel_min),
                                         dim_input,
                                         colorImg=colorImg,
                                         scale=scale)
        image.save(logdir + 'procedure-' + str(epoch) + '.png', 'PNG')
        np.savez(logdir + 'procedure-' + str(epoch), tmp=tmp)
    '''
    image = paramgraphics.mat_to_img((output.reshape(-1,32*32*3)).T, dim_input, colorImg=colorImg, tile_shape=(n_visualization,22), scale=scale)
    image.save(logdir+'output.png', 'PNG')
    np.savez(logdir+'output', output=output)
    '''
    '''
Exemplo n.º 6
0
def cmmva_6layer_dropout_mnist_60000(seed=0, start_layer=0, end_layer=1, dropout_flag=1, drop_inverses_flag=0, learning_rate=3e-5, predir=None, n_batch=144,
             dataset='mnist.pkl.gz', batch_size=500, nkerns=[20, 50], n_hidden=[500, 50]):

    """
    Implementation of convolutional MMVA
    """    
    #cp->cd->cpd->cd->c
    nkerns=[32, 32, 64, 64, 64]
    drops=[1, 0, 1, 0, 0, 1]
    #skerns=[5, 3, 3, 3, 3]
    #pools=[2, 1, 1, 2, 1]
    #modes=['same']*5
    n_hidden=[500, 50]
    drop_inverses=[1,]
    # 28->12->12->5->5/5*5*64->500->50->500->5*5*64/5->5->12->12->28
    
    if dataset=='mnist.pkl.gz':
        dim_input=(28, 28)
        colorImg=False
    D = 1.0
    C = 1.0
    if os.environ.has_key('C'):
        C = np.cast['float32'](float((os.environ['C'])))
    if os.environ.has_key('D'):
        D = np.cast['float32'](float((os.environ['D'])))
    color.printRed('D '+str(D)+' C '+str(C))

    logdir = 'results/supervised/cmmva/mnist/cmmva_6layer_60000_'+str(nkerns)+str(n_hidden)+'_D_'+str(D)+'_C_'+str(C)+'_'+str(learning_rate)+'_'
    if predir is not None:
        logdir +='pre_'
    if dropout_flag == 1:
        logdir += ('dropout_'+str(drops)+'_')
    if drop_inverses_flag==1:
        logdir += ('inversedropout_'+str(drop_inverses)+'_')
    logdir += str(int(time.time()))+'/'

    if not os.path.exists(logdir): os.makedirs(logdir)
    print 'logdir:', logdir, 'predir', predir
    print 'cmmva_6layer_mnist_60000', nkerns, n_hidden, seed, drops, drop_inverses, dropout_flag, drop_inverses_flag
    with open(logdir+'hook.txt', 'a') as f:
        print >>f, 'logdir:', logdir, 'predir', predir
        print >>f, 'cmmva_6layer_mnist_60000', nkerns, n_hidden, seed, drops, drop_inverses, dropout_flag, drop_inverses_flag

    datasets = datapy.load_data_gpu_60000(dataset, have_matrix=True)

    train_set_x, train_set_y, train_y_matrix = datasets[0]
    valid_set_x, valid_set_y, valid_y_matrix = datasets[1]
    test_set_x, test_set_y, test_y_matrix = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
                        # [int] labels
    y_matrix = T.imatrix('y_matrix')
    random_z = T.matrix('random_z')

    drop = T.iscalar('drop')
    drop_inverse = T.iscalar('drop_inverse')
    
    activation = nonlinearity.relu

    rng = np.random.RandomState(seed)
    rng_share = theano.tensor.shared_randomstreams.RandomStreams(0)
    input_x = x.reshape((batch_size, 1, 28, 28))
    
    recg_layer = []
    cnn_output = []

    #1
    recg_layer.append(ConvMaxPool.ConvMaxPool(
            rng,
            image_shape=(batch_size, 1, 28, 28),
            filter_shape=(nkerns[0], 1, 5, 5),
            poolsize=(2, 2),
            border_mode='valid',
            activation=activation
        ))
    if drops[0]==1:
        cnn_output.append(recg_layer[-1].drop_output(input=input_x, drop=drop, rng=rng_share))
    else:
        cnn_output.append(recg_layer[-1].output(input=input_x))

    #2
    recg_layer.append(ConvMaxPool.ConvMaxPool(
        rng,
        image_shape=(batch_size, nkerns[0], 12, 12),
        filter_shape=(nkerns[1], nkerns[0], 3, 3),
        poolsize=(1, 1),
        border_mode='same', 
        activation=activation
    ))
    if drops[1]==1:
        cnn_output.append(recg_layer[-1].drop_output(cnn_output[-1], drop=drop, rng=rng_share))
    else:
        cnn_output.append(recg_layer[-1].output(cnn_output[-1]))
    
    #3
    recg_layer.append(ConvMaxPool.ConvMaxPool(
        rng,
        image_shape=(batch_size, nkerns[1], 12, 12),
        filter_shape=(nkerns[2], nkerns[1], 3, 3),
        poolsize=(2, 2),
        border_mode='valid', 
        activation=activation
    ))
    if drops[2]==1:
        cnn_output.append(recg_layer[-1].drop_output(cnn_output[-1], drop=drop, rng=rng_share))
    else:
        cnn_output.append(recg_layer[-1].output(cnn_output[-1]))

    #4
    recg_layer.append(ConvMaxPool.ConvMaxPool(
        rng,
        image_shape=(batch_size, nkerns[2], 5, 5),
        filter_shape=(nkerns[3], nkerns[2], 3, 3),
        poolsize=(1, 1),
        border_mode='same', 
        activation=activation
    ))
    if drops[3]==1:
        cnn_output.append(recg_layer[-1].drop_output(cnn_output[-1], drop=drop, rng=rng_share))
    else:
        cnn_output.append(recg_layer[-1].output(cnn_output[-1]))
    #5
    recg_layer.append(ConvMaxPool.ConvMaxPool(
        rng,
        image_shape=(batch_size, nkerns[3], 5, 5),
        filter_shape=(nkerns[4], nkerns[3], 3, 3),
        poolsize=(1, 1),
        border_mode='same', 
        activation=activation
    ))
    if drops[4]==1:
        cnn_output.append(recg_layer[-1].drop_output(cnn_output[-1], drop=drop, rng=rng_share))
    else:
        cnn_output.append(recg_layer[-1].output(cnn_output[-1]))
   
    mlp_input_x = cnn_output[-1].flatten(2)

    activations = []

    #1
    recg_layer.append(FullyConnected.FullyConnected(
            rng=rng,
            n_in= 5 * 5 * nkerns[-1],
            n_out=n_hidden[0],
            activation=activation
        ))
    if drops[-1]==1:
        activations.append(recg_layer[-1].drop_output(input=mlp_input_x, drop=drop, rng=rng_share))
    else:
        activations.append(recg_layer[-1].output(input=mlp_input_x))
    
    features = T.concatenate(activations[start_layer:end_layer], axis=1)
    color.printRed('feature dimension: '+str(np.sum(n_hidden[start_layer:end_layer])))
    
    classifier = Pegasos.Pegasos(
            input= features,
            rng=rng,
            n_in=np.sum(n_hidden[start_layer:end_layer]),
            n_out=10,
            weight_decay=0,
            loss=1,
            std=1e-2
        )

    recg_layer.append(GaussianHidden.GaussianHidden(
            rng=rng,
            input=activations[-1],
            n_in=n_hidden[0],
            n_out = n_hidden[1],
            activation=None
        ))

    z = recg_layer[-1].sample_z(rng_share)


    gene_layer = []
    z_output = []
    random_z_output = []

    #1
    gene_layer.append(FullyConnected.FullyConnected(
            rng=rng,
            n_in=n_hidden[1],
            n_out = n_hidden[0],
            activation=activation
        ))
    
    z_output.append(gene_layer[-1].output(input=z))
    random_z_output.append(gene_layer[-1].output(input=random_z))

    #2
    gene_layer.append(FullyConnected.FullyConnected(
            rng=rng,
            n_in=n_hidden[0],
            n_out = 5*5*nkerns[-1],
            activation=activation
        ))

    if drop_inverses[0]==1:
        z_output.append(gene_layer[-1].drop_output(input=z_output[-1], drop=drop_inverse, rng=rng_share))
        random_z_output.append(gene_layer[-1].drop_output(input=random_z_output[-1], drop=drop_inverse, rng=rng_share))
    else:
        z_output.append(gene_layer[-1].output(input=z_output[-1]))
        random_z_output.append(gene_layer[-1].output(input=random_z_output[-1]))

    input_z = z_output[-1].reshape((batch_size, nkerns[-1], 5, 5))
    input_random_z = random_z_output[-1].reshape((n_batch, nkerns[-1], 5, 5))

    #1
    gene_layer.append(UnpoolConvNon.UnpoolConvNon(
            rng,
            image_shape=(batch_size, nkerns[-1], 5, 5),
            filter_shape=(nkerns[-2], nkerns[-1], 3, 3),
            poolsize=(1, 1),
            border_mode='same', 
            activation=activation
        ))
    
    z_output.append(gene_layer[-1].output(input=input_z))
    random_z_output.append(gene_layer[-1].output_random_generation(input=input_random_z, n_batch=n_batch))
    
    #2
    gene_layer.append(UnpoolConvNon.UnpoolConvNon(
            rng,
            image_shape=(batch_size, nkerns[-2], 5, 5),
            filter_shape=(nkerns[-3], nkerns[-2], 3, 3),
            poolsize=(2, 2),
            border_mode='full', 
            activation=activation
        ))
    
    z_output.append(gene_layer[-1].output(input=z_output[-1]))
    random_z_output.append(gene_layer[-1].output_random_generation(input=random_z_output[-1], n_batch=n_batch))

    #3
    gene_layer.append(UnpoolConvNon.UnpoolConvNon(
            rng,
            image_shape=(batch_size, nkerns[-3], 12, 12),
            filter_shape=(nkerns[-4], nkerns[-3], 3, 3),
            poolsize=(1, 1),
            border_mode='same', 
            activation=activation
        ))
    
    z_output.append(gene_layer[-1].output(input=z_output[-1]))
    random_z_output.append(gene_layer[-1].output_random_generation(input=random_z_output[-1], n_batch=n_batch))

    #4
    gene_layer.append(UnpoolConvNon.UnpoolConvNon(
            rng,
            image_shape=(batch_size, nkerns[-4], 12, 12),
            filter_shape=(nkerns[-5], nkerns[-4], 3, 3),
            poolsize=(1, 1),
            border_mode='same', 
            activation=activation
        ))
    
    z_output.append(gene_layer[-1].output(input=z_output[-1]))
    random_z_output.append(gene_layer[-1].output_random_generation(input=random_z_output[-1], n_batch=n_batch))

    #5 stochastic layer 
    # for the last layer, the nonliearity should be sigmoid to achieve mean of Bernoulli
    gene_layer.append(UnpoolConvNon.UnpoolConvNon(
            rng,
            image_shape=(batch_size, nkerns[-5], 12, 12),
            filter_shape=(1, nkerns[-5], 5, 5),
            poolsize=(2, 2),
            border_mode='full', 
            activation=nonlinearity.sigmoid
        ))

    z_output.append(gene_layer[-1].output(input=z_output[-1]))
    random_z_output.append(gene_layer[-1].output_random_generation(input=random_z_output[-1], n_batch=n_batch))
   
    gene_layer.append(NoParamsBernoulliVisiable.NoParamsBernoulliVisiable(
            #rng=rng,
            #mean=z_output[-1],
            #data=input_x,
        ))
    logpx = gene_layer[-1].logpx(mean=z_output[-1], data=input_x)


    # 4-D tensor of random generation
    random_x_mean = random_z_output[-1]
    random_x = gene_layer[-1].sample_x(rng_share, random_x_mean)

    #L = (logpx + logpz - logqz).sum()
    lowerbound = (
        (logpx + recg_layer[-1].logpz - recg_layer[-1].logqz).sum()
    )

    hinge_loss = classifier.hinge_loss(10, y, y_matrix) * batch_size

    #
    # D is redundent, you could just set D = 1 and tune C and weight decay parameters
    # beacuse AdaM is scale-invariant
    #
    cost = D * lowerbound - C * hinge_loss #- classifier.L2_reg
    
    px = (logpx.sum())
    pz = (recg_layer[-1].logpz.sum())
    qz = (- recg_layer[-1].logqz.sum())

    params=[]
    for g in gene_layer:
        params+=g.params
    for r in recg_layer:
        params+=r.params
    params+=classifier.params
    gparams = [T.grad(cost, param) for param in params]

    weight_decay=1.0/n_train_batches
    epsilon=1e-8
    
    #get_optimizer = optimizer.get_adam_optimizer(learning_rate=learning_rate)
    l_r = theano.shared(np.asarray(learning_rate, dtype=np.float32))
    get_optimizer = optimizer.get_adam_optimizer_max(learning_rate=l_r, 
        decay1=0.1, decay2=0.001, weight_decay=weight_decay, epsilon=epsilon)
    with open(logdir+'hook.txt', 'a') as f:
        print >>f, 'AdaM', learning_rate, weight_decay, epsilon
    updates = get_optimizer(params,gparams)

    # compiling a Theano function that computes the mistakes that are made
    # by the model on a minibatch
    test_model = theano.function(
        inputs=[index],
        outputs=[classifier.errors(y), lowerbound, hinge_loss, cost],
        #outputs=layer[-1].errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size],
            y_matrix: test_y_matrix[index * batch_size: (index + 1) * batch_size],
            drop: np.cast['int32'](0),
            drop_inverse: np.cast['int32'](0)
        }
    )

    validate_model = theano.function(
        inputs=[index],
        outputs=[classifier.errors(y), lowerbound, hinge_loss, cost],
        #outputs=layer[-1].errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size],
            y_matrix: valid_y_matrix[index * batch_size: (index + 1) * batch_size],
            drop: np.cast['int32'](0),
            drop_inverse: np.cast['int32'](0)
        }
    )

    
    '''
    Save parameters and activations
    '''

    parameters = theano.function(
        inputs=[],
        outputs=params,
    )

    train_activations = theano.function(
        inputs=[index],
        outputs=T.concatenate(activations, axis=1),
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            drop: np.cast['int32'](0),
            #drop_inverse: np.cast['int32'](0)
            #y: train_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )
    
    valid_activations = theano.function(
        inputs=[index],
        outputs=T.concatenate(activations, axis=1),
        givens={
            x: valid_set_x[index * batch_size: (index + 1) * batch_size],
            drop: np.cast['int32'](0),
            #drop_inverse: np.cast['int32'](0)
            #y: valid_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    test_activations = theano.function(
        inputs=[index],
        outputs=T.concatenate(activations, axis=1),
        givens={
            x: test_set_x[index * batch_size: (index + 1) * batch_size],
            drop: np.cast['int32'](0),
            #drop_inverse: np.cast['int32'](0)
            #y: test_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    # compiling a Theano function `train_model` that returns the cost, but
    # in the same time updates the parameter of the model based on the rules
    # defined in `updates`

    debug_model = theano.function(
        inputs=[index],
        outputs=[classifier.errors(y), lowerbound, px, pz, qz, hinge_loss, cost],
        #updates=updates,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size],
            y_matrix: train_y_matrix[index * batch_size: (index + 1) * batch_size],
            drop: np.cast['int32'](dropout_flag),
            drop_inverse: np.cast['int32'](drop_inverses_flag)
        }
    )

    random_generation = theano.function(
        inputs=[random_z],
        outputs=[random_x_mean.flatten(2), random_x.flatten(2)],
        givens={
            #drop: np.cast['int32'](0),
            drop_inverse: np.cast['int32'](0)
        }
    )
    
    train_bound_without_dropout = theano.function(
        inputs=[index],
        outputs=[classifier.errors(y), lowerbound, hinge_loss, cost],
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size],
            y_matrix: train_y_matrix[index * batch_size: (index + 1) * batch_size],
            drop: np.cast['int32'](0),
            drop_inverse: np.cast['int32'](0)
        }
    )

    train_model = theano.function(
        inputs=[index],
        outputs=[classifier.errors(y), lowerbound, hinge_loss, cost],
        updates=updates,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size],
            y_matrix: train_y_matrix[index * batch_size: (index + 1) * batch_size],
            drop: np.cast['int32'](dropout_flag),
            drop_inverse: np.cast['int32'](drop_inverses_flag)
        }
    )
    # end-snippet-5

    ##################
    # Pretrain MODEL #
    ##################
    if predir is not None:
        color.printBlue('... setting parameters')
        color.printBlue(predir)
        pre_train = np.load(predir+'model.npz')
        pre_train = pre_train['model']
        # params include w and b, exclude it
        for (para, pre) in zip(params[:-2], pre_train):
            #print pre.shape
            para.set_value(pre)
        tmp =  [debug_model(i) for i in xrange(n_train_batches)]
        tmp = (np.asarray(tmp)).mean(axis=0) / float(batch_size)
        print '------------------', tmp[1:5]
    
    # valid_error test_error  epochs
    predy_test_stats = [1, 1, 0]
    predy_valid_stats = [1, 1, 0]

    best_validation_bound = -1000000.0
    best_iter = 0
    test_score = 0.
    start_time = time.clock()
    NaN_count = 0
    epoch = 0
    threshold = 0
    validation_frequency = 1
    generatition_frequency = 10
    if predir is not None:
        threshold = 0
    color.printRed('threshold, '+str(threshold) + 
        ' generatition_frequency, '+str(generatition_frequency)
        +' validation_frequency, '+str(validation_frequency))
    done_looping = False
    decay_epochs=500
    n_epochs=600

    '''
    print 'test initialization...'
    pre_model = parameters()
    for i in xrange(len(pre_model)):
        pre_model[i] = np.asarray(pre_model[i])
        print pre_model[i].shape, np.mean(pre_model[i]), np.var(pre_model[i])
    print 'end test...'
    '''
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        train_error = 0
        train_lowerbound = 0
        train_hinge_loss = 0
        train_obj = 0
        
        test_epoch = epoch - decay_epochs
        if test_epoch > 0 and test_epoch % 10 == 0:
            print l_r.get_value()
            with open(logdir+'hook.txt', 'a') as f:
                print >>f,l_r.get_value()
            l_r.set_value(np.cast['float32'](l_r.get_value()/3.0))

        tmp_start1 = time.clock()
        for minibatch_index in xrange(n_train_batches):
            #print n_train_batches
            e, l, h, o = train_model(minibatch_index)
            train_error += e
            train_lowerbound += l
            train_hinge_loss += h
            train_obj += o
            # iteration number
            iter = (epoch - 1) * n_train_batches + minibatch_index

        
        if math.isnan(train_lowerbound):
            NaN_count+=1
            color.printRed("NaN detected. Reverting to saved best parameters")
            print '---------------NaN_count:', NaN_count
            with open(logdir+'hook.txt', 'a') as f:
                print >>f, '---------------NaN_count:', NaN_count
            
            tmp =  [debug_model(i) for i in xrange(n_train_batches)]
            tmp = (np.asarray(tmp)).mean(axis=0) / float(batch_size)
            tmp[0]*=batch_size
            print '------------------NaN check:', tmp
            with open(logdir+'hook.txt', 'a') as f:
                print >>f, '------------------NaN check:', tmp

            model = parameters()
            for i in xrange(len(model)):
                model[i] = np.asarray(model[i]).astype(np.float32)
                print model[i].shape, np.mean(model[i]), np.var(model[i])
                print np.max(model[i]), np.min(model[i])
                print np.all(np.isfinite(model[i])), np.any(np.isnan(model[i]))
                with open(logdir+'hook.txt', 'a') as f:
                    print >>f, model[i].shape, np.mean(model[i]), np.var(model[i])
                    print >>f, np.max(model[i]), np.min(model[i])
                    print >>f, np.all(np.isfinite(model[i])), np.any(np.isnan(model[i]))

            best_before = np.load(logdir+'model.npz')
            best_before = best_before['model']
            for (para, pre) in zip(params, best_before):
                para.set_value(pre)
            tmp =  [debug_model(i) for i in xrange(n_train_batches)]
            tmp = (np.asarray(tmp)).mean(axis=0) / float(batch_size)
            tmp[0]*=batch_size
            print '------------------', tmp
            continue

        n_train=n_train_batches*batch_size
        #print 'optimization_time', time.clock() - tmp_start1
        print epoch, 'stochastic training error', train_error / float(batch_size), train_lowerbound / float(n_train), train_hinge_loss / float(n_train), train_obj / float(n_train)
        with open(logdir+'hook.txt', 'a') as f:
            print >>f, epoch, 'stochastic training error', train_error / float(batch_size), train_lowerbound / float(n_train), train_hinge_loss / float(n_train), train_obj / float(n_train)

        if epoch % validation_frequency == 0:
            tmp_start2 = time.clock()
            # compute zero-one loss on validation set
            #train_stats = [train_bound_without_dropout(i) for i
            #                     in xrange(n_train_batches)]
            #this_train_stats = np.mean(train_stats, axis=0)
            #this_train_stats[1:] = this_train_stats[1:]/ float(batch_size)

            test_stats = [test_model(i) for i in xrange(n_test_batches)]
            this_test_stats = np.mean(test_stats, axis=0)
            this_test_stats[1:] = this_test_stats[1:]/ float(batch_size)
            
            print epoch, 'test error', this_test_stats
            with open(logdir+'hook.txt', 'a') as f:
                print >>f, epoch, 'test error', this_test_stats

        if epoch%100==0:
            model = parameters()
            for i in xrange(len(model)):
                model[i] = np.asarray(model[i]).astype(np.float32)
                #print model[i].shape, np.mean(model[i]), np.var(model[i])
                            
            np.savez(logdir+'model-'+str(epoch), model=model)
                
        
        tmp_start4=time.clock()
        if epoch % generatition_frequency == 0:
            tail='-'+str(epoch)+'.png'
            random_z = np.random.standard_normal((n_batch, n_hidden[-1])).astype(np.float32)
            _x_mean, _x = random_generation(random_z)
            #print _x.shape
            #print _x_mean.shape
            image = paramgraphics.mat_to_img(_x.T, dim_input, colorImg=colorImg)
            image.save(logdir+'samples'+tail, 'PNG')
            image = paramgraphics.mat_to_img(_x_mean.T, dim_input, colorImg=colorImg)
            image.save(logdir+'mean_samples'+tail, 'PNG')
        #print 'generation_time', time.clock() - tmp_start4
        

    end_time = time.clock()
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
    if NaN_count > 0:
        print '---------------NaN_count:', NaN_count
        with open(logdir+'hook.txt', 'a') as f:
            print >>f, '---------------NaN_count:', NaN_count
Exemplo n.º 7
0
def cva_6layer_dropout_mnist_60000(seed=0, dropout_flag=1, drop_inverses_flag=0, learning_rate=3e-4, predir=None, n_batch=144,
             dataset='mnist.pkl.gz', batch_size=500, nkerns=[20, 50], n_hidden=[500, 50]):

    """
    Implementation of convolutional VA (5 conv layers + 1 fully connected
    layer on each side of a Gaussian latent layer) for 28x28 single-channel
    images.

    The objective that is optimised and reported as "bound" is
    (logpx + logpz - logqz).sum() over a minibatch.

    Parameters
    ----------
    seed : seed of the numpy RandomState handed to every layer constructor.
        The Theano random stream is always created with seed 0.
    dropout_flag : value fed to the symbolic `drop` switch while training
        (0 is fed at test time). A value of 1 also tags the log directory.
    drop_inverses_flag : value fed to the symbolic `drop_inverse` switch
        while training. A value of 1 also tags the log directory.
    learning_rate : initial learning rate; after epoch 500 it is divided by
        3 every 10 epochs.
    predir : if not None, prefix of a 'model.npz' file whose 'model' entry is
        loaded into the parameters before training.
    n_batch : number of images drawn at every random generation.
    dataset : forwarded to datapy.load_data_gpu_60000.
    batch_size : minibatch size; baked into the layer shapes.
    nkerns, n_hidden : ignored, both are overwritten by the fixed
        architecture below (kept only for call compatibility).

    Side effects
    ------------
    Creates a timestamped directory under results/supervised/cva/mnist/ and
    writes hook.txt (text log), model-<epoch>.npz and the train/valid/test
    feature arrays every 100 epochs, and sample images every 10 epochs.

    Returns None. Training stops early (plain return) the first time the
    summed training cost of an epoch is NaN.
    """
    #cp->cd->cpd->cd->c
    nkerns=[32, 32, 64, 64, 64]
    # NOTE(review): only five flags are given, so the fully connected
    # recognition layer (which reads drops[-1]) shares the flag of the fifth
    # conv layer -- confirm this is intended.
    drops=[1, 0, 1, 0, 0]
    n_hidden=[500, 50]
    drop_inverses=[1,]
    # 28->12->12->5->5/5*5*64->500->50->500->5*5*64/5->5->12->12->28

    # Every layer shape below is hard-wired to 28x28 single-channel input, so
    # the image geometry used when saving samples does not depend on the
    # dataset name (previously these two names were undefined unless
    # dataset == 'mnist.pkl.gz', which crashed at the first generation step).
    dim_input=(28, 28)
    colorImg=False

    logdir = 'results/supervised/cva/mnist/cva_6layer_mnist_60000'+str(nkerns)+str(n_hidden)+'_'+str(learning_rate)+'_'
    if predir is not None:
        logdir +='pre_'
    if dropout_flag == 1:
        logdir += ('dropout_'+str(drops)+'_')
    if drop_inverses_flag==1:
        logdir += ('inversedropout_'+str(drop_inverses)+'_')
    logdir += str(int(time.time()))+'/'

    if not os.path.exists(logdir): os.makedirs(logdir)
    hook_path = logdir+'hook.txt'

    def log_line(*items):
        # Print one space-separated line to stdout and append it to hook.txt.
        line = ' '.join([str(item) for item in items])
        print(line)
        with open(hook_path, 'a') as f:
            f.write(line + '\n')

    log_line('logdir:', logdir, 'predir', predir)
    log_line('cva_6layer_mnist_60000', nkerns, n_hidden, seed, drops, drop_inverses, dropout_flag, drop_inverses_flag)

    datasets = datapy.load_data_gpu_60000(dataset, have_matrix=True)

    # Only the images are used; labels and label matrices are discarded.
    train_set_x, _, _ = datasets[0]
    valid_set_x, _, _ = datasets[1]
    test_set_x, _, _ = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images
    random_z = T.matrix('random_z')  # latent codes fed in for generation

    drop = T.iscalar('drop')
    drop_inverse = T.iscalar('drop_inverse')

    activation = nonlinearity.relu

    rng = np.random.RandomState(seed)
    rng_share = theano.tensor.shared_randomstreams.RandomStreams(0)
    input_x = x.reshape((batch_size, 1, 28, 28))

    def batch_of(data):
        # Symbolic slice selecting minibatch `index` of a shared dataset.
        return data[index * batch_size:(index + 1) * batch_size]

    # ---- recognition model -------------------------------------------------
    recg_layer = []

    # (input channels, input side, output channels, filter side, poolsize, border_mode)
    conv_specs = [
        (1,         28, nkerns[0], 5, (2, 2), 'valid'),
        (nkerns[0], 12, nkerns[1], 3, (1, 1), 'same'),
        (nkerns[1], 12, nkerns[2], 3, (2, 2), 'valid'),
        (nkerns[2],  5, nkerns[3], 3, (1, 1), 'same'),
        (nkerns[3],  5, nkerns[4], 3, (1, 1), 'same'),
    ]
    conv_out = input_x
    for use_drop, (c_in, side, c_out, k, pool, mode) in zip(drops, conv_specs):
        layer = ConvMaxPool.ConvMaxPool(
            rng,
            image_shape=(batch_size, c_in, side, side),
            filter_shape=(c_out, c_in, k, k),
            poolsize=pool,
            border_mode=mode,
            activation=activation
        )
        recg_layer.append(layer)
        if use_drop==1:
            conv_out = layer.drop_output(conv_out, drop=drop, rng=rng_share)
        else:
            conv_out = layer.output(conv_out)

    mlp_input_x = conv_out.flatten(2)

    # Outputs that are exported as features every 100 epochs.
    activations = []

    recg_layer.append(FullyConnected.FullyConnected(
            rng=rng,
            n_in= 5 * 5 * nkerns[-1],
            n_out=n_hidden[0],
            activation=activation
        ))
    if drops[-1]==1:
        activations.append(recg_layer[-1].drop_output(input=mlp_input_x, drop=drop, rng=rng_share))
    else:
        activations.append(recg_layer[-1].output(input=mlp_input_x))

    #stochastic layer
    recg_layer.append(GaussianHidden.GaussianHidden(
            rng=rng,
            input=activations[-1],
            n_in=n_hidden[0],
            n_out = n_hidden[1],
            activation=None
        ))

    z = recg_layer[-1].sample_z(rng_share)

    # ---- generative model --------------------------------------------------
    # Two graphs are built through the same layers: one from the sampled z
    # (training / evaluation) and one from the externally supplied random_z
    # (sample generation, with n_batch rows instead of batch_size).
    gene_layer = []

    gene_layer.append(FullyConnected.FullyConnected(
            rng=rng,
            n_in=n_hidden[1],
            n_out = n_hidden[0],
            activation=activation
        ))
    recon = gene_layer[-1].output(input=z)
    random_recon = gene_layer[-1].output(input=random_z)

    gene_layer.append(FullyConnected.FullyConnected(
            rng=rng,
            n_in=n_hidden[0],
            n_out = 5*5*nkerns[-1],
            activation=activation
        ))
    if drop_inverses[0]==1:
        recon = gene_layer[-1].drop_output(input=recon, drop=drop_inverse, rng=rng_share)
        random_recon = gene_layer[-1].drop_output(input=random_recon, drop=drop_inverse, rng=rng_share)
    else:
        recon = gene_layer[-1].output(input=recon)
        random_recon = gene_layer[-1].output(input=random_recon)

    recon = recon.reshape((batch_size, nkerns[-1], 5, 5))
    random_recon = random_recon.reshape((n_batch, nkerns[-1], 5, 5))

    # (input channels, input side, output channels, filter side, poolsize, border_mode, activation)
    # for the last layer, the nonlinearity should be sigmoid to achieve mean of Bernoulli
    unpool_specs = [
        (nkerns[-1],  5, nkerns[-2], 3, (1, 1), 'same', activation),
        (nkerns[-2],  5, nkerns[-3], 3, (2, 2), 'full', activation),
        (nkerns[-3], 12, nkerns[-4], 3, (1, 1), 'same', activation),
        (nkerns[-4], 12, nkerns[-5], 3, (1, 1), 'same', activation),
        (nkerns[-5], 12, 1,          5, (2, 2), 'full', nonlinearity.sigmoid),
    ]
    for c_in, side, c_out, k, pool, mode, act in unpool_specs:
        layer = UnpoolConvNon.UnpoolConvNon(
            rng,
            image_shape=(batch_size, c_in, side, side),
            filter_shape=(c_out, c_in, k, k),
            poolsize=pool,
            border_mode=mode,
            activation=act
        )
        gene_layer.append(layer)
        recon = layer.output(input=recon)
        random_recon = layer.output_random_generation(input=random_recon, n_batch=n_batch)

    gene_layer.append(NoParamsBernoulliVisiable.NoParamsBernoulliVisiable())
    logpx = gene_layer[-1].logpx(mean=recon, data=input_x)

    # 4-D tensor of random generation
    random_x_mean = random_recon
    random_x = gene_layer[-1].sample_x(rng_share, random_x_mean)

    #L = (logpx + logpz - logqz).sum()
    cost = (
        (logpx + recg_layer[-1].logpz - recg_layer[-1].logqz).sum()
    )

    px = (logpx.sum())
    pz = (recg_layer[-1].logpz.sum())
    qz = (- recg_layer[-1].logqz.sum())

    # Generative parameters first, then recognition parameters: checkpoints
    # store the values in exactly this order.
    params=[]
    for g in gene_layer:
        params+=g.params
    for r in recg_layer:
        params+=r.params
    gparams = [T.grad(cost, param) for param in params]

    weight_decay=1.0/n_train_batches
    l_r = theano.shared(np.asarray(learning_rate, dtype=np.float32))
    get_optimizer = optimizer.get_adam_optimizer_max(learning_rate=l_r, 
        decay1=0.1, decay2=0.001, weight_decay=weight_decay, epsilon=1e-8)
    with open(hook_path, 'a') as f:
        f.write(' '.join([str(item) for item in ('AdaM', learning_rate, weight_decay)]) + '\n')
    updates = get_optimizer(params,gparams)

    # Cost on one test minibatch with both dropout switches off.
    test_model = theano.function(
        inputs=[index],
        outputs=cost,
        givens={
            x: batch_of(test_set_x),
            drop: np.cast['int32'](0),
            drop_inverse: np.cast['int32'](0)
        }
    )

    '''
    Save parameters and activations
    '''

    parameters = theano.function(
        inputs=[],
        outputs=params,
    )

    def activation_function(data):
        # Compile a function returning the exported features of one minibatch.
        return theano.function(
            inputs=[index],
            outputs=T.concatenate(activations, axis=1),
            givens={
                x: batch_of(data),
                drop: np.cast['int32'](0),
            }
        )

    train_activations = activation_function(train_set_x)
    valid_activations = activation_function(valid_set_x)
    test_activations = activation_function(test_set_x)

    # Same switches as training but without parameter updates; used to report
    # the individual terms of the objective.
    debug_model = theano.function(
        inputs=[index],
        outputs=[cost, px, pz, qz],
        givens={
            x: batch_of(train_set_x),
            drop: np.cast['int32'](dropout_flag),
            drop_inverse: np.cast['int32'](drop_inverses_flag)
        }
    )

    random_generation = theano.function(
        inputs=[random_z],
        outputs=[random_x_mean.flatten(2), random_x.flatten(2)],
        givens={
            drop_inverse: np.cast['int32'](0)
        }
    )

    # compiling a Theano function `train_model` that returns the cost, but
    # in the same time updates the parameter of the model based on the rules
    # defined in `updates`
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: batch_of(train_set_x),
            drop: np.cast['int32'](dropout_flag),
            drop_inverse: np.cast['int32'](drop_inverses_flag)
        }
    )

    def debug_stats():
        # [cost, px, pz, qz] averaged over all training batches, per example.
        stats = [debug_model(i) for i in xrange(n_train_batches)]
        return (np.asarray(stats)).mean(axis=0) / float(batch_size)

    def collect_features(extract, n_batches):
        # Stack the per-batch feature matrices of a whole data split.
        return np.vstack([np.asarray(extract(i)) for i in xrange(n_batches)])

    ##################
    # Pretrain MODEL #
    ##################
    if predir is not None:
        color.printBlue('... setting parameters')
        color.printBlue(predir)
        pre_train = np.load(predir+'model.npz')
        pre_train = pre_train['model']
        for (para, pre) in zip(params, pre_train):
            para.set_value(pre)
        print('------------------ ' + str(debug_stats()))

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')

    start_time = time.clock()
    NaN_count = 0
    threshold = 0
    validation_frequency = 1
    generatition_frequency = 10
    color.printRed('threshold, '+str(threshold) + 
        ' generatition_frequency, '+str(generatition_frequency)
        +' validation_frequency, '+str(validation_frequency))
    n_epochs = 600
    decay_epochs = 500
    # Path of the most recent checkpoint written by this run (None until the
    # first one exists); used to restore parameters after a NaN.
    last_checkpoint = None

    for epoch in xrange(1, n_epochs + 1):
        # After `decay_epochs`, shrink the learning rate every 10 epochs.
        test_epoch = epoch - decay_epochs
        if test_epoch > 0 and test_epoch % 10 == 0:
            log_line(l_r.get_value())
            l_r.set_value(np.cast['float32'](l_r.get_value()/3.0))

        minibatch_avg_cost = 0
        for minibatch_index in xrange(n_train_batches):
            minibatch_avg_cost += train_model(minibatch_index)

        if math.isnan(minibatch_avg_cost):
            NaN_count+=1
            color.printRed("NaN detected. Reverting to saved best parameters")
            log_line('---------------NaN_count:', NaN_count)
            log_line('------------------NaN check:', debug_stats())

            # Dump summary statistics of every parameter for diagnosis.
            for value in parameters():
                value = np.asarray(value).astype(np.float32)
                log_line(value.shape, np.mean(value), np.var(value))
                log_line(np.max(value), np.min(value))
                log_line(np.all(np.isfinite(value)), np.any(np.isnan(value)))

            # Checkpoints are written as model-<epoch>.npz, so restore from
            # the latest one of this run (a plain 'model.npz' never exists in
            # the freshly created log directory).
            if last_checkpoint is None:
                log_line('no checkpoint saved yet; parameters not restored')
            else:
                best_before = np.load(last_checkpoint)
                best_before = best_before['model']
                for (para, pre) in zip(params, best_before):
                    para.set_value(pre)
                print('------------------ ' + str(debug_stats()))
            return

        log_line(epoch, 'stochastic training error', minibatch_avg_cost / float(n_train_batches*batch_size))

        if epoch % validation_frequency == 0:
            test_losses = [test_model(i) for i
                                 in xrange(n_test_batches)]
            this_test_bound = np.mean(test_losses)/float(batch_size)
            log_line(epoch, 'test bound', this_test_bound)

        if epoch%100==0:
            model = [np.asarray(value).astype(np.float32) for value in parameters()]
            np.savez(logdir+'model-'+str(epoch), model=model)
            # np.savez appends the '.npz' suffix itself.
            last_checkpoint = logdir+'model-'+str(epoch)+'.npz'

            train_features = collect_features(train_activations, n_train_batches)
            valid_features = collect_features(valid_activations, n_valid_batches)
            test_features = collect_features(test_activations, n_test_batches)
            np.save(logdir+'train_features', train_features)
            np.save(logdir+'valid_features', valid_features)
            np.save(logdir+'test_features', test_features)

        if epoch % generatition_frequency == 0:
            tail='-'+str(epoch)+'.png'
            # Drawn from numpy's global generator, i.e. not controlled by `seed`.
            z_samples = np.random.standard_normal((n_batch, n_hidden[-1])).astype(np.float32)
            _x_mean, _x = random_generation(z_samples)
            image = paramgraphics.mat_to_img(_x.T, dim_input, colorImg=colorImg)
            image.save(logdir+'samples'+tail, 'PNG')
            image = paramgraphics.mat_to_img(_x_mean.T, dim_input, colorImg=colorImg)
            image.save(logdir+'mean_samples'+tail, 'PNG')

    end_time = time.clock()
    sys.stderr.write('The code for file ' +
                     os.path.split(__file__)[1] +
                     ' ran for %.2fm' % ((end_time - start_time) / 60.) + '\n')
Exemplo n.º 8
0
def deep_cnn_6layer_svhn_final_svm(learning_rate=0.01,
                                   n_epochs=500,
                                   dataset='svhngcn_var',
                                   batch_size=500,
                                   dropout_flag=1,
                                   seed=0,
                                   predir=None,
                                   preepoch=10,
                                   activation=None,
                                   weight_decay=1e-4):
    """Train a five-conv-layer network with a Pegasos classifier on SVHN.

    Five ``ConvMaxPool_GauInit_DNN`` layers (pooling factors 2, 1, 2, 1, 2)
    feed a ``Pegasos.Pegasos`` layer whose ``hinge_loss`` is minimised.
    A time-stamped log directory is created under
    ``results/supervised/cnn/svhn/``; it receives ``hook.txt`` (progress,
    also echoed to stdout), ``delta_w.txt`` and ``pog.txt`` (parameter and
    gradient statistics) and, every fifth epoch, ``svhn_model-<epoch>.npz``.

    The environment variables ``epoch_threshold``, ``first_drop``,
    ``last_drop``, ``nkerns_1``, ``nkerns_2``, ``opt_med``, ``std``,
    ``pattern`` and ``Loss_L`` override the corresponding defaults.

    Args:
        learning_rate: Initial learning rate. It is divided by 3 whenever
            7 epochs have passed since the last validation improvement or
            the last decay.
        n_epochs: NOTE(review): currently ignored; the training loop
            hard-codes 100 epochs. Confirm whether that is intended.
        dataset: Name handed to ``datapy.load_data_svhn`` and embedded in
            the log directory name.
        batch_size: Minibatch size; a trailing partial minibatch is dropped.
        dropout_flag: Value fed to the symbolic ``drop`` input while
            training; ``1`` additionally selects the ``_dropout_`` log
            directory naming.
        seed: Only written to the log; both random generators are seeded
            with 0.
        predir: If not None, parameters are loaded from
            ``predir + 'svhn_model-' + str(preepoch) + '.npz'`` before
            training and the resulting test error is logged.
        preepoch: Epoch suffix of the checkpoint to load from ``predir``.
        activation: Either one of the strings ``'nonlinearity.relu'``,
            ``'nonlinearity.tanh'``, ``'nonlinearity.softplus'`` (mapped to
            the matching attribute of ``nonlinearity``) or a value passed to
            the layers unchanged.
        weight_decay: Passed to the optimizer factory.

    Raises:
        ValueError: If ``opt_med`` is neither ``'adam'`` nor ``'mom'``.
    """
    n_channels = 3
    dim_w = 32
    dim_h = 32
    n_classes = 10

    def env(name, default, cast):
        """Return ``cast(os.environ[name])`` when set, otherwise ``default``."""
        if name in os.environ:
            return cast(os.environ[name])
        return default

    def fmt(*items):
        """Join items with single spaces, as the print statement would."""
        return ' '.join(str(item) for item in items)

    epoch_threshold = env('epoch_threshold', 200, int)
    first_drop = env('first_drop', 0.6, float)
    last_drop = env('last_drop', 1, float)
    nkerns_1 = env('nkerns_1', 96, int)
    nkerns_2 = env('nkerns_2', 96, int)
    opt_med = env('opt_med', 'adam', str)
    # The override used to stay a raw string; convert it like the other
    # numeric settings so the layers receive a number.
    std = env('std', 2e-2, float)
    pattern = env('pattern', 'hinge', str)
    Loss_L = env('Loss_L', 1, float)

    # cp->cd->cpd->cd->c
    nkerns = [nkerns_1, nkerns_1, nkerns_1, nkerns_2, nkerns_2]
    drops = [0, 1, 1, 1, 0, 1]
    drop_p = [1, first_drop, first_drop, first_drop, 1, last_drop]

    name_head = ('results/supervised/cnn/svhn/deep_cnn_6layer_' + pattern +
                 '_' + dataset)
    name_tail = ('_' + str(weight_decay) + '_' + str(learning_rate) + '_' +
                 str(std) + '_' + str(Loss_L))
    stamp = str(int(time.time()))
    if dropout_flag == 1:
        logdir = (name_head + str(drop_p) + str(nkerns) + str(drops) +
                  name_tail + '_dropout_' + stamp + '/')
    else:
        logdir = (name_head + str(nkerns) + str(drops) + name_tail + '_' +
                  stamp + '/')
    if not os.path.exists(logdir):
        os.makedirs(logdir)

    def log_hook(line):
        """Append one line to hook.txt in the log directory."""
        with open(logdir + 'hook.txt', 'a') as f:
            f.write(line + '\n')

    def report(*items):
        """Echo a line to stdout and append it to hook.txt."""
        line = fmt(*items)
        print(line)
        log_hook(line)

    logdir_line = fmt('logdir:', logdir)
    config_line = fmt('deep_cnn_6layer_svm', nkerns, drops, drop_p, seed,
                      dropout_flag)
    opt_line = fmt('epoch_threshold', epoch_threshold, 'opt_med', opt_med)
    print(logdir_line)
    print(config_line)
    print(opt_line)
    log_hook(logdir_line)
    log_hook(opt_line)
    log_hook(config_line)

    # NOTE(review): both generators use a fixed seed of 0; `seed` is not used.
    rng = np.random.RandomState(0)
    rng_share = theano.tensor.shared_randomstreams.RandomStreams(0)

    color.printRed('dataset ' + dataset)

    datasets = datapy.load_data_svhn(dataset, have_matrix=True)
    train_set_x, train_set_y, train_y_matrix = datasets[0]
    test_set_x, test_set_y, test_y_matrix = datasets[1]
    valid_set_x, valid_set_y, valid_y_matrix = datasets[2]

    # number of whole minibatches in each split
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    # symbolic inputs
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of int labels
    drop = T.iscalar('drop')  # dropout switch fed through `givens`
    y_matrix = T.imatrix('y_matrix')  # labels as a 2D int matrix

    def minibatch(shared):
        """Symbolic slice selecting minibatch `index` of a shared dataset."""
        return shared[index * batch_size:(index + 1) * batch_size]

    print('... building the model')

    layer0_input = x.reshape((batch_size, n_channels, dim_h, dim_w))

    if activation == 'nonlinearity.relu':
        activation = nonlinearity.relu
    elif activation == 'nonlinearity.tanh':
        activation = nonlinearity.tanh
    elif activation == 'nonlinearity.softplus':
        activation = nonlinearity.softplus

    recg_layer = []
    cnn_output = []
    # `l` and `d` hold two entries per layer and are forwarded to the
    # optimizer together with the parameters.
    l = []
    d = []

    # (filter size, pooling factor) of the five conv layers, in order.
    conv_specs = [(5, 2), (3, 1), (3, 2), (3, 1), (3, 2)]
    in_maps, height, width = n_channels, dim_h, dim_w
    layer_input = layer0_input
    for k, (ksize, pool) in enumerate(conv_specs):
        layer = ConvMaxPool_GauInit_DNN.ConvMaxPool_GauInit_DNN(
            rng,
            image_shape=(batch_size, in_maps, height, width),
            filter_shape=(nkerns[k], in_maps, ksize, ksize),
            poolsize=(pool, pool),
            border_mode='same',
            activation=activation,
            std=std)
        recg_layer.append(layer)
        if drops[k] == 1:
            layer_input = layer.drop_output(layer_input,
                                            drop=drop,
                                            rng=rng_share,
                                            p=drop_p[k])
        else:
            layer_input = layer.output(layer_input)
        cnn_output.append(layer_input)
        l += [1, 2]
        d += [1, 0]
        in_maps = nkerns[k]
        height //= pool
        width //= pool

    feature = cnn_output[-1].flatten(2)

    # The classifier shares `std` and `weight_decay` with the conv layers.
    std_pegasos = std
    weight_decay_pegasos = weight_decay
    classifier = Pegasos.Pegasos(input=feature,
                                 rng=rng,
                                 n_in=nkerns[-1] * height * width,  # 4 x 4 maps
                                 n_out=n_classes,
                                 weight_decay=0,
                                 loss=Loss_L,
                                 std=std_pegasos,
                                 pattern=pattern)

    # Both decays are the same value, so the ratio is 1; guard the division
    # so weight_decay == 0 no longer raises ZeroDivisionError.
    if weight_decay != 0:
        decay_ratio = weight_decay_pegasos / weight_decay
    else:
        decay_ratio = 1.0
    l += [1, 2]
    d += [decay_ratio, 0]

    # the cost we minimize during training is the classifier's hinge loss
    cost = classifier.hinge_loss(n_classes, y, y_matrix)

    params = []
    for r in recg_layer:
        params += r.params
    params += classifier.params

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)
    l_r = theano.shared(np.asarray(learning_rate, dtype=np.float32))
    if opt_med == 'adam':
        get_optimizer = optimizer_separated.get_adam_optimizer_min(
            learning_rate=l_r,
            decay1=0.1,
            decay2=0.001,
            weight_decay=weight_decay)
    elif opt_med == 'mom':
        get_optimizer = optimizer_separated.get_momentum_optimizer_min(
            learning_rate=l_r, weight_decay=weight_decay)
    else:
        raise ValueError("opt_med must be 'adam' or 'mom', got %r" %
                         (opt_med,))
    updates = get_optimizer(w=params, g=grads, l=l, d=d)

    # Per parameter: max, mean square, mean squared gradient and
    # sqrt(mean square / mean squared gradient) / 1e3.
    pog = []
    for (p, g) in zip(params, grads):
        pog.append(p.max())
        pog.append((p**2).mean())
        pog.append((g**2).mean())
        pog.append((T.sqrt(pog[-2] / pog[-1])) / 1e3)

    paramovergrad = theano.function(
        inputs=[index],
        outputs=pog,
        givens={
            x: minibatch(train_set_x),
            y: minibatch(train_set_y),
            y_matrix: minibatch(train_y_matrix),
            drop: np.cast['int32'](dropout_flag)
        })

    parameters = theano.function(
        inputs=[],
        outputs=params,
    )

    # functions computing the error rate with dropout switched off
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: minibatch(test_set_x),
            y: minibatch(test_set_y),
            drop: np.cast['int32'](0)
        })

    valid_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: minibatch(valid_set_x),
            y: minibatch(valid_set_y),
            drop: np.cast['int32'](0)
        })

    train_model = theano.function(
        inputs=[index],
        outputs=[cost, classifier.errors(y)],
        updates=updates,
        givens={
            x: minibatch(train_set_x),
            y: minibatch(train_set_y),
            y_matrix: minibatch(train_y_matrix),
            drop: np.cast['int32'](dropout_flag)
        })

    def snapshot():
        """Return the current parameter values as float32 arrays."""
        return [np.asarray(w).astype(np.float32) for w in parameters()]

    if predir is not None:
        color.printBlue('... setting parameters')
        color.printBlue(predir)
        pre_train = np.load(predir + 'svhn_model-' + str(preepoch) + '.npz')
        pre_train = pre_train['model']
        for (para, pre) in zip(params, pre_train):
            para.set_value(pre)
        this_test_score = np.mean(
            [test_model(i) for i in range(n_test_batches)])
        prescore_line = fmt('preepoch', preepoch, 'prescore', this_test_score)
        print(prescore_line)
        log_hook(str(predir))
        log_hook(prescore_line)

    print('... training')
    # Validate ten times per epoch; never let the period drop to 0, which
    # would make the modulo below raise when there are fewer than 10 batches.
    validation_frequency = max(1, n_train_batches // 10)
    best_valid_score = 10000.0
    best_epoch = 0
    test_score = 0
    start_time = time.clock()
    epoch = 0
    # NOTE(review): overrides the `n_epochs` argument; kept as in the
    # original so existing runs behave the same.
    n_epochs = 100
    record = 0  # epoch of the last improvement or learning-rate decay

    while epoch < n_epochs:
        epoch = epoch + 1
        preW = None
        currentW = None
        minibatch_avg_cost = 0
        train_error = 0
        if (epoch - record) >= 7:
            record = epoch
            l_r.set_value(np.cast['float32'](l_r.get_value() / 3.0))
            report('---------', epoch, l_r.get_value())

        if epoch % 5 == 0:
            np.savez(logdir + 'svhn_model-' + str(epoch), model=snapshot())

        for minibatch_index in range(n_train_batches):

            if minibatch_index < 11:
                # Log how much each parameter moved during the previous
                # update, for the first few minibatches of every epoch.
                preW = currentW
                currentW = snapshot()
                if preW is not None:
                    with open(logdir + 'delta_w.txt', 'a') as f:
                        for (c, p) in zip(currentW, preW):
                            w_sq = (c**2).mean()
                            dw_sq = ((c - p)**2).mean()
                            f.write(fmt(minibatch_index, w_sq, dw_sq,
                                        np.sqrt(w_sq / dw_sq)) + '\n')

            co, te = train_model(minibatch_index)
            minibatch_avg_cost += co
            train_error += te
            iteration = (epoch - 1) * n_train_batches + minibatch_index

            if (iteration + 1) % validation_frequency == 0:
                # minibatch_index + 1 batches have been accumulated so far.
                seen = float(minibatch_index + 1)
                report(epoch, minibatch_index)
                report('Stochastic hinge loss and training error',
                       minibatch_avg_cost / seen, train_error / seen)

                this_valid_score = np.mean(
                    [valid_model(i) for i in range(n_valid_batches)])
                report('epoch %i, minibatch %i/%i, valid error %f %%' % (
                    epoch,
                    minibatch_index + 1,
                    n_train_batches,
                    this_valid_score * 100.))

                if this_valid_score < best_valid_score:
                    this_test_score = np.mean(
                        [test_model(i) for i in range(n_test_batches)])
                    best_valid_score = this_valid_score
                    test_score = this_test_score
                    best_epoch = epoch
                    record = epoch
                    report('Update best model', this_test_score)
                report('So far best model', best_epoch, test_score)

        pogzero = np.asarray(paramovergrad(0))
        with open(logdir + 'pog.txt', 'a') as f:
            f.write(fmt('pogzero', pogzero) + '\n')

    report('So far best model', test_score)

    end_time = time.clock()
    elapsed = end_time - start_time
    print('The code run for %d epochs, with %f epochs/sec' % (
        epoch, 1. * epoch / elapsed))
    sys.stderr.write('The code for file %s ran for %.1fs\n' % (
        os.path.split(__file__)[1], elapsed))
Exemplo n.º 9
0
def c_6layer_svhn_features(learning_rate=0.01,
            n_epochs=600,
            dataset='svhngcn_var',
            batch_size=1000,
            dropout_flag=1,
            seed=0,
            predir=None,
            activation=None,
            n_batch=625,
            weight_decay=1e-4,
            super_predir=None,
            super_preepoch=None):
    """Extract convolutional features of SVHN with a pre-trained model.

    The full encoder/decoder graph is rebuilt so that its parameter list
    lines up with the saved checkpoint, the checkpoint is loaded, and the
    flattened output of the last convolutional layer (dropout switched off)
    is computed for every whole minibatch of the validation, test and
    training splits.

    Output files (paths are built as ``logdir + name``):
        * ``svhn_features.bin``: the stacked validation features followed
          by the stacked test features, written with two ``np.save`` calls.
        * ``svhn_train_features.bin``: one ``np.save`` record per training
          minibatch.

    The environment variables ``first_drop``, ``last_drop``, ``nkerns_1``,
    ``nkerns_2``, ``train_logvar``, ``dataset``, ``n_z`` and ``model_epoch``
    override the corresponding defaults.

    Args:
        dataset: NOTE(review): ignored; the data set is ``'svhnlcn'`` unless
            the ``dataset`` environment variable is set.
        batch_size: Minibatch size; a trailing partial minibatch is dropped.
        seed: Seed of the NumPy generator handed to the layers.
        predir: Directory prefix of the checkpoint. ``model-<model_epoch>.npz``
            is loaded, or ``best-model.npz`` when ``model_epoch`` is -1.
            Required.
        n_batch: Batch size of the random-generation branch of the decoder.
        learning_rate, n_epochs, dropout_flag, activation, weight_decay,
        super_predir, super_preepoch: Unused by this function.

    Raises:
        SystemExit: If ``predir`` is None (or the hard-coded checkpoint type
            is unknown); the reason is written to stderr.
    """
    import sys  # local import: the original block did not reference `sys`

    n_channels = 3
    dim_w = 32
    dim_h = 32

    def env(name, default, cast):
        """Return ``cast(os.environ[name])`` when set, otherwise ``default``."""
        if name in os.environ:
            return cast(os.environ[name])
        return default

    first_drop = env('first_drop', 0.6, float)
    last_drop = env('last_drop', 1, float)
    nkerns_1 = env('nkerns_1', 96, int)
    nkerns_2 = env('nkerns_2', 96, int)
    train_logvar = env('train_logvar', True, lambda v: bool(int(v)))
    dataset = env('dataset', 'svhnlcn', str)
    n_z = env('n_z', 256, int)

    # cp->cd->cpd->cd->c
    nkerns = [nkerns_1, nkerns_1, nkerns_1, nkerns_2, nkerns_2]
    drops = [0, 1, 1, 1, 0, 1]
    drop_p = [1, first_drop, first_drop, first_drop, 1, last_drop]
    n_hidden = [n_z]

    # NOTE(review): there is no trailing separator, so the output files are
    # created next to this directory rather than inside it. Left unchanged in
    # case other scripts read the existing paths; confirm before fixing.
    logdir = 'results/supervised/cva/svhn_features/cva_6layer_svhn'
    if not os.path.exists(logdir):
        os.makedirs(logdir)
    print('logdir: %s predir %s' % (logdir, predir))

    color.printRed('dataset '+dataset)

    datasets = datapy.load_data_svhn(dataset, have_matrix=False)
    train_set_x, train_set_y = datasets[0]
    test_set_x, test_set_y = datasets[1]
    valid_set_x, valid_set_y = datasets[2]

    # number of whole minibatches in each split
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images
    random_z = T.matrix('random_z')  # latent input of the generation branch
    drop = T.iscalar('drop')  # dropout switch fed through `givens`

    # The `activation` argument is not consulted; ReLU is always used.
    activation = nonlinearity.relu

    rng = np.random.RandomState(seed)
    rng_share = theano.tensor.shared_randomstreams.RandomStreams(0)

    input_x = x.reshape((batch_size, n_channels, dim_h, dim_w))

    # ---- recognition (encoder) network -------------------------------
    recg_layer = []
    cnn_output = []

    # (filter size, pooling factor) of the five conv layers, in order.
    enc_specs = [(5, 2), (3, 1), (3, 2), (3, 1), (3, 2)]
    in_maps, height, width = n_channels, dim_h, dim_w
    layer_input = input_x
    for k, (ksize, pool) in enumerate(enc_specs):
        recg_layer.append(ConvMaxPool_GauInit_DNN.ConvMaxPool_GauInit_DNN(
            rng,
            image_shape=(batch_size, in_maps, height, width),
            filter_shape=(nkerns[k], in_maps, ksize, ksize),
            poolsize=(pool, pool),
            border_mode='same',
            activation=activation
        ))
        if drops[k] == 1:
            layer_input = recg_layer[-1].drop_output(
                layer_input, drop=drop, rng=rng_share, p=drop_p[k])
        else:
            layer_input = recg_layer[-1].output(layer_input)
        cnn_output.append(layer_input)
        in_maps = nkerns[k]
        height //= pool
        width //= pool
    # After the loop the feature maps are height x width = 4 x 4.

    mlp_input_x = cnn_output[-1].flatten(2)

    # The extracted features are exactly the entries of this list.
    activations = [mlp_input_x]

    # stochastic layer (no MLP in between)
    recg_layer.append(GaussianHidden.GaussianHidden(
            rng=rng,
            input=activations[-1],
            n_in=height * width * nkerns[-1],
            n_out=n_hidden[0],
            activation=None
        ))

    z = recg_layer[-1].sample_z(rng_share)

    # ---- generative (decoder) network --------------------------------
    # The decoder is not needed to compute the features, but it is built
    # exactly as before because `params` (decoder first, then encoder) is
    # zipped positionally with the checkpoint below.
    gene_layer = []

    gene_layer.append(FullyConnected.FullyConnected(
            rng=rng,
            n_in=n_hidden[0],
            n_out=height * width * nkerns[-1],
            activation=activation
        ))
    z_flat = gene_layer[-1].output(input=z)
    random_z_flat = gene_layer[-1].output(input=random_z)

    dec_in = z_flat.reshape((batch_size, nkerns[-1], height, width))
    dec_rand_in = random_z_flat.reshape((n_batch, nkerns[-1], height, width))

    # Unpooling factors of decoder layers 1-4 (all with 3x3 filters).
    dec_pools = [2, 1, 2, 1]
    dec_h, dec_w = height, width
    for k, pool in enumerate(dec_pools, 1):
        gene_layer.append(UnpoolConvNon_GauInit_DNN.UnpoolConvNon_GauInit_DNN(
                rng,
                image_shape=(batch_size, nkerns[-k], dec_h, dec_w),
                filter_shape=(nkerns[-k - 1], nkerns[-k], 3, 3),
                poolsize=(pool, pool),
                border_mode='same',
                activation=activation
            ))
        dec_in = gene_layer[-1].output(input=dec_in)
        dec_rand_in = gene_layer[-1].output_random_generation(
            input=dec_rand_in, n_batch=n_batch)
        dec_h *= pool
        dec_w *= pool

    # 5-1 stochastic layer: activation is None to get a Gaussian mean
    gene_layer.append(UnpoolConvNon_GauInit_DNN.UnpoolConvNon_GauInit_DNN(
            rng,
            image_shape=(batch_size, nkerns[-5], dec_h, dec_w),
            filter_shape=(n_channels, nkerns[-5], 5, 5),
            poolsize=(2, 2),
            border_mode='same',
            activation=None
        ))
    x_mean = gene_layer[-1].output(input=dec_in)
    random_x_mean = gene_layer[-1].output_random_generation(
        input=dec_rand_in, n_batch=n_batch)

    # 5-2 stochastic layer: activation is None to get logvar
    if train_logvar:
        gene_layer.append(UnpoolConvNon_GauInit_DNN.UnpoolConvNon_GauInit_DNN(
                rng,
                image_shape=(batch_size, nkerns[-5], dec_h, dec_w),
                filter_shape=(n_channels, nkerns[-5], 5, 5),
                poolsize=(2, 2),
                border_mode='same',
                activation=None
            ))
        x_logvar = gene_layer[-1].output(input=dec_in)
        random_x_logvar = gene_layer[-1].output_random_generation(
            input=dec_rand_in, n_batch=n_batch)
    else:
        x_logvar = theano.shared(
            np.ones((batch_size, n_channels, dim_h, dim_w), dtype='float32'))
        random_x_logvar = theano.shared(
            np.ones((n_batch, n_channels, dim_h, dim_w), dtype='float32'))

    gene_layer.append(NoParamsGaussianVisiable.NoParamsGaussianVisiable())
    # Results are unused here; the calls are kept so graph construction
    # matches the original statement for statement.
    logpx = gene_layer[-1].logpx(mean=x_mean, logvar=x_logvar, data=input_x)
    random_x = gene_layer[-1].sample_x(
        rng_share=rng_share, mean=random_x_mean, logvar=random_x_logvar)

    params = []
    for g in gene_layer:
        params += g.params
    for r in recg_layer:
        params += r.params

    def make_extractor(data_x):
        """Compile a function mapping a minibatch index to its features."""
        return theano.function(
            inputs=[index],
            outputs=T.concatenate(activations, axis=1),
            givens={
                x: data_x[index * batch_size: (index + 1) * batch_size],
                drop: np.cast['int32'](0),
            }
        )

    train_activations = make_extractor(train_set_x)
    valid_activations = make_extractor(valid_set_x)
    test_activations = make_extractor(test_set_x)

    ##################
    # Pretrain MODEL #
    ##################
    model_epoch = env('model_epoch', 100, int)
    ctype = 'cva'
    if predir is None:
        # Previously a bare exit(): the run ended silently with status 0.
        sys.exit('c_6layer_svhn_features: predir is required to load a '
                 'pre-trained model')
    color.printBlue('... setting parameters')
    color.printBlue(predir)
    if model_epoch == -1:
        pre_train = np.load(predir+'best-model.npz')
    else:
        pre_train = np.load(predir+'model-'+str(model_epoch)+'.npz')
    pre_train = pre_train['model']
    if ctype == 'cva':
        for (para, pre) in zip(params, pre_train):
            para.set_value(pre)
    elif ctype == 'cmmva':
        # the last two saved arrays are not loaded into this graph
        for (para, pre) in zip(params, pre_train[:-2]):
            para.set_value(pre)
    else:
        sys.exit('c_6layer_svhn_features: unknown ctype %r' % (ctype,))

    ####################
    # EXTRACT FEATURES #
    ####################
    # Collect the per-batch arrays and stack once instead of re-copying the
    # growing matrix on every iteration.
    print('extract features: valid')
    valid_features = np.vstack(
        [np.asarray(valid_activations(i)) for i in range(n_valid_batches)])

    print('extract features: test')
    test_features = np.vstack(
        [np.asarray(test_activations(i)) for i in range(n_test_batches)])

    with open(logdir+"svhn_features.bin", "wb") as f:
        np.save(f, valid_features)
        np.save(f, test_features)

    # Training features are streamed batch by batch rather than stacked.
    print('extract features: train')
    with open(logdir+"svhn_train_features.bin", "wb") as f:
        for i in range(n_train_batches):
            np.save(f, np.asarray(train_activations(i)))
Exemplo n.º 10
0
def deep_cnn_6layer_svhn_final_svm(learning_rate=0.01,
            n_epochs=500,
            dataset='svhngcn_var',
            batch_size=500,
            dropout_flag=1,
            seed=0,
            predir=None,
            preepoch=10,
            activation=None,
            weight_decay=1e-4):
    
    '''
    svhn
    '''
    n_channels = 3
    dim_w = 32
    dim_h = 32
    n_classes = 10
    
    epoch_threshold = 200
    if os.environ.has_key('epoch_threshold'):
        epoch_threshold = int(os.environ['epoch_threshold'])
    first_drop=0.6
    if os.environ.has_key('first_drop'):
        first_drop = float(os.environ['first_drop'])
    last_drop=1
    if os.environ.has_key('last_drop'):
        last_drop = float(os.environ['last_drop'])
    nkerns_1=96
    if os.environ.has_key('nkerns_1'):
        nkerns_1 = int(os.environ['nkerns_1'])
    nkerns_2=96
    if os.environ.has_key('nkerns_2'):
        nkerns_2 = int(os.environ['nkerns_2'])
    opt_med='adam'
    if os.environ.has_key('opt_med'):
        opt_med = os.environ['opt_med']
    std = 2e-2
    if os.environ.has_key('std'):
        std = os.environ['std']
    pattern = 'hinge'
    if os.environ.has_key('pattern'):
        pattern = os.environ['pattern']
    Loss_L = 1
    if os.environ.has_key('Loss_L'):
        Loss_L = float(os.environ['Loss_L'])

    #cp->cd->cpd->cd->c
    nkerns=[nkerns_1, nkerns_1, nkerns_1, nkerns_2, nkerns_2]
    drops=[0, 1, 1, 1, 0, 1]
    drop_p=[1, first_drop, first_drop, first_drop, 1, last_drop]
    #skerns=[5, 3, 3, 3, 3]
    #pools=[2, 1, 2, 1, 1]
    #modes=['same']*5

    
    logdir = 'results/supervised/cnn/svhn/deep_cnn_6layer_'+pattern+'_'+dataset+str(nkerns)+str(drops)+'_'+str(weight_decay)+'_'+str(learning_rate)+'_'+str(std)+'_'+str(Loss_L)+'_'+str(int(time.time()))+'/'
    if dropout_flag==1:
        logdir = 'results/supervised/cnn/svhn/deep_cnn_6layer_'+pattern+'_'+dataset+str(drop_p)+str(nkerns)+str(drops)+'_'+str(weight_decay)+'_'+str(learning_rate)+'_'+str(std)+'_'+str(Loss_L)+'_dropout_'+str(int(time.time()))+'/'
    if not os.path.exists(logdir): os.makedirs(logdir)
    print 'logdir:', logdir
    print 'deep_cnn_6layer_svm', nkerns, drops, drop_p, seed, dropout_flag
    print 'epoch_threshold', epoch_threshold, 'opt_med', opt_med
    with open(logdir+'hook.txt', 'a') as f:
        print >>f, 'logdir:', logdir
        print >>f, 'epoch_threshold', epoch_threshold, 'opt_med', opt_med
        print >>f, 'deep_cnn_6layer_svm', nkerns, drops, drop_p, seed, dropout_flag

    rng = np.random.RandomState(0)
    rng_share = theano.tensor.shared_randomstreams.RandomStreams(0)

    color.printRed('dataset '+dataset)

    datasets = datapy.load_data_svhn(dataset, have_matrix=True)
    train_set_x, train_set_y, train_y_matrix = datasets[0]
    test_set_x, test_set_y, test_y_matrix = datasets[1]
    valid_set_x, valid_set_y, valid_y_matrix = datasets[2]
    #datasets = datapy.load_data_svhn(dataset, have_matrix=False)
    #train_set_x, train_set_y = datasets[0]
    #test_set_x, test_set_y = datasets[1]
    #valid_set_x, valid_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_train_batches /= batch_size
    n_valid_batches /= batch_size
    n_test_batches /= batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    # start-snippet-1
    x = T.matrix('x')   # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
                        # [int] labels
    '''
    dropout
    '''
    drop = T.iscalar('drop')

    y_matrix = T.imatrix('y_matrix') # labels, presented as 2D matrix of int labels 

    print '... building the model'

    layer0_input = x.reshape((batch_size, n_channels, dim_h, dim_w))
    
    if activation =='nonlinearity.relu':
        activation = nonlinearity.relu
    elif activation =='nonlinearity.tanh':
        activation = nonlinearity.tanh
    elif activation =='nonlinearity.softplus':
        activation = nonlinearity.softplus
    
    recg_layer = []
    cnn_output = []
    l = []
    d = []

    #1
    recg_layer.append(ConvMaxPool_GauInit_DNN.ConvMaxPool_GauInit_DNN(
        rng,
        image_shape=(batch_size, n_channels, dim_h, dim_w),
        filter_shape=(nkerns[0], n_channels, 5, 5),
        poolsize=(2, 2),
        border_mode='same', 
        activation=activation,
        std=std
    ))
    if drops[0]==1:
        cnn_output.append(recg_layer[-1].drop_output(layer0_input, drop=drop, rng=rng_share, p=drop_p[0]))
    else:
        cnn_output.append(recg_layer[-1].output(layer0_input))
    l+=[1, 2]
    d+=[1, 0]

    #2
    recg_layer.append(ConvMaxPool_GauInit_DNN.ConvMaxPool_GauInit_DNN(
        rng,
        image_shape=(batch_size, nkerns[0], 16, 16),
        filter_shape=(nkerns[1], nkerns[0], 3, 3),
        poolsize=(1, 1),
        border_mode='same', 
        activation=activation,
        std=std
    ))
    if drops[1]==1:
        cnn_output.append(recg_layer[-1].drop_output(cnn_output[-1], drop=drop, rng=rng_share, p=drop_p[1]))
    else:
        cnn_output.append(recg_layer[-1].output(cnn_output[-1]))
    l+=[1, 2]
    d+=[1, 0]

    #3
    recg_layer.append(ConvMaxPool_GauInit_DNN.ConvMaxPool_GauInit_DNN(
        rng,
        image_shape=(batch_size, nkerns[1], 16, 16),
        filter_shape=(nkerns[2], nkerns[1], 3, 3),
        poolsize=(2, 2),
        border_mode='same', 
        activation=activation,
        std=std
    ))
    if drops[2]==1:
        cnn_output.append(recg_layer[-1].drop_output(cnn_output[-1], drop=drop, rng=rng_share, p=drop_p[2]))
    else:
        cnn_output.append(recg_layer[-1].output(cnn_output[-1]))
    l+=[1, 2]
    d+=[1, 0]

    #4
    recg_layer.append(ConvMaxPool_GauInit_DNN.ConvMaxPool_GauInit_DNN(
        rng,
        image_shape=(batch_size, nkerns[2], 8, 8),
        filter_shape=(nkerns[3], nkerns[2], 3, 3),
        poolsize=(1, 1),
        border_mode='same', 
        activation=activation,
        std=std
    ))
    if drops[3]==1:
        cnn_output.append(recg_layer[-1].drop_output(cnn_output[-1], drop=drop, rng=rng_share, p=drop_p[3]))
    else:
        cnn_output.append(recg_layer[-1].output(cnn_output[-1]))
    
    l+=[1, 2]
    d+=[1, 0]

    #5
    recg_layer.append(ConvMaxPool_GauInit_DNN.ConvMaxPool_GauInit_DNN(
        rng,
        image_shape=(batch_size, nkerns[3], 8, 8),
        filter_shape=(nkerns[4], nkerns[3], 3, 3),
        poolsize=(2, 2),
        border_mode='same', 
        activation=activation,
        std=std
    ))
    if drops[4]==1:
        cnn_output.append(recg_layer[-1].drop_output(cnn_output[-1], drop=drop, rng=rng_share, p=drop_p[4]))
    else:
        cnn_output.append(recg_layer[-1].output(cnn_output[-1]))
    l+=[1, 2]
    d+=[1, 0]

    feature = cnn_output[-1].flatten(2)

    # classify the values of the fully-connected sigmoidal layer
    
    '''
    large weight of pegasos to avoid gradient disappeared 
    '''
    std_pegasos=std
    weight_decay_pegasos=weight_decay
    classifier = Pegasos.Pegasos(input=feature, rng=rng, n_in=nkerns[-1]*4*4, n_out=n_classes, weight_decay=0, loss=Loss_L, std=std_pegasos, pattern=pattern)
    #classifier = LogisticRegression.LogisticRegression(
    #        input=feature,
    #        n_in=nkerns[-1],
    #        n_out=n_classes
    #    )

    l+=[1, 2]
    d+=[weight_decay_pegasos / weight_decay, 0]
    # the cost we minimize during training is the NLL of the model
    cost = classifier.hinge_loss(n_classes, y, y_matrix)
    #cost = classifier.negative_log_likelihood(y)

    params=[]
    for r in recg_layer:
        params+=r.params
    params += classifier.params
    
    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)
    l_r = theano.shared(np.asarray(learning_rate, dtype=np.float32))
    if opt_med=='adam':
        get_optimizer = optimizer_separated.get_adam_optimizer_min(learning_rate=l_r, decay1 = 0.1, decay2 = 0.001, weight_decay=weight_decay)
    elif opt_med=='mom':
        get_optimizer = optimizer_separated.get_momentum_optimizer_min(learning_rate=l_r, weight_decay=weight_decay)
    updates = get_optimizer(w=params,g=grads, l=l, d=d)
    
    pog = []
    for (p,g) in zip(params, grads):
        pog.append(p.max())
        pog.append((p**2).mean())
        pog.append((g**2).mean())
        pog.append((T.sqrt(pog[-2] / pog[-1]))/ 1e3)

    paramovergrad = theano.function(
        inputs=[index],
        outputs=pog,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size],
            y_matrix: train_y_matrix[index * batch_size: (index + 1) * batch_size],
            drop: np.cast['int32'](dropout_flag)
        }
    )

    parameters = theano.function(
        inputs=[],
        outputs=params,
    )

    # create a function to compute the mistakes that are made by the model
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size: (index + 1) * batch_size],
            y: test_set_y[index * batch_size: (index + 1) * batch_size],
            drop: np.cast['int32'](0)
        }
    )

    valid_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size: (index + 1) * batch_size],
            y: valid_set_y[index * batch_size: (index + 1) * batch_size],
            drop: np.cast['int32'](0)
        }
    )

    train_activations = theano.function(
        inputs=[index],
        outputs=feature,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            drop: np.cast['int32'](0)
        }
    )
    
    test_activations = theano.function(
        inputs=[index],
        outputs=feature,
        givens={
            x: test_set_x[index * batch_size: (index + 1) * batch_size],
            drop: np.cast['int32'](0)
        }
    )

    train_model = theano.function(
        inputs=[index],
        outputs=[cost, classifier.errors(y)],
        updates=updates,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size],
            y_matrix: train_y_matrix[index * batch_size: (index + 1) * batch_size],
            drop: np.cast['int32'](dropout_flag)
        }
    )


    if predir is not None:
        color.printBlue('... setting parameters')
        color.printBlue(predir)
        pre_train = np.load(predir+'svhn_model-'+str(preepoch)+'.npz')
        pre_train = pre_train['model']
        for (para, pre) in zip(params, pre_train):
            para.set_value(pre)
        this_test_losses = [test_model(i) for i in xrange(n_test_batches)]
        this_test_score = np.mean(this_test_losses)
        #print predir
        print 'preepoch', preepoch, 'prescore', this_test_score
        with open(logdir+'hook.txt', 'a') as f:
            print >>f, predir
            print >>f, 'preepoch', preepoch, 'prescore', this_test_score

        

    print '... training'
    validation_frequency = n_train_batches/10
    best_train_loss = 10000.0
    best_valid_score = 10000.0
    best_epoch = 0
    test_score = 0
    start_time = time.clock()
    epoch = 0
    n_epochs = 100
    test_epochs = 40
    record = 0
    

    '''
    pog = [paramovergrad(i) for i in xrange(n_train_batches)]
    pog = np.mean(pog, axis=0)
    #print 'before train ----------pog', pog
    with open(logdir+'hook.txt', 'a') as f:
        print >>f, 'before train ----------pog', pog
    '''
    
    while (epoch < n_epochs):
        epoch = epoch + 1
        tmp1 = time.clock()
        preW = None
        currentW = None
        minibatch_avg_cost = 0
        train_error = 0
        if (epoch - record) >= 7:
            record = epoch
            l_r.set_value(np.cast['float32'](l_r.get_value()/3.0))
            print '---------', epoch, l_r.get_value()
            with open(logdir+'hook.txt', 'a') as f:
                print >>f,'---------', epoch, l_r.get_value()
        '''
        decay_epoch = epoch - test_epochs
        if decay_epoch > 0 and decay_epoch % 30==0:
            l_r.set_value(np.cast['float32'](l_r.get_value()/3.0))
            print '---------', epoch, l_r.get_value()
            with open(logdir+'hook.txt', 'a') as f:
                print >>f,'---------', epoch, l_r.get_value()
        '''

        if epoch%5==0:   
            ''' 
            for i in xrange(n_train_batches):
                if i == 0:
                    train_features = np.asarray(train_activations(i))
                else:
                    train_features = np.vstack((train_features, np.asarray(train_activations(i))))
            for i in xrange(n_test_batches):
                if i == 0:
                    test_features = np.asarray(test_activations(i))
                else:
                    test_features = np.vstack((test_features, np.asarray(test_activations(i))))
            
            np.save(logdir+'train_features-'+str(epoch), train_features)
            np.save(logdir+'test_features-'+str(epoch), test_features)
            '''
            model = parameters()
            for i in xrange(len(model)):
                model[i] = np.asarray(model[i]).astype(np.float32)

            np.savez(logdir+'svhn_model-'+str(epoch), model=model)
            
        for minibatch_index in xrange(n_train_batches):
            
            if (minibatch_index <11):
                preW = currentW
                currentW = parameters()
                for i in xrange(len(currentW)):
                    currentW[i] = np.asarray(currentW[i]).astype(np.float32)

                if preW is not None:
                    for (c,p) in zip(currentW, preW):
                        #print minibatch_index, (c**2).mean(), ((c-p)**2).mean(), np.sqrt((c**2).mean()/((c-p)**2).mean())
                        with open(logdir+'delta_w.txt', 'a') as f:
                            print >>f,minibatch_index, (c**2).mean(), ((c-p)**2).mean(), np.sqrt((c**2).mean()/((c-p)**2).mean())
                    
            co, te = train_model(minibatch_index)
            minibatch_avg_cost+=co
            train_error+=te
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                print epoch, minibatch_index
                with open(logdir+'hook.txt', 'a') as f:
                    print >>f, epoch, minibatch_index
                print 'Stochastic hinge loss and training error', minibatch_avg_cost / float(minibatch_index), train_error / float(minibatch_index)
                #print 'time', time.clock() - tmp1
                with open(logdir+'hook.txt', 'a') as f:
                #    print >>f, 'pog', pog
                    print >>f,'Stochastic hinge loss and training error', minibatch_avg_cost / float(minibatch_index), train_error / float(minibatch_index)
                    #print >>f,'time', time.clock() - tmp1

                this_valid_losses = [valid_model(i) for i in xrange(n_valid_batches)]
                this_valid_score = np.mean(this_valid_losses)

                print(
                    'epoch %i, minibatch %i/%i, valid error %f %%' %
                    (
                        epoch,
                        minibatch_index + 1,
                        n_train_batches,
                        #this_validation_loss * 100,
                        this_valid_score *100.
                    )
                )
                with open(logdir+'hook.txt', 'a') as f:
                    print >>f, (
                        'epoch %i, minibatch %i/%i, valid error %f %%' %
                        (
                            epoch,
                            minibatch_index + 1,
                            n_train_batches,
                            #this_validation_loss * 100,
                            this_valid_score *100.
                        )
                    )
                if this_valid_score < best_valid_score:
                    this_test_losses = [test_model(i) for i in xrange(n_test_batches)]
                    this_test_score = np.mean(this_test_losses)
                    best_valid_score = this_valid_score
                    test_score = this_test_score
                    best_epoch = epoch
                    record = epoch
                    print 'Update best model', this_test_score
                    with open(logdir+'hook.txt', 'a') as f:
                        print >>f,'Update best model', this_test_score
                print 'So far best model', best_epoch, test_score
                with open(logdir+'hook.txt', 'a') as f:
                    print >>f, 'So far best model', best_epoch, test_score

        pogzero = np.asarray(paramovergrad(0))
        #print 'pogzero', pogzero
        with open(logdir+'pog.txt', 'a') as f:
            print >>f, 'pogzero', pogzero
            
        #pog = [paramovergrad(i) for i in xrange(n_train_batches)]
        #pog = np.mean(pog, axis=0)
        #print 'pog', pog

    print 'So far best model', test_score
    with open(logdir+'hook.txt', 'a') as f:
        print >>f, 'So far best model', test_score
        
    end_time = time.clock()
    print 'The code run for %d epochs, with %f epochs/sec' % (
        epoch, 1. * epoch / (end_time - start_time))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.1fs' % ((end_time - start_time)))