Example no. 1
# NumPy and Theano are required; helpers such as init_params, init_tparams,
# encoder and _p are assumed to come from the surrounding repository.
import numpy as np
import theano
from theano import tensor


def find_sent_embedding(n_words=21102, img_w=300, img_h=48, feature_maps=200,
                        filter_hs=[3, 4, 5], n_x=300, n_h=600):

    options = {}
    options['n_words'] = n_words
    options['img_w'] = img_w
    options['img_h'] = img_h
    options['feature_maps'] = feature_maps
    options['filter_hs'] = filter_hs
    options['n_x'] = n_x
    options['n_h'] = n_h
    
    filter_w = img_w
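    # one set of filters per window size; since filter_w == img_w, each pool
    # size works out to (img_h - filter_h + 1, 1)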
    filter_shapes = []
    pool_sizes = []
    for filter_h in filter_hs:
        filter_shapes.append((feature_maps, 1, filter_h, filter_w))
        pool_sizes.append((img_h-filter_h+1, img_w-filter_w+1))
        
    options['filter_shapes'] = filter_shapes
    options['pool_sizes'] = pool_sizes
    
    params = init_params(options)
    tparams = init_tparams(params)
    
    # load the parameters learned by train_model (Example no. 3) and copy them
    # into the Theano shared variables
    data = np.load('./bookcorpus_result.npz')

    for kk, pp in params.iteritems():
        params[kk] = data[kk]

    for kk, pp in params.iteritems():
        tparams[kk].set_value(params[kk])

    x = tensor.matrix('x', dtype='int32')
    
    # look up the word embeddings and reshape them to
    # (batch_size, 1, sentence_length, embedding_dim) for the CNN encoders
    layer0_input = tparams['Wemb'][tensor.cast(x.flatten(), dtype='int32')].reshape(
        (x.shape[0], 1, x.shape[1], tparams['Wemb'].shape[1]))
 
    # one CNN encoder per filter size; the pooled features are concatenated
    # into a single sentence embedding
    layer1_inputs = []
    for i in xrange(len(options['filter_hs'])):
        filter_shape = options['filter_shapes'][i]
        pool_size = options['pool_sizes'][i]
        conv_layer = encoder(tparams, layer0_input, filter_shape, pool_size,
                             prefix=_p('cnn_encoder', i))
        layer1_inputs.append(conv_layer)
    layer1_input = tensor.concatenate(layer1_inputs, 1)
                                 
    # compile a function mapping a padded index matrix to the concatenated
    # CNN features (len(filter_hs) * feature_maps = 600 dimensions)
    f_embed = theano.function([x], layer1_input, name='f_embed')
    
    return f_embed, params
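
A minimal usage sketch, not part of the original snippet: it assumes the repository's prepare_data_for_cnn helper (used in Example no. 2) is available and that sentences are already lists of word indices; the index values below are made up.

# usage sketch only -- the sentence indices are made-up placeholders
f_embed, params = find_sent_embedding()

sents = [[12, 7, 431, 9], [5, 88, 2]]  # hypothetical word-index sequences
x = prepare_data_for_cnn(sents)        # builds the padded int32 index matrix the model expects
emb = f_embed(x)                       # one 600-dim embedding per sentence
print(emb.shape)                       # (2, 600)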
Example no. 2
# Same imports as Example no. 1; additionally uses the repository helpers
# get_minibatches_idx and prepare_data_for_cnn. `whole` is the full corpus as
# a list of word-index sequences.
def find_sent_embedding(whole,
                        n_words=21102,
                        img_w=300,
                        img_h=48,
                        feature_maps=200,
                        filter_hs=[3, 4, 5],
                        n_x=300,
                        n_h=600):

    options = {}
    options['n_words'] = n_words
    options['img_w'] = img_w
    options['img_h'] = img_h
    options['feature_maps'] = feature_maps
    options['filter_hs'] = filter_hs
    options['n_x'] = n_x
    options['n_h'] = n_h

    filter_w = img_w
    filter_shapes = []
    pool_sizes = []
    for filter_h in filter_hs:
        filter_shapes.append((feature_maps, 1, filter_h, filter_w))
        pool_sizes.append((img_h - filter_h + 1, img_w - filter_w + 1))

    options['filter_shapes'] = filter_shapes
    options['pool_sizes'] = pool_sizes

    params = init_params(options)
    tparams = init_tparams(params)

    data = np.load('./bookcorpus_result.npz')

    for kk, pp in params.iteritems():
        params[kk] = data[kk]

    for kk, pp in params.iteritems():
        tparams[kk].set_value(params[kk])

    x = tensor.matrix('x', dtype='int32')

    layer0_input = tparams['Wemb'][tensor.cast(x.flatten(),
                                               dtype='int32')].reshape(
                                                   (x.shape[0], 1, x.shape[1],
                                                    tparams['Wemb'].shape[1]))

    layer1_inputs = []
    for i in xrange(len(options['filter_hs'])):
        filter_shape = options['filter_shapes'][i]
        pool_size = options['pool_sizes'][i]
        conv_layer = encoder(tparams,
                             layer0_input,
                             filter_shape,
                             pool_size,
                             prefix=_p('cnn_encoder', i))
        layer1_input = conv_layer
        layer1_inputs.append(layer1_input)
    layer1_input = tensor.concatenate(layer1_inputs, 1)

    f_embed = theano.function([x], layer1_input, name='f_embed')

    # embed the whole corpus in minibatches of 100; the slice assignment below
    # assumes the minibatch indices are contiguous (no shuffling)
    kf = get_minibatches_idx(len(whole), 100)
    sent_emb = np.zeros((len(whole), 600))

    for i, train_index in kf:
        sents = [whole[t] for t in train_index]
        x = prepare_data_for_cnn(sents)
        sent_emb[train_index[0]:train_index[-1] + 1] = f_embed(x)
        if i % 500 == 0:
            print i,

    np.savez('./bookcorpus_embedding.npz', sent_emb=sent_emb)

    return sent_emb
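
A hedged usage sketch: load_bookcorpus_sentences below is a hypothetical placeholder for whatever code builds the corpus; only the call itself and the output shape follow from the function above.

# hypothetical placeholder: returns the corpus as a list of word-index sequences
whole = load_bookcorpus_sentences()

sent_emb = find_sent_embedding(whole)  # also writes ./bookcorpus_embedding.npz
print(sent_emb.shape)                  # (len(whole), 600)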
Example no. 3
# Requires the same imports and repository helpers as the previous examples,
# plus build_model, Adam, prepare_data_for_rnn, calu_cost, unzip,
# get_minibatches_idx, the standard-library time module and a configured
# `logger`.
def train_model(train, val, test, n_words=21103, img_w=300, max_len=40,
                feature_maps=200, filter_hs=[3, 4, 5], n_x=300, n_h=600,
                max_epochs=8, lrate=0.0002, batch_size=64, valid_batch_size=64,
                dispFreq=10, validFreq=500, saveFreq=1000,
                saveto='bookcorpus_result.npz'):
        
    """ train, valid, test : datasets
        n_words : vocabulary size
        img_w : word embedding dimension, must be 300.
        max_len : the maximum length of a sentence 
        feature_maps : the number of feature maps we used 
        filter_hs: the filter window sizes we used
        n_x: word embedding dimension
        n_h: the number of hidden units in LSTM        
        max_epochs : the maximum number of epoch to run
        lrate : learning rate
        batch_size : batch size during training
        valid_batch_size : The batch size used for validation/test set
        dispFreq : Display to stdout the training progress every N updates
        validFreq : Compute the validation error after this number of update.
        saveFreq: save the result after this number of update.
        saveto: where to save the result.
    """
    
    # CNN input height: max_len plus (filter_hs[-1] - 1) padding positions on
    # each side (40 + 2*4 = 48 with the defaults)
    img_h = max_len + 2*(filter_hs[-1]-1)
    
    options = {}
    options['n_words'] = n_words
    options['img_w'] = img_w
    options['img_h'] = img_h
    options['feature_maps'] = feature_maps
    options['filter_hs'] = filter_hs
    options['n_x'] = n_x
    options['n_h'] = n_h
    options['max_epochs'] = max_epochs
    options['lrate'] = lrate
    options['batch_size'] = batch_size
    options['valid_batch_size'] = valid_batch_size
    options['dispFreq'] = dispFreq
    options['validFreq'] = validFreq
    options['saveFreq'] = saveFreq
   
    logger.info('Model options {}'.format(options))

    logger.info('Building model...')
    
    filter_w = img_w
    filter_shapes = []
    pool_sizes = []
    for filter_h in filter_hs:
        filter_shapes.append((feature_maps, 1, filter_h, filter_w))
        pool_sizes.append((img_h-filter_h+1, img_w-filter_w+1))
        
    options['filter_shapes'] = filter_shapes
    options['pool_sizes'] = pool_sizes
    
    params = init_params(options)
    tparams = init_tparams(params)

    # build_model returns the use_noise switch, the symbolic inputs/targets and
    # the training cost; Adam compiles the shared-gradient and update functions
    use_noise, x, y, y_mask, cost = build_model(tparams, options)

    f_cost = theano.function([x, y, y_mask], cost, name='f_cost')

    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = Adam(tparams, cost, [x, y, y_mask], lr)
    
    logger.info('Training model...')
    
    history_cost = []  
    uidx = 0  # the number of updates performed
    start_time = time.time()
    
    kf_valid = get_minibatches_idx(len(val), valid_batch_size)
    
    # row 21102 of Wemb is the special <pad_zero> token (the last row for the
    # default n_words=21103); set_zero resets it to zeros after every update
    zero_vec_tensor = tensor.vector()
    zero_vec = np.zeros(img_w).astype(theano.config.floatX)
    set_zero = theano.function([zero_vec_tensor],
                               updates=[(tparams['Wemb'],
                                         tensor.set_subtensor(tparams['Wemb'][21102, :],
                                                              zero_vec_tensor))])
    
    try:
        for eidx in xrange(max_epochs):
            n_samples = 0
            
            kf = get_minibatches_idx(len(train), batch_size, shuffle=True)

            for _, train_index in kf:
                uidx += 1
                use_noise.set_value(0.)

                sents = [train[t] for t in train_index]
                
                x = prepare_data_for_cnn(sents)
                y, y_mask = prepare_data_for_rnn(sents)
                n_samples += y.shape[1]

                cost = f_grad_shared(x, y, y_mask)
                f_update(lrate)
                # the special <pad_zero> token does not need to update.
                set_zero(zero_vec)

                if np.isnan(cost) or np.isinf(cost):
                    
                    logger.info('NaN detected')
                    return 1., 1., 1.

                if np.mod(uidx, dispFreq) == 0:
                    logger.info('Epoch {} Update {} Cost {}'.format(eidx, uidx, np.exp(cost)))
                
                if np.mod(uidx, saveFreq) == 0:
                    
                    logger.info('Saving ...')
                    
                    params = unzip(tparams)
                    np.savez(saveto, history_cost=history_cost, **params)
                    
                    logger.info('Done ...')

                if np.mod(uidx, validFreq) == 0:
                    use_noise.set_value(0.)
                    
                    valid_cost = calu_cost(f_cost, prepare_data_for_cnn, prepare_data_for_rnn, val, kf_valid)
                    history_cost.append([valid_cost])
                        
                    logger.info('Valid {}'.format(np.exp(valid_cost)))

        logger.info('Seen {} samples'.format(n_samples))

    except KeyboardInterrupt:
        logger.info('Training interrupted')

    end_time = time.time()
    
#    if best_p is not None:
#        zipp(best_p, tparams)
#    else:
#        best_p = unzip(tparams)
    
    
    use_noise.set_value(0.)
    valid_cost = calu_cost(f_cost, prepare_data_for_cnn, prepare_data_for_rnn, val, kf_valid)
    logger.info('Valid {}'.format(np.exp(valid_cost)))
    
    params = unzip(tparams)
    np.savez(saveto, history_cost=history_cost, **params)

    
    logger.info('The code ran for {} epochs, at {} sec/epoch'.format(
        eidx + 1, (end_time - start_time) / (1. * (eidx + 1))))
    
    
    return valid_cost
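
A hedged sketch of driving train_model: load_bookcorpus_splits is a hypothetical placeholder; the keyword arguments and the saved file name come from the signature above.

# hypothetical placeholder: returns train/val/test as lists of word-index sequences
train, val, test = load_bookcorpus_splits()

valid_cost = train_model(train, val, test,
                         max_epochs=8,
                         batch_size=64,
                         saveto='bookcorpus_result.npz')
# the saved bookcorpus_result.npz is what Examples no. 1 and no. 2 load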