Пример #1
0
def build_typecosine_ds(trnMentions, devMentions, tstMentions, t2idx, hdf5_file, vectorfile, upto=-1):
    (embeddings, voc2idx, vectorsize) = read_embeddings(vectorfile, upto)
    totals = len(trnMentions) + len(devMentions) + len(tstMentions) 
    input_entvec = numpy.zeros(shape=(totals, vectorsize), dtype='float32')
    for i, men in enumerate(trnMentions + devMentions + tstMentions):
        mye = men.entityId
        entvec = numpy.zeros(vectorsize)
        if mye in voc2idx:
            entvec = embeddings[voc2idx[mye]]
        input_entvec[i] = entvec
    typevecmatrix = buildtypevecmatrix(t2idx, embeddings, vectorsize, voc2idx) # a matrix with size: 102 * dim
    ent_types_cosin_matrix = buildcosinematrix(input_entvec, typevecmatrix)
    logger.info(ent_types_cosin_matrix.shape)
    
    hdf5_file += '_tc.h5py'
    f = h5py.File(hdf5_file, mode='w')
    features = f.create_dataset('tc', ent_types_cosin_matrix.shape, dtype='float32')  # @UndefinedVariable
    features.attrs['vectorsize'] = ent_types_cosin_matrix.shape[1]
    features[...] = ent_types_cosin_matrix
    features.dims[0].label = 'types_ent_cosine'
    nsamples_train = len(trnMentions); nsamples_dev = len(devMentions);
    split_dict = {
        'train': {'tc': (0, nsamples_train)},
        'dev': {'tc': (nsamples_train, nsamples_train + nsamples_dev)}, 
        'test': {'tc': (nsamples_train + nsamples_dev, totals)}}    
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
    logger.info('Building types-ent cosine (tc) dataset finished. It saved in: %s', hdf5_file)
Пример #2
0
def build_typecosine_ds(trnMentions,
                        devMentions,
                        tstMentions,
                        t2idx,
                        hdf5_file,
                        vectorfile,
                        upto=-1):
    (embeddings, voc2idx, vectorsize) = read_embeddings(vectorfile, upto)
    totals = len(trnMentions) + len(devMentions) + len(tstMentions)
    input_entvec = numpy.zeros(shape=(totals, vectorsize), dtype='float32')
    for i, men in enumerate(trnMentions + devMentions + tstMentions):
        mye = men.entityId
        entvec = numpy.zeros(vectorsize)
        if mye in voc2idx:
            entvec = embeddings[voc2idx[mye]]
        input_entvec[i] = entvec
    typevecmatrix = buildtypevecmatrix(
        t2idx, embeddings, vectorsize,
        voc2idx)  # a matrix with size: 102 * dim
    ent_types_cosin_matrix = buildcosinematrix(input_entvec, typevecmatrix)
    logger.info(ent_types_cosin_matrix.shape)

    hdf5_file += '_tc.h5py'
    f = h5py.File(hdf5_file, mode='w')
    features = f.create_dataset('tc',
                                ent_types_cosin_matrix.shape,
                                dtype='float32')  # @UndefinedVariable
    features.attrs['vectorsize'] = ent_types_cosin_matrix.shape[1]
    features[...] = ent_types_cosin_matrix
    features.dims[0].label = 'types_ent_cosine'
    nsamples_train = len(trnMentions)
    nsamples_dev = len(devMentions)
    split_dict = {
        'train': {
            'tc': (0, nsamples_train)
        },
        'dev': {
            'tc': (nsamples_train, nsamples_train + nsamples_dev)
        },
        'test': {
            'tc': (nsamples_train + nsamples_dev, totals)
        }
    }
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
    logger.info(
        'Building types-ent cosine (tc) dataset finished. It saved in: %s',
        hdf5_file)
Пример #3
0
def build_type_words_cosine_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir, vectorfile, upto=-1, max_num_words=4):
    word_to_idx, idx_to_word = build_word_vocab(trnMentions+devMentions+tstMentions) #train for characters because we only use entities names for characters
    logger.info('word vocab size: %d', len(word_to_idx))
    totals = len(trnMentions) + len(devMentions) + len(tstMentions) 
    
    idx2embeddings, vectorsize = read_embeddings_vocab(vectorfile, vocab=word_to_idx, num=upto)
    
    input_avg = numpy.zeros(shape=(totals, vectorsize), dtype='float32')
    for i, men in enumerate(trnMentions + devMentions + tstMentions):
        name = men.name
        words = name.split()
        seq_words = get_ngram_seq(word_to_idx, words, max_len=max_num_words)
        avgvec = numpy.zeros(shape=(vectorsize))
        for ii in seq_words:
            avgvec += idx2embeddings[ii]
        avgvec /= len(seq_words)
        input_avg[i] = avgvec
    
    (embeddings, voc2idx, vectorsize) = read_embeddings(vectorfile, upto)
    typevecmatrix = buildtypevecmatrix(t2idx, embeddings, vectorsize, voc2idx) # a matrix with size: 102 * dim
    words_types_cosin_matrix = buildcosinematrix(input_avg, typevecmatrix)
    logger.info(words_types_cosin_matrix.shape)
     
    dsdir += '_tcwords.h5py'
    f = h5py.File(dsdir, mode='w')
    features = f.create_dataset('tcwords', words_types_cosin_matrix.shape, dtype='float32')  # @UndefinedVariable
    features.attrs['vectorsize'] = words_types_cosin_matrix.shape[1]
    features[...] = words_types_cosin_matrix
    features.dims[0].label = 'words_types_cosine'
    nsamples_train = len(trnMentions); nsamples_dev = len(devMentions);
    split_dict = {
        'train': {'tcwords': (0, nsamples_train)},
        'dev': {'tcwords': (nsamples_train, nsamples_train + nsamples_dev)}, 
        'test': {'tcwords': (nsamples_train + nsamples_dev, totals)}}    
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
    logger.info('Building types-words cosine (tcwords) dataset finished. It saved in: %s', dsdir)
Пример #4
0
outf=sys.argv[3]
use_tanh_out = False
outputtype = config['outtype'] #hinge or softmax
usetypecosine = False
if 'typecosine' in config:
    usetypecosine = utils.str_to_bool(config['typecosine'])

(t2ind, n_targets, wordvectors, vectorsize, typefreq_traindev) = utils.loadTypesAndVectors(targetTypesFile, vectorFile)
(rvt, input_matrix_test, iet,resvectstnall, ntrn) = utils.fillOnlyEntityData(testfile,vectorsize, wordvectors, t2ind, n_targets, upto=-1, ds='test', binoutvec=True)

# train network
rng = numpy.random.RandomState(23455)
if usetypecosine:
    print 'using cosine(e,t) as another input feature'
    typevecmatrix = utils.buildtypevecmatrix(t2ind, wordvectors, vectorsize) # a matrix with size: 102 * dim 
    e2simmatrix_test = utils.buildcosinematrix(input_matrix_test, typevecmatrix)
    input_matrix_test = utils.extend_in_matrix(input_matrix_test, e2simmatrix_test)

dt = theano.config.floatX  # @UndefinedVariable

index = T.lscalar()  # index to a [mini]batch
x = T.matrix('x')  # the data is presented as rasterized images
y = T.imatrix('y')  # the labels are presented as 1D vector of
                        # [int] labels
######################
# BUILD ACTUAL MODEL #
######################
print '... building the model'
rng = numpy.random.RandomState(23455)
layer1 = layers.HiddenLayer(rng, input=x, n_in=input_matrix_test.shape[1],n_out=num_of_hidden_units, activation=T.tanh)
Пример #5
0
def build_type_words_cosine_ds(trnMentions,
                               devMentions,
                               tstMentions,
                               t2idx,
                               dsdir,
                               vectorfile,
                               upto=-1,
                               max_num_words=4):
    word_to_idx, idx_to_word = build_word_vocab(
        trnMentions + devMentions + tstMentions
    )  #train for characters because we only use entities names for characters
    logger.info('word vocab size: %d', len(word_to_idx))
    totals = len(trnMentions) + len(devMentions) + len(tstMentions)

    idx2embeddings, vectorsize = read_embeddings_vocab(vectorfile,
                                                       vocab=word_to_idx,
                                                       num=upto)

    input_avg = numpy.zeros(shape=(totals, vectorsize), dtype='float32')
    for i, men in enumerate(trnMentions + devMentions + tstMentions):
        name = men.name
        words = name.split()
        seq_words = get_ngram_seq(word_to_idx, words, max_len=max_num_words)
        avgvec = numpy.zeros(shape=(vectorsize))
        for ii in seq_words:
            avgvec += idx2embeddings[ii]
        avgvec /= len(seq_words)
        input_avg[i] = avgvec

    (embeddings, voc2idx, vectorsize) = read_embeddings(vectorfile, upto)
    typevecmatrix = buildtypevecmatrix(
        t2idx, embeddings, vectorsize,
        voc2idx)  # a matrix with size: 102 * dim
    words_types_cosin_matrix = buildcosinematrix(input_avg, typevecmatrix)
    logger.info(words_types_cosin_matrix.shape)

    dsdir += '_tcwords.h5py'
    f = h5py.File(dsdir, mode='w')
    features = f.create_dataset('tcwords',
                                words_types_cosin_matrix.shape,
                                dtype='float32')  # @UndefinedVariable
    features.attrs['vectorsize'] = words_types_cosin_matrix.shape[1]
    features[...] = words_types_cosin_matrix
    features.dims[0].label = 'words_types_cosine'
    nsamples_train = len(trnMentions)
    nsamples_dev = len(devMentions)
    split_dict = {
        'train': {
            'tcwords': (0, nsamples_train)
        },
        'dev': {
            'tcwords': (nsamples_train, nsamples_train + nsamples_dev)
        },
        'test': {
            'tcwords': (nsamples_train + nsamples_dev, totals)
        }
    }
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
    logger.info(
        'Building types-words cosine (tcwords) dataset finished. It saved in: %s',
        dsdir)
Пример #6
0
 ntrn) = utils.fillOnlyEntityData(testfile,
                                  vectorsize,
                                  wordvectors,
                                  t2ind,
                                  n_targets,
                                  upto=-1,
                                  ds='test',
                                  binoutvec=True)

# train network
rng = numpy.random.RandomState(23455)
if usetypecosine:
    print 'using cosine(e,t) as another input feature'
    typevecmatrix = utils.buildtypevecmatrix(
        t2ind, wordvectors, vectorsize)  # a matrix with size: 102 * dim
    e2simmatrix_test = utils.buildcosinematrix(input_matrix_test,
                                               typevecmatrix)
    input_matrix_test = utils.extend_in_matrix(input_matrix_test,
                                               e2simmatrix_test)

dt = theano.config.floatX  # @UndefinedVariable

index = T.lscalar()  # index to a [mini]batch
x = T.matrix('x')  # the data is presented as rasterized images
y = T.imatrix('y')  # the labels are presented as 1D vector of
# [int] labels
######################
# BUILD ACTUAL MODEL #
######################
print '... building the model'
rng = numpy.random.RandomState(23455)
layer1 = layers.HiddenLayer(rng,
Пример #7
0
if 'typecosine' in config:
    usetypecosine = utils.str_to_bool(config['typecosine'])
    
upto = -1
(t2ind, n_targets, wordvectors, vectorsize, typefreq_traindev) = utils.loadTypesAndVectors(targetTypesFile, vectorFile, upto=upto)

(rvt, input_matrix_train, iet,resvectrnall, ntrn) = utils.fillOnlyEntityData(trainfile,vectorsize, wordvectors, t2ind, n_targets, upto=upto, binoutvec=True)
print "number of training examples:" + str(len(iet))

(rvd, input_matrix_dev, ied,resvecdevall, ntdev) = utils.fillOnlyEntityData(devfile,vectorsize, wordvectors, t2ind, n_targets, upto=upto, binoutvec=True)
print "number of validation examples:" +  str(len(ied))

if usetypecosine:
    print 'using cosine(e,t) as another input feature'
    typevecmatrix = utils.buildtypevecmatrix(t2ind, wordvectors, vectorsize) # a matrix with size: 102 * dim 
    e2simmatrix_train = utils.buildcosinematrix(input_matrix_train, typevecmatrix)
    e2simmatrix_dev = utils.buildcosinematrix(input_matrix_dev, typevecmatrix)
    input_matrix_train = utils.extend_in_matrix(input_matrix_train, e2simmatrix_train)
    input_matrix_dev = utils.extend_in_matrix(input_matrix_dev, e2simmatrix_dev)

rng = numpy.random.RandomState(23455)

dt = theano.config.floatX  # @UndefinedVariable
train_set_x = theano.shared(numpy.matrix(input_matrix_train, dtype=dt))  # @UndefinedVariable
valid_set_x = theano.shared(numpy.matrix(input_matrix_dev, dtype=dt))
train_set_y = theano.shared(numpy.matrix(resvectrnall, dtype=numpy.dtype(numpy.int32)))
valid_set_y = theano.shared(numpy.matrix(resvecdevall, dtype=numpy.dtype(numpy.int32)))

n_train_batches = train_set_x.get_value(borrow=True).shape[0]
n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
n_train_batches /= batch_size