def build_typecosine_ds(trnMentions, devMentions, tstMentions, t2idx, hdf5_file, vectorfile, upto=-1):
    """Build the entity/type cosine ('tc') feature dataset and write it to HDF5.

    For every mention (train, then dev, then test) the embedding of its
    ``entityId`` is looked up; entities missing from the embedding vocabulary
    keep an all-zero vector.  The resulting matrix is handed, together with the
    type-vector matrix, to ``buildcosinematrix`` and the result is stored in
    the dataset ``'tc'`` of the file ``hdf5_file + '_tc.h5py'``.

    Args:
        trnMentions: training mentions; each must expose ``entityId``.
        devMentions: development mentions.
        tstMentions: test mentions.
        t2idx: type-to-index mapping, forwarded to ``buildtypevecmatrix``.
        hdf5_file: output path prefix; ``'_tc.h5py'`` is appended.
        vectorfile: embeddings file, forwarded to ``read_embeddings``.
        upto: forwarded to ``read_embeddings`` (default ``-1``).
    """
    (embeddings, voc2idx, vectorsize) = read_embeddings(vectorfile, upto)

    nsamples_train = len(trnMentions)
    nsamples_dev = len(devMentions)
    totals = nsamples_train + nsamples_dev + len(tstMentions)

    # Rows start out as zeros, so only entities that have an embedding need
    # to be written; unknown entities simply keep their zero row.
    input_entvec = numpy.zeros(shape=(totals, vectorsize), dtype='float32')
    for i, men in enumerate(trnMentions + devMentions + tstMentions):
        mye = men.entityId
        if mye in voc2idx:
            input_entvec[i] = embeddings[voc2idx[mye]]

    # Original note: a matrix with size 102 * dim.
    typevecmatrix = buildtypevecmatrix(t2idx, embeddings, vectorsize, voc2idx)
    ent_types_cosin_matrix = buildcosinematrix(input_entvec, typevecmatrix)
    logger.info(ent_types_cosin_matrix.shape)

    hdf5_file += '_tc.h5py'
    # The context manager guarantees the file handle is closed even if
    # writing the dataset or its attributes raises.
    with h5py.File(hdf5_file, mode='w') as f:
        features = f.create_dataset('tc', ent_types_cosin_matrix.shape,
                                    dtype='float32')  # @UndefinedVariable
        features.attrs['vectorsize'] = ent_types_cosin_matrix.shape[1]
        features[...] = ent_types_cosin_matrix
        features.dims[0].label = 'types_ent_cosine'

        # Row ranges of the three splits inside the concatenated matrix.
        split_dict = {
            'train': {'tc': (0, nsamples_train)},
            'dev': {'tc': (nsamples_train, nsamples_train + nsamples_dev)},
            'test': {'tc': (nsamples_train + nsamples_dev, totals)},
        }
        f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        f.flush()

    logger.info('Building types-ent cosine (tc) dataset finished. It saved in: %s', hdf5_file)
def save_typevecmatrix(t2idx, dsdir, vectorfile, upto=-1):
    """Save the transposed type-vector matrix to ``dsdir + '_typematrix.npy'``.

    The embeddings are loaded with ``read_embeddings(vectorfile, upto)`` and
    the matrix itself is produced by ``buildtypevecmatrix``.
    """
    embeddings, voc2idx, vectorsize = read_embeddings(vectorfile, upto)
    # Original note: a matrix with size 102 * dim (before transposition).
    type_matrix = buildtypevecmatrix(t2idx, embeddings, vectorsize, voc2idx)
    out_path = dsdir + '_typematrix.npy'
    numpy.save(out_path, numpy.transpose(type_matrix))
def build_typecosine_ds(trnMentions, devMentions, tstMentions, t2idx, hdf5_file, vectorfile, upto=-1):
    """Build the entity/type cosine ('tc') feature dataset and write it to HDF5.

    Each mention (train, dev, test, in that order) contributes one row: the
    embedding of its ``entityId``, or zeros when the id is not in the
    embedding vocabulary.  ``buildcosinematrix`` is applied to these rows and
    the type-vector matrix; its result is written to the dataset ``'tc'`` in
    ``hdf5_file + '_tc.h5py'`` together with the train/dev/test row ranges.

    Args:
        trnMentions: training mentions; each must expose ``entityId``.
        devMentions: development mentions.
        tstMentions: test mentions.
        t2idx: type-to-index mapping, forwarded to ``buildtypevecmatrix``.
        hdf5_file: output path prefix; ``'_tc.h5py'`` is appended.
        vectorfile: embeddings file, forwarded to ``read_embeddings``.
        upto: forwarded to ``read_embeddings`` (default ``-1``).
    """
    (embeddings, voc2idx, vectorsize) = read_embeddings(vectorfile, upto)

    nsamples_train = len(trnMentions)
    nsamples_dev = len(devMentions)
    totals = nsamples_train + nsamples_dev + len(tstMentions)

    # The matrix is zero-initialised, so rows of entities without an
    # embedding are left untouched instead of being overwritten with zeros.
    input_entvec = numpy.zeros(shape=(totals, vectorsize), dtype='float32')
    for i, men in enumerate(trnMentions + devMentions + tstMentions):
        mye = men.entityId
        if mye in voc2idx:
            input_entvec[i] = embeddings[voc2idx[mye]]

    typevecmatrix = buildtypevecmatrix(
        t2idx, embeddings, vectorsize, voc2idx)  # a matrix with size: 102 * dim
    ent_types_cosin_matrix = buildcosinematrix(input_entvec, typevecmatrix)
    logger.info(ent_types_cosin_matrix.shape)

    hdf5_file += '_tc.h5py'
    # ``with`` closes the HDF5 handle on every exit path, including errors
    # raised while the dataset or its attributes are being written.
    with h5py.File(hdf5_file, mode='w') as f:
        features = f.create_dataset('tc', ent_types_cosin_matrix.shape,
                                    dtype='float32')  # @UndefinedVariable
        features.attrs['vectorsize'] = ent_types_cosin_matrix.shape[1]
        features[...] = ent_types_cosin_matrix
        features.dims[0].label = 'types_ent_cosine'

        split_dict = {
            'train': {
                'tc': (0, nsamples_train)
            },
            'dev': {
                'tc': (nsamples_train, nsamples_train + nsamples_dev)
            },
            'test': {
                'tc': (nsamples_train + nsamples_dev, totals)
            }
        }
        f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        f.flush()

    logger.info(
        'Building types-ent cosine (tc) dataset finished. It saved in: %s',
        hdf5_file)
def build_type_words_cosine_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir, vectorfile, upto=-1, max_num_words=4):
    """Build the name-words/type cosine ('tcwords') dataset and write it to HDF5.

    For every mention (train, then dev, then test) the mention ``name`` is
    split on whitespace, mapped to word indices by ``get_ngram_seq`` and the
    corresponding word embeddings are averaged.  The averaged vectors and the
    type-vector matrix are passed to ``buildcosinematrix``; the result is
    stored in the dataset ``'tcwords'`` of ``dsdir + '_tcwords.h5py'``.

    Args:
        trnMentions: training mentions; each must expose ``name``.
        devMentions: development mentions.
        tstMentions: test mentions.
        t2idx: type-to-index mapping, forwarded to ``buildtypevecmatrix``.
        dsdir: output path prefix; ``'_tcwords.h5py'`` is appended.
        vectorfile: embeddings file (read once for words, once for types).
        upto: forwarded to the embedding readers (default ``-1``).
        max_num_words: forwarded to ``get_ngram_seq`` as ``max_len``.
    """
    all_mentions = trnMentions + devMentions + tstMentions
    # train for characters because we only use entities names for characters
    word_to_idx, idx_to_word = build_word_vocab(all_mentions)
    logger.info('word vocab size: %d', len(word_to_idx))

    nsamples_train = len(trnMentions)
    nsamples_dev = len(devMentions)
    totals = nsamples_train + nsamples_dev + len(tstMentions)

    idx2embeddings, vectorsize = read_embeddings_vocab(vectorfile, vocab=word_to_idx, num=upto)
    input_avg = numpy.zeros(shape=(totals, vectorsize), dtype='float32')
    for i, men in enumerate(all_mentions):
        words = men.name.split()
        seq_words = get_ngram_seq(word_to_idx, words, max_len=max_num_words)
        # An empty index sequence would divide 0 by 0 and fill the row with
        # NaN; leave the (already zero) row untouched in that case.
        if not len(seq_words):
            continue
        avgvec = numpy.zeros(shape=(vectorsize))
        for ii in seq_words:
            avgvec += idx2embeddings[ii]
        avgvec /= len(seq_words)
        input_avg[i] = avgvec

    (embeddings, voc2idx, vectorsize) = read_embeddings(vectorfile, upto)
    # Original note: a matrix with size 102 * dim.
    typevecmatrix = buildtypevecmatrix(t2idx, embeddings, vectorsize, voc2idx)
    words_types_cosin_matrix = buildcosinematrix(input_avg, typevecmatrix)
    logger.info(words_types_cosin_matrix.shape)

    dsdir += '_tcwords.h5py'
    # The context manager guarantees the file handle is closed even if
    # writing the dataset or its attributes raises.
    with h5py.File(dsdir, mode='w') as f:
        features = f.create_dataset('tcwords', words_types_cosin_matrix.shape,
                                    dtype='float32')  # @UndefinedVariable
        features.attrs['vectorsize'] = words_types_cosin_matrix.shape[1]
        features[...] = words_types_cosin_matrix
        features.dims[0].label = 'words_types_cosine'

        # Row ranges of the three splits inside the concatenated matrix.
        split_dict = {
            'train': {'tcwords': (0, nsamples_train)},
            'dev': {'tcwords': (nsamples_train, nsamples_train + nsamples_dev)},
            'test': {'tcwords': (nsamples_train + nsamples_dev, totals)},
        }
        f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        f.flush()

    logger.info('Building types-words cosine (tcwords) dataset finished. It saved in: %s', dsdir)
def save_typevecmatrix(t2idx, dsdir, vectorfile, upto=-1):
    """Write the transposed type-vector matrix to ``dsdir + '_typematrix.npy'``.

    ``read_embeddings(vectorfile, upto)`` supplies the embeddings, vocabulary
    index and vector size that ``buildtypevecmatrix`` needs.
    """
    loaded = read_embeddings(vectorfile, upto)
    embeddings, voc2idx, vectorsize = loaded
    # Original note: a matrix with size 102 * dim (before transposition).
    type_matrix = buildtypevecmatrix(t2idx, embeddings, vectorsize, voc2idx)
    transposed = numpy.transpose(type_matrix)
    numpy.save(dsdir + '_typematrix.npy', transposed)
# Command-line arguments: path of the test file and of the output file.
testfile = sys.argv[2]
outf = sys.argv[3]

use_tanh_out = False
outputtype = config['outtype']  # hinge or softmax

# The cosine(e, t) feature is optional and disabled unless the config
# carries a 'typecosine' entry.
usetypecosine = (utils.str_to_bool(config['typecosine'])
                 if 'typecosine' in config else False)

(t2ind, n_targets, wordvectors, vectorsize,
 typefreq_traindev) = utils.loadTypesAndVectors(targetTypesFile, vectorFile)
(rvt, input_matrix_test, iet, resvectstnall,
 ntrn) = utils.fillOnlyEntityData(testfile, vectorsize, wordvectors, t2ind,
                                  n_targets, upto=-1, ds='test',
                                  binoutvec=True)

# train network
rng = numpy.random.RandomState(23455)
if usetypecosine:
    # The parenthesised form prints the same text under Python 2's print
    # statement.
    print('using cosine(e,t) as another input feature')
    # Original note: a matrix with size 102 * dim.
    typevecmatrix = utils.buildtypevecmatrix(t2ind, wordvectors, vectorsize)
    e2simmatrix_test = utils.buildcosinematrix(input_matrix_test,
                                               typevecmatrix)
    # Append the similarity features to the test input matrix.
    input_matrix_test = utils.extend_in_matrix(input_matrix_test,
                                               e2simmatrix_test)

dt = theano.config.floatX  # @UndefinedVariable
index = T.lscalar()  # index to a [mini]batch
x = T.matrix('x')  # symbolic input matrix
y = T.imatrix('y')  # the labels, as a symbolic int matrix

######################
# BUILD ACTUAL MODEL #
######################
print('... building the model')
rng = numpy.random.RandomState(23455)
layer1 = layers.HiddenLayer(
    rng,
    input=x,
    n_in=input_matrix_test.shape[1],
    n_out=num_of_hidden_units,
    activation=T.tanh)
def build_type_words_cosine_ds(trnMentions,
                               devMentions,
                               tstMentions,
                               t2idx,
                               dsdir,
                               vectorfile,
                               upto=-1,
                               max_num_words=4):
    """Build the name-words/type cosine ('tcwords') dataset and write it to HDF5.

    Each mention (train, dev, test, in that order) contributes one row: the
    average of the word embeddings selected by ``get_ngram_seq`` from the
    whitespace-split mention ``name``.  ``buildcosinematrix`` is applied to
    these rows and the type-vector matrix; its result is written to the
    dataset ``'tcwords'`` in ``dsdir + '_tcwords.h5py'`` together with the
    train/dev/test row ranges.

    Args:
        trnMentions: training mentions; each must expose ``name``.
        devMentions: development mentions.
        tstMentions: test mentions.
        t2idx: type-to-index mapping, forwarded to ``buildtypevecmatrix``.
        dsdir: output path prefix; ``'_tcwords.h5py'`` is appended.
        vectorfile: embeddings file (read once for words, once for types).
        upto: forwarded to the embedding readers (default ``-1``).
        max_num_words: forwarded to ``get_ngram_seq`` as ``max_len``.
    """
    all_mentions = trnMentions + devMentions + tstMentions
    # train for characters because we only use entities names for characters
    word_to_idx, idx_to_word = build_word_vocab(all_mentions)
    logger.info('word vocab size: %d', len(word_to_idx))

    nsamples_train = len(trnMentions)
    nsamples_dev = len(devMentions)
    totals = nsamples_train + nsamples_dev + len(tstMentions)

    idx2embeddings, vectorsize = read_embeddings_vocab(vectorfile,
                                                       vocab=word_to_idx,
                                                       num=upto)
    input_avg = numpy.zeros(shape=(totals, vectorsize), dtype='float32')
    for i, men in enumerate(all_mentions):
        words = men.name.split()
        seq_words = get_ngram_seq(word_to_idx, words, max_len=max_num_words)
        # Dividing by an empty sequence's length would turn the row into
        # NaN; keep the zero row instead when there is nothing to average.
        if not len(seq_words):
            continue
        avgvec = numpy.zeros(shape=(vectorsize))
        for ii in seq_words:
            avgvec += idx2embeddings[ii]
        avgvec /= len(seq_words)
        input_avg[i] = avgvec

    (embeddings, voc2idx, vectorsize) = read_embeddings(vectorfile, upto)
    typevecmatrix = buildtypevecmatrix(
        t2idx, embeddings, vectorsize, voc2idx)  # a matrix with size: 102 * dim
    words_types_cosin_matrix = buildcosinematrix(input_avg, typevecmatrix)
    logger.info(words_types_cosin_matrix.shape)

    dsdir += '_tcwords.h5py'
    # ``with`` closes the HDF5 handle on every exit path, including errors
    # raised while the dataset or its attributes are being written.
    with h5py.File(dsdir, mode='w') as f:
        features = f.create_dataset('tcwords',
                                    words_types_cosin_matrix.shape,
                                    dtype='float32')  # @UndefinedVariable
        features.attrs['vectorsize'] = words_types_cosin_matrix.shape[1]
        features[...] = words_types_cosin_matrix
        features.dims[0].label = 'words_types_cosine'

        split_dict = {
            'train': {
                'tcwords': (0, nsamples_train)
            },
            'dev': {
                'tcwords': (nsamples_train, nsamples_train + nsamples_dev)
            },
            'test': {
                'tcwords': (nsamples_train + nsamples_dev, totals)
            }
        }
        f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        f.flush()

    logger.info(
        'Building types-words cosine (tcwords) dataset finished. It saved in: %s',
        dsdir)
typefreq_traindev) = utils.loadTypesAndVectors(targetTypesFile, vectorFile) (rvt, input_matrix_test, iet, resvectstnall, ntrn) = utils.fillOnlyEntityData(testfile, vectorsize, wordvectors, t2ind, n_targets, upto=-1, ds='test', binoutvec=True) # train network rng = numpy.random.RandomState(23455) if usetypecosine: print 'using cosine(e,t) as another input feature' typevecmatrix = utils.buildtypevecmatrix( t2ind, wordvectors, vectorsize) # a matrix with size: 102 * dim e2simmatrix_test = utils.buildcosinematrix(input_matrix_test, typevecmatrix) input_matrix_test = utils.extend_in_matrix(input_matrix_test, e2simmatrix_test) dt = theano.config.floatX # @UndefinedVariable index = T.lscalar() # index to a [mini]batch x = T.matrix('x') # the data is presented as rasterized images y = T.imatrix('y') # the labels are presented as 1D vector of # [int] labels ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model'