def build_typecosine_ds(trnMentions, devMentions, tstMentions, t2idx,
                        hdf5_file, vectorfile, upto=-1):
    """Build and save the types-entity cosine ('tc') dataset.

    For every mention's entity embedding, computes the cosine similarity
    against each type embedding and stores the resulting matrix in an HDF5
    file together with train/dev/test split metadata (H5PYDataset format).

    Args:
        trnMentions, devMentions, tstMentions: mention lists; each mention
            exposes an ``entityId`` attribute.
        t2idx: mapping from type label to type index.
        hdf5_file: output path prefix; '_tc.h5py' is appended.
        vectorfile: embeddings file read by ``read_embeddings``.
        upto: max number of embeddings to load (-1 = all).
    """
    embeddings, voc2idx, vectorsize = read_embeddings(vectorfile, upto)
    # Build the concatenated list once; order (train, dev, test) must match
    # the split ranges written below.
    allMentions = trnMentions + devMentions + tstMentions
    totals = len(allMentions)
    input_entvec = numpy.zeros(shape=(totals, vectorsize), dtype='float32')
    for i, men in enumerate(allMentions):
        mye = men.entityId
        # Entities missing from the embedding vocabulary keep a zero row.
        if mye in voc2idx:
            input_entvec[i] = embeddings[voc2idx[mye]]
    # A matrix with one embedding per type, size: num_types * dim.
    typevecmatrix = buildtypevecmatrix(t2idx, embeddings, vectorsize, voc2idx)
    ent_types_cosin_matrix = buildcosinematrix(input_entvec, typevecmatrix)
    logger.info(ent_types_cosin_matrix.shape)
    hdf5_file += '_tc.h5py'
    # Context manager guarantees the file is closed even if a write fails
    # (the original leaked the handle on exception).
    with h5py.File(hdf5_file, mode='w') as f:
        features = f.create_dataset('tc', ent_types_cosin_matrix.shape,
                                    dtype='float32')  # @UndefinedVariable
        features.attrs['vectorsize'] = ent_types_cosin_matrix.shape[1]
        features[...] = ent_types_cosin_matrix
        features.dims[0].label = 'types_ent_cosine'
        nsamples_train = len(trnMentions)
        nsamples_dev = len(devMentions)
        split_dict = {
            'train': {'tc': (0, nsamples_train)},
            'dev': {'tc': (nsamples_train, nsamples_train + nsamples_dev)},
            'test': {'tc': (nsamples_train + nsamples_dev, totals)}}
        f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        f.flush()
    logger.info('Building types-ent cosine (tc) dataset finished. It saved in: %s', hdf5_file)
def save_typevecmatrix(t2idx, dsdir, vectorfile, upto=-1):
    """Compute the type-embedding matrix and save its transpose as .npy.

    The output path is ``dsdir + '_typematrix.npy'``.
    """
    embs, vocab_index, dim = read_embeddings(vectorfile, upto)
    # One embedding row per type, size: num_types * dim.
    type_matrix = buildtypevecmatrix(t2idx, embs, dim, vocab_index)
    out_path = dsdir + '_typematrix.npy'
    numpy.save(out_path, numpy.transpose(type_matrix))
def build_typecosine_ds(trnMentions, devMentions, tstMentions, t2idx,
                        hdf5_file, vectorfile, upto=-1):
    """Create the entity/type cosine-similarity ('tc') HDF5 dataset.

    Looks up each mention's entity embedding, computes its cosine
    similarity against every type embedding, and writes the matrix plus
    train/dev/test split metadata to ``hdf5_file + '_tc.h5py'``.
    """
    embeddings, voc2idx, vectorsize = read_embeddings(vectorfile, upto)
    mentions = trnMentions + devMentions + tstMentions
    totals = len(mentions)
    input_entvec = numpy.zeros(shape=(totals, vectorsize), dtype='float32')
    for row, mention in enumerate(mentions):
        entity = mention.entityId
        vec = numpy.zeros(vectorsize)
        if entity in voc2idx:
            vec = embeddings[voc2idx[entity]]
        input_entvec[row] = vec
    # a matrix with size: num_types * dim
    typevecmatrix = buildtypevecmatrix(t2idx, embeddings, vectorsize, voc2idx)
    ent_types_cosin_matrix = buildcosinematrix(input_entvec, typevecmatrix)
    logger.info(ent_types_cosin_matrix.shape)
    hdf5_file += '_tc.h5py'
    f = h5py.File(hdf5_file, mode='w')
    features = f.create_dataset('tc', ent_types_cosin_matrix.shape,
                                dtype='float32')  # @UndefinedVariable
    features.attrs['vectorsize'] = ent_types_cosin_matrix.shape[1]
    features[...] = ent_types_cosin_matrix
    features.dims[0].label = 'types_ent_cosine'
    n_train = len(trnMentions)
    n_dev = len(devMentions)
    split_dict = {
        'train': {'tc': (0, n_train)},
        'dev': {'tc': (n_train, n_train + n_dev)},
        'test': {'tc': (n_train + n_dev, totals)},
    }
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
    logger.info(
        'Building types-ent cosine (tc) dataset finished. It saved in: %s',
        hdf5_file)
def build_type_words_cosine_ds(trnMentions, devMentions, tstMentions, t2idx,
                               dsdir, vectorfile, upto=-1, max_num_words=4):
    """Build and save the words-types cosine ('tcwords') dataset.

    For each mention, averages the embeddings of up to ``max_num_words``
    words of its surface name, computes cosine similarity of that average
    against every type embedding, and writes the resulting matrix to an
    HDF5 file with train/dev/test split metadata (H5PYDataset format).

    Args:
        trnMentions, devMentions, tstMentions: mention lists; each mention
            exposes ``name``.
        t2idx: mapping from type label to type index.
        dsdir: output path prefix; '_tcwords.h5py' is appended.
        vectorfile: embeddings file for both word and type vectors.
        upto: max number of embeddings to load (-1 = all).
        max_num_words: cap on words taken from each mention name.
    """
    # Word vocab over all mention names (we only use entity names here).
    word_to_idx, idx_to_word = build_word_vocab(
        trnMentions + devMentions + tstMentions)
    logger.info('word vocab size: %d', len(word_to_idx))
    allMentions = trnMentions + devMentions + tstMentions
    totals = len(allMentions)
    idx2embeddings, vectorsize = read_embeddings_vocab(
        vectorfile, vocab=word_to_idx, num=upto)
    input_avg = numpy.zeros(shape=(totals, vectorsize), dtype='float32')
    for i, men in enumerate(allMentions):
        words = men.name.split()
        seq_words = get_ngram_seq(word_to_idx, words, max_len=max_num_words)
        avgvec = numpy.zeros(shape=(vectorsize))
        for idx in seq_words:
            avgvec += idx2embeddings[idx]
        # Guard against an empty sequence (blank name) so we never divide
        # by zero; such rows keep the all-zero average.
        if len(seq_words) > 0:
            avgvec /= len(seq_words)
        input_avg[i] = avgvec
    # Separate full-embedding load for the type vectors.
    embeddings, voc2idx, vectorsize = read_embeddings(vectorfile, upto)
    # A matrix with one embedding per type, size: num_types * dim.
    typevecmatrix = buildtypevecmatrix(t2idx, embeddings, vectorsize, voc2idx)
    words_types_cosin_matrix = buildcosinematrix(input_avg, typevecmatrix)
    logger.info(words_types_cosin_matrix.shape)
    dsdir += '_tcwords.h5py'
    # Context manager guarantees the file is closed even if a write fails
    # (the original leaked the handle on exception).
    with h5py.File(dsdir, mode='w') as f:
        features = f.create_dataset('tcwords', words_types_cosin_matrix.shape,
                                    dtype='float32')  # @UndefinedVariable
        features.attrs['vectorsize'] = words_types_cosin_matrix.shape[1]
        features[...] = words_types_cosin_matrix
        features.dims[0].label = 'words_types_cosine'
        nsamples_train = len(trnMentions)
        nsamples_dev = len(devMentions)
        split_dict = {
            'train': {'tcwords': (0, nsamples_train)},
            'dev': {'tcwords': (nsamples_train, nsamples_train + nsamples_dev)},
            'test': {'tcwords': (nsamples_train + nsamples_dev, totals)}}
        f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
        f.flush()
    logger.info('Building types-words cosine (tcwords) dataset finished. It saved in: %s', dsdir)
def save_typevecmatrix(t2idx, dsdir, vectorfile, upto=-1):
    """Write the transposed type-embedding matrix to '<dsdir>_typematrix.npy'."""
    (embeddings, voc2idx, vectorsize) = read_embeddings(vectorfile, upto)
    # Build the per-type embedding matrix (num_types * dim) and persist
    # its transpose for downstream consumers.
    matrix = buildtypevecmatrix(t2idx, embeddings, vectorsize, voc2idx)
    dsdir += '_typematrix.npy'
    numpy.save(dsdir, numpy.transpose(matrix))
def build_type_words_cosine_ds(trnMentions, devMentions, tstMentions, t2idx,
                               dsdir, vectorfile, upto=-1, max_num_words=4):
    """Produce the 'tcwords' HDF5 dataset of name-average vs. type cosines.

    Averages word embeddings of each mention name (up to ``max_num_words``
    words), measures cosine similarity to every type embedding, and saves
    the matrix with train/dev/test split metadata.
    """
    # train for characters because we only use entities names for characters
    word_to_idx, idx_to_word = build_word_vocab(
        trnMentions + devMentions + tstMentions)
    logger.info('word vocab size: %d', len(word_to_idx))
    mentions = trnMentions + devMentions + tstMentions
    totals = len(mentions)
    idx2embeddings, vectorsize = read_embeddings_vocab(
        vectorfile, vocab=word_to_idx, num=upto)
    input_avg = numpy.zeros(shape=(totals, vectorsize), dtype='float32')
    for row, mention in enumerate(mentions):
        tokens = mention.name.split()
        seq_words = get_ngram_seq(word_to_idx, tokens, max_len=max_num_words)
        acc = numpy.zeros(shape=(vectorsize))
        for widx in seq_words:
            acc += idx2embeddings[widx]
        acc /= len(seq_words)
        input_avg[row] = acc
    (embeddings, voc2idx, vectorsize) = read_embeddings(vectorfile, upto)
    # a matrix with size: num_types * dim
    typevecmatrix = buildtypevecmatrix(t2idx, embeddings, vectorsize, voc2idx)
    words_types_cosin_matrix = buildcosinematrix(input_avg, typevecmatrix)
    logger.info(words_types_cosin_matrix.shape)
    dsdir += '_tcwords.h5py'
    f = h5py.File(dsdir, mode='w')
    features = f.create_dataset('tcwords', words_types_cosin_matrix.shape,
                                dtype='float32')  # @UndefinedVariable
    features.attrs['vectorsize'] = words_types_cosin_matrix.shape[1]
    features[...] = words_types_cosin_matrix
    features.dims[0].label = 'words_types_cosine'
    n_train = len(trnMentions)
    n_dev = len(devMentions)
    split_dict = {
        'train': {'tcwords': (0, n_train)},
        'dev': {'tcwords': (n_train, n_train + n_dev)},
        'test': {'tcwords': (n_train + n_dev, totals)},
    }
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
    logger.info(
        'Building types-words cosine (tcwords) dataset finished. It saved in: %s',
        dsdir)