def build_subwords_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir,
                      vectorfile, use_lowercase=False, max_num_words=10, upto=None):
    if vectorfile is None:
        return
    # word vocab is built from the mention names of all three splits
    word_to_idx, idx_to_word = build_word_vocab(trnMentions + devMentions + tstMentions)
    logger.info('word vocab size: %d', len(word_to_idx))
    totals = len(trnMentions) + len(devMentions) + len(tstMentions)
    input_words = numpy.zeros(shape=(totals, max_num_words), dtype='int32')
    for i, men in enumerate(trnMentions + devMentions + tstMentions):
        words = men.name.split()
        input_words[i] = get_ngram_seq(word_to_idx, words, max_len=max_num_words)
    logger.info('shape of subwords dataset: %s', input_words.shape)
    hdf5_file = dsdir + '_subwords.h5py'
    f = h5py.File(hdf5_file, mode='w')
    features = f.create_dataset('subwords', input_words.shape, dtype='int32')
    features.attrs['voc2idx'] = yaml.dump(word_to_idx, default_flow_style=False)
    features.attrs['idx2voc'] = yaml.dump(idx_to_word, default_flow_style=False)
    features.attrs['vocabsize'] = len(word_to_idx)
    features[...] = input_words
    features.dims[0].label = 'words'
    nsamples_train = len(trnMentions)
    nsamples_dev = len(devMentions)
    split_dict = {
        'train': {'subwords': (0, nsamples_train)},
        'dev': {'subwords': (nsamples_train, nsamples_train + nsamples_dev)},
        'test': {'subwords': (nsamples_train + nsamples_dev, totals)}}
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
    logger.info('Building subwords dataset finished. Saved in: %s', hdf5_file)
    logger.info('writing subword embeddings')
    idx2embeddings, vectorsize = read_embeddings_vocab(vectorfile, vocab=word_to_idx,
                                                       use_lowercase=use_lowercase, num=upto)
    with h5py.File(dsdir + '_subwords_embeddings.h5py', mode='w') as fp:
        vectors = fp.create_dataset('vectors', compression='gzip', data=idx2embeddings)
        vectors.attrs['vectorsize'] = vectorsize
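
# A minimal usage sketch, not part of the original pipeline: read the 'train'
# split of the subwords file back with Fuel's H5PYDataset and recover the word
# vocabulary from the dataset attributes. The helper name and the nrows
# parameter are hypothetical; h5py, yaml, and H5PYDataset are the same
# module-level imports already used above.
def example_read_subwords(hdf5_file, nrows=8):
    train_set = H5PYDataset(hdf5_file, which_sets=('train',), sources=('subwords',))
    handle = train_set.open()
    (batch,) = train_set.get_data(handle, request=slice(0, nrows))  # first nrows mention rows
    train_set.close(handle)
    with h5py.File(hdf5_file, mode='r') as f:
        word_to_idx = yaml.safe_load(f['subwords'].attrs['voc2idx'])
    return batch, word_to_idx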

def build_letters_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir,
                     vectorfile=None, max_len_name=30):
    # character vocab is built from training mentions only, since only entity
    # names are used at the character level
    char_to_idx, idx_to_char = build_char_vocab(trnMentions)
    totals = len(trnMentions) + len(devMentions) + len(tstMentions)
    input_letters = numpy.zeros(shape=(totals, max_len_name), dtype='int32')
    for i, men in enumerate(trnMentions + devMentions + tstMentions):
        input_letters[i] = get_ngram_seq(char_to_idx, men.name, max_len_name)
    logger.info('shape of letters dataset: %s', input_letters.shape)
    fuelfile = dsdir + '_letters.h5py'
    f = h5py.File(fuelfile, mode='w')
    features = f.create_dataset('letters', input_letters.shape, dtype='int32')
    features.attrs['voc2idx'] = yaml.dump(char_to_idx, default_flow_style=False)
    features.attrs['idx2voc'] = yaml.dump(idx_to_char, default_flow_style=False)
    features.attrs['vocabsize'] = len(char_to_idx)
    features[...] = input_letters
    features.dims[0].label = 'letters'
    nsamples_train = len(trnMentions)
    nsamples_dev = len(devMentions)
    split_dict = {
        'train': {'letters': (0, nsamples_train)},
        'dev': {'letters': (nsamples_train, nsamples_train + nsamples_dev)},
        'test': {'letters': (nsamples_train + nsamples_dev, totals)}}
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
    logger.info('Building letters dataset finished. Saved in: %s', fuelfile)
    if vectorfile is None:
        return
    embeddings, vectorsize = read_embeddings_vocab(vectorfile, vocab=char_to_idx, num=-1)
    logger.info('size of embedding matrix to save: (%d, %d)', embeddings.shape[0], embeddings.shape[1])
    with h5py.File(dsdir + '_letters_embeddings.h5py', mode='w') as fp:
        vectors = fp.create_dataset('vectors', compression='gzip', data=embeddings)
        vectors.attrs['vectorsize'] = vectorsize
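
# get_ngram_seq is defined elsewhere in this module. A minimal sketch of its
# assumed behavior, for reference: map each token (a character here, a word or
# ngram in the other builders) through the vocab, then truncate or zero-pad to
# a fixed length. The unknown index of 1 and padding index of 0 are assumptions
# not confirmed by this file.
def get_ngram_seq_sketch(voc2idx, tokens, max_len, unk_idx=1, pad_idx=0):
    seq = [voc2idx.get(tok, unk_idx) for tok in tokens][:max_len]
    return numpy.asarray(seq + [pad_idx] * (max_len - len(seq)), dtype='int32')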

def build_desc_features_ds(trnMentions, devMentions, tstMentions, ent2tfidf_features_path,
                           t2idx, dsdir, vectorfile, use_lowercase=True, upto=None):
    if ent2tfidf_features_path is None:
        logger.warning('ignoring tfidf features building...')
        return
    ent2features = load_ent2features(ent2tfidf_features_path)
    word_to_idx, idx_to_word = build_voc_from_features(ent2features)
    logger.info('tfidf desc features vocab size: %d', len(word_to_idx))
    totals = len(trnMentions) + len(devMentions) + len(tstMentions)
    num_features = len(ent2features.values()[0])
    input_features = numpy.zeros(shape=(totals, num_features), dtype='int32')
    ent_no_emb = 0
    for i, men in enumerate(trnMentions + devMentions + tstMentions):
        if men.entityId not in ent2features:
            ent_no_emb += 1
            continue
        features = ent2features[men.entityId]
        input_features[i] = get_ngram_seq(word_to_idx, features, max_len=input_features.shape[1])
    logger.info('shape of tfidf input dataset: %s', input_features.shape)
    logger.info('number of entities without embeddings: %d', ent_no_emb)
    hdf5_file = dsdir + '_desc_features.h5py'
    f = h5py.File(hdf5_file, mode='w')
    features = f.create_dataset('desc_features', input_features.shape, dtype='int32')
    features.attrs['voc2idx'] = yaml.dump(word_to_idx, default_flow_style=False)
    features.attrs['idx2voc'] = yaml.dump(idx_to_word, default_flow_style=False)
    features.attrs['vocabsize'] = len(word_to_idx)
    features[...] = input_features
    features.dims[0].label = 'description_features'
    nsamples_train = len(trnMentions)
    nsamples_dev = len(devMentions)
    split_dict = {
        'train': {'desc_features': (0, nsamples_train)},
        'dev': {'desc_features': (nsamples_train, nsamples_train + nsamples_dev)},
        'test': {'desc_features': (nsamples_train + nsamples_dev, totals)}}
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
    logger.info('Building desc_features dataset finished. Saved in: %s', hdf5_file)
    logger.info('writing word embeddings')
    idx2embeddings, vectorsize = read_embeddings_vocab(vectorfile, vocab=word_to_idx,
                                                       use_lowercase=use_lowercase, num=upto)
    logger.info('embeddings shape: %s', idx2embeddings.shape)
    with h5py.File(dsdir + '_desc_features_embeddings.h5py', mode='w') as fp:
        vectors = fp.create_dataset('vectors', compression='gzip', data=idx2embeddings)
        vectors.attrs['vectorsize'] = vectorsize
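
# A small sketch, assuming the two files written above: look up the embedding
# of a single description-feature word by combining the voc2idx attribute of
# the _desc_features file with the companion embeddings matrix. The helper
# name is hypothetical.
def example_lookup_desc_embedding(dsdir, word):
    with h5py.File(dsdir + '_desc_features.h5py', mode='r') as f:
        word_to_idx = yaml.safe_load(f['desc_features'].attrs['voc2idx'])
    with h5py.File(dsdir + '_desc_features_embeddings.h5py', mode='r') as f:
        vectors = f['vectors'][...]
    return vectors[word_to_idx[word]]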

def build_type_words_cosine_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir,
                               vectorfile, upto=-1, max_num_words=4):
    # word vocab is built from the mention names of all three splits
    word_to_idx, idx_to_word = build_word_vocab(trnMentions + devMentions + tstMentions)
    logger.info('word vocab size: %d', len(word_to_idx))
    totals = len(trnMentions) + len(devMentions) + len(tstMentions)
    idx2embeddings, vectorsize = read_embeddings_vocab(vectorfile, vocab=word_to_idx, num=upto)
    input_avg = numpy.zeros(shape=(totals, vectorsize), dtype='float32')
    for i, men in enumerate(trnMentions + devMentions + tstMentions):
        words = men.name.split()
        # average the embeddings of the (padded/truncated) word index sequence
        seq_words = get_ngram_seq(word_to_idx, words, max_len=max_num_words)
        avgvec = numpy.zeros(shape=(vectorsize,))
        for ii in seq_words:
            avgvec += idx2embeddings[ii]
        avgvec /= len(seq_words)
        input_avg[i] = avgvec
    embeddings, voc2idx, vectorsize = read_embeddings(vectorfile, upto)
    typevecmatrix = buildtypevecmatrix(t2idx, embeddings, vectorsize, voc2idx)  # num_types x dim
    words_types_cosin_matrix = buildcosinematrix(input_avg, typevecmatrix)
    logger.info('shape of words-types cosine matrix: %s', words_types_cosin_matrix.shape)
    hdf5_file = dsdir + '_tcwords.h5py'
    f = h5py.File(hdf5_file, mode='w')
    features = f.create_dataset('tcwords', words_types_cosin_matrix.shape, dtype='float32')
    features.attrs['vectorsize'] = words_types_cosin_matrix.shape[1]
    features[...] = words_types_cosin_matrix
    features.dims[0].label = 'words_types_cosine'
    nsamples_train = len(trnMentions)
    nsamples_dev = len(devMentions)
    split_dict = {
        'train': {'tcwords': (0, nsamples_train)},
        'dev': {'tcwords': (nsamples_train, nsamples_train + nsamples_dev)},
        'test': {'tcwords': (nsamples_train + nsamples_dev, totals)}}
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
    logger.info('Building types-words cosine (tcwords) dataset finished. Saved in: %s', hdf5_file)
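
# buildcosinematrix is defined elsewhere. A minimal numpy sketch of the assumed
# computation: L2-normalize the rows of both matrices and take the dot product,
# giving the (num_mentions, num_types) cosine-similarity matrix used above. The
# eps guard against zero rows is an assumption.
def buildcosinematrix_sketch(mention_vecs, type_vecs, eps=1e-8):
    m = mention_vecs / (numpy.linalg.norm(mention_vecs, axis=1, keepdims=True) + eps)
    t = type_vecs / (numpy.linalg.norm(type_vecs, axis=1, keepdims=True) + eps)
    return numpy.dot(m, t.T).astype('float32')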

def build_ngram_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir,
                   vectorfile, ngram, max_num_ngrams=98, upto=-1):
    # ngram vocab is built from the mention names of all three splits
    ngram_to_idx, idx_to_word, name2ngrams = build_ngram_vocab(
        trnMentions + devMentions + tstMentions, ngram=ngram, MIN_FREQ=5)
    logger.info('ngram%d vocab size: %d', ngram, len(ngram_to_idx))
    totals = len(trnMentions) + len(devMentions) + len(tstMentions)
    input_words = numpy.zeros(shape=(totals, max_num_ngrams), dtype='int32')
    for i, men in enumerate(trnMentions + devMentions + tstMentions):
        ngrams = name2ngrams[men.name]
        input_words[i] = get_ngram_seq(ngram_to_idx, ngrams, max_len=max_num_ngrams)
    logger.info('shape of ngram%d dataset: %s', ngram, input_words.shape)
    ngram_label = 'ngrams' + str(ngram)
    hdf5_file = dsdir + '_' + ngram_label + '.h5py'
    f = h5py.File(hdf5_file, mode='w')
    features = f.create_dataset(ngram_label, input_words.shape, dtype='int32')
    features.attrs['voc2idx'] = yaml.dump(ngram_to_idx, default_flow_style=False)
    features.attrs['idx2voc'] = yaml.dump(idx_to_word, default_flow_style=False)
    features.attrs['vocabsize'] = len(ngram_to_idx)
    features[...] = input_words
    features.dims[0].label = ngram_label
    nsamples_train = len(trnMentions)
    nsamples_dev = len(devMentions)
    split_dict = {
        'train': {ngram_label: (0, nsamples_train)},
        'dev': {ngram_label: (nsamples_train, nsamples_train + nsamples_dev)},
        'test': {ngram_label: (nsamples_train + nsamples_dev, totals)}}
    f.attrs['split'] = H5PYDataset.create_split_array(split_dict)
    f.flush()
    f.close()
    logger.info('Building ngram%d dataset finished. Saved in: %s', ngram, hdf5_file)
    if vectorfile is None or vectorfile == '':
        return
    logger.info('writing ngram embeddings')
    embeddings, vectorsize = read_embeddings_vocab(vectorfile, vocab=ngram_to_idx, num=upto)
    logger.info('size of embedding matrix to save: (%d, %d)', embeddings.shape[0], embeddings.shape[1])
    with h5py.File(dsdir + '_' + ngram_label + '_embeddings.h5py', mode='w') as fp:
        vectors = fp.create_dataset('vectors', compression='gzip', data=embeddings)
        vectors.attrs['vectorsize'] = vectorsize
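
# build_ngram_vocab is defined elsewhere. A sketch of the assumed behavior:
# extract character ngrams from every mention name, keep those occurring at
# least MIN_FREQ times, and index them. The reserved indices 0 (padding) and
# 1 (unknown) mirror the assumption in get_ngram_seq_sketch above; any
# boundary-marker handling in the real implementation is not reproduced here.
def build_ngram_vocab_sketch(mentions, ngram, MIN_FREQ=5):
    from collections import defaultdict
    counts = defaultdict(int)
    name2ngrams = {}
    for men in mentions:
        grams = [men.name[j:j + ngram] for j in range(len(men.name) - ngram + 1)]
        name2ngrams[men.name] = grams
        for g in grams:
            counts[g] += 1
    kept = sorted(g for g, c in counts.items() if c >= MIN_FREQ)
    ngram_to_idx = dict((g, i + 2) for i, g in enumerate(kept))  # 0: pad, 1: unk (assumed)
    idx_to_ngram = dict((i, g) for g, i in ngram_to_idx.items())
    return ngram_to_idx, idx_to_ngram, name2ngrams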