import os

from bilm import dump_bilm_embeddings


def prepare_elmo_features(path, dataset, vocab_file, options_file, weight_file):
    """
    Dump the biLM embeddings for a dataset to a file.

    Parameters
    ----------
    path: str
        Directory in which to write the embedding file.
    dataset: str
        Path to the tokenized dataset file, one sentence per line.
    vocab_file: str
    options_file: str
    weight_file: str
    """
    embedding_file = os.path.join(path, "X_elmo_" + dataset + ".hdf5")
    dump_bilm_embeddings(vocab_file, dataset, options_file, weight_file,
                         embedding_file)
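# A usage sketch for prepare_elmo_features.  All paths and the dataset
# name below are placeholder assumptions, not from the source; bilm keys
# each dumped sentence by its string index in the dataset file.
import h5py

prepare_elmo_features(
    path='features',
    dataset='train.txt',  # whitespace-tokenized, one sentence per line
    vocab_file='vocab-2016-09-10.txt',
    options_file='elmo_2x4096_512_2048cnn_2xhighway_options.json',
    weight_file='elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5',
)

with h5py.File(os.path.join('features', 'X_elmo_train.txt.hdf5'), 'r') as fin:
    first_sentence = fin['0'][...]  # (n_layers=3, sequence_length, 1024)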
def process_batch(self, sentences):
    tokenized_context = [sentence.strip().split() for sentence in sentences]

    # Count how often each token occurs across the batch.
    freq_map = {}
    for tokens in tokenized_context:
        for key in tokens:
            freq_map[key] = freq_map.get(key, 0.0) + 1.0

    embedding_map = dump_bilm_embeddings(self.vocab_file, sentences,
                                         self.options_file, self.weight_file)

    # Sum the concatenated three-layer ELMo vector over every occurrence
    # of each token.
    ret_map = {}
    for sent_id in range(len(sentences)):
        sent_embedding = embedding_map[sent_id]
        for i, key in enumerate(tokenized_context[sent_id]):
            concat = np.concatenate([
                sent_embedding[0][i],
                sent_embedding[1][i],
                sent_embedding[2][i],
            ])
            if key in ret_map:
                ret_map[key] = ret_map[key] + concat
            else:
                ret_map[key] = concat
            assert len(ret_map[key]) == 3 * 1024

    # Average each token's summed vector over its occurrence count.
    ret_map_avg = {}
    for key in ret_map:
        ret_map_avg[key] = list(ret_map[key] / freq_map[key])

    tf.reset_default_graph()
    return ret_map_avg
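# A consumption sketch for process_batch.  `ElmoProcessor` and its
# constructor arguments are hypothetical (the enclosing class is not shown
# in the source); what the method itself guarantees is one 3 * 1024-dim
# vector per token, averaged over all occurrences in the batch.
processor = ElmoProcessor(vocab_file='vocab.txt',
                          options_file='options.json',
                          weight_file='weights.hdf5')
token_vectors = processor.process_batch([
    'Pretrained biLMs compute representations useful for NLP tasks .',
    'They give state of the art performance for many tasks .',
])
# 'tasks' occurs twice, so its vector is the mean of two occurrences.
assert len(token_vectors['tasks']) == 3 * 1024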
import argparse
import json

from bilm import dump_bilm_embeddings

parser = argparse.ArgumentParser()
# (Reconstructed: the snippet originally began mid-argument; the --gpu and
# --batchsize flags are inferred from how `args` is used below.)
parser.add_argument('--gpu', '-g', type=int, default=-1,
                    help='GPU id (-1 to run on the CPU)')
parser.add_argument('--batchsize', '-b', type=int, default=32,
                    help='Minibatch size of computation')
parser.add_argument('--input', '-in', '-i', required=True,
                    help='Path of input text file')
parser.add_argument('--output', '-out', '-o', required=True,
                    help='Path of output file to be written')
args = parser.parse_args()
print(json.dumps(args.__dict__, indent=2))

# Location of pretrained LM.
vocab_file = 'vocab-2016-09-10.txt'
options_file = 'elmo_2x4096_512_2048cnn_2xhighway_options.json'
weight_file = 'elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5'

dataset_file = args.input
embedding_file = args.output
assert args.input != args.output

dump_bilm_embeddings(vocab_file, dataset_file, options_file, weight_file,
                     embedding_file, gpu=args.gpu, batchsize=args.batchsize)
import os

import h5py

from bilm import dump_bilm_embeddings

# Our small dataset.
raw_context = [
    'Pretrained biLMs compute representations useful for NLP tasks .',
    'They give state of the art performance for many tasks .'
]
tokenized_context = [sentence.split() for sentence in raw_context]
tokenized_question = [
    ['What', 'are', 'biLMs', 'useful', 'for', '?'],
]

# Create the dataset file.
dataset_file = 'dataset_file.txt'
with open(dataset_file, 'w') as fout:
    for sentence in tokenized_context + tokenized_question:
        fout.write(' '.join(sentence) + '\n')

# Location of pretrained LM.  Here we use the test fixtures.
datadir = os.path.join('tests', 'fixtures', 'model')
vocab_file = os.path.join(datadir, 'vocab_test.txt')
options_file = os.path.join(datadir, 'options.json')
weight_file = os.path.join(datadir, 'lm_weights.hdf5')

# Dump the embeddings to a file.  Run this once for your dataset.
embedding_file = 'elmo_embeddings.hdf5'
dump_bilm_embeddings(
    vocab_file, dataset_file, options_file, weight_file, embedding_file
)

# Load the embeddings from the file -- here the 2nd sentence.
with h5py.File(embedding_file, 'r') as fin:
    second_sentence_embeddings = fin['1'][...]
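# A common next step (a sketch, not part of the original example): collapse
# the three biLM layers into one vector per token.  Here a plain mean; the
# ELMo paper instead learns task-specific scalar layer weights.
import numpy as np

# second_sentence_embeddings: (n_layers=3, sequence_length, embedding_dim)
avg_embeddings = second_sentence_embeddings.mean(axis=0)
assert avg_embeddings.shape[0] == len(tokenized_context[1])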
# Create the dataset file (reconstructed to match the example above).
dataset_file = 'dataset_file.txt'
with open(dataset_file, 'w') as fout:
    for sentence in tokenized_context + tokenized_question:
        fout.write(' '.join(sentence) + '\n')

# Location of pretrained LM.
vocab_file = 'vocab-2016-09-10.txt'
options_file = 'elmo_2x4096_512_2048cnn_2xhighway_options.json'
weight_file = 'elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5'

# Dump the embeddings to a file.  Run this once for your dataset.
embedding_file = 'elmo_embeddings.hdf5'

# GPU id; set gpu = -1 to run on the CPU.
gpu = -1
# Batch size: encoding one token at a time is inefficient, while encoding
# too many tokens at once can run out of memory.
batchsize = 32

dump_bilm_embeddings(
    vocab_file, dataset_file, options_file, weight_file, embedding_file,
    gpu=gpu, batchsize=batchsize
)

# Load the embeddings from the file -- here the 2nd sentence.
with h5py.File(embedding_file, 'r') as fin:
    second_sentence_embeddings = fin['1'][...]
print(second_sentence_embeddings.shape)
# (n_layers=3, sequence_length, embedding_dim)
print(second_sentence_embeddings)
#     for sentence in data:
#         fout.write(sentence + '\n')

# Location of pretrained LM.  Here we use the official small model.
model_dir = '/crs_elmo/bilm-tf/model/official/small'
vocab_file = os.path.join(model_dir, 'vocab-2016-09-10.txt')
elmo_options_file = os.path.join(
    model_dir, 'elmo_2x1024_128_2048cnn_1xhighway_options.json')
elmo_weight_file = os.path.join(
    model_dir, 'elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5')

data_dir = '/crs_elmo/downstream_data/XNLI'
max_seq_len = 50

# Dump the embeddings to a file.  Run this once for your dataset.
embedding_files = [
    'train_elmo_a.hdf5', 'train_elmo_b.hdf5',
    'dev_elmo_a.hdf5', 'dev_elmo_b.hdf5'
]

# Process one (dataset, embedding) pair at a time; see the loop sketch
# after this snippet.  `dataset_files` is assumed to be defined earlier,
# parallel to `embedding_files`.
dataset_file = dataset_files[1]
embedding_file = embedding_files[1]
print(dataset_file, embedding_file)
dataset_file = os.path.join(data_dir, dataset_file)
dump_bilm_embeddings(vocab_file, dataset_file, elmo_options_file,
                     elmo_weight_file, embedding_file, max_seq_len)

# Load the embeddings from the file -- here the 1st sentence.
# with h5py.File(os.path.join(data_dir, embedding_files[0]), 'r') as fin:
#     print("shape:", fin['0'].shape)
#     print(fin['0'][...])
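# The commented-out loop hinted at above, completed as a sketch.
# Assumption: `dataset_files` is a list of tokenized XNLI text files
# parallel to `embedding_files`; its contents are not shown in the source.
for dataset_file, embedding_file in zip(dataset_files, embedding_files):
    print(dataset_file, embedding_file)
    dump_bilm_embeddings(vocab_file,
                         os.path.join(data_dir, dataset_file),
                         elmo_options_file, elmo_weight_file,
                         embedding_file, max_seq_len)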