def load(self, path, debug=False): """Load from disk Parameters ---------- path : str path to the directory which typically contains a config.pkl file and a model.bin file Returns ------- DepParser parser itself """ config = _Config.load(os.path.join(path, 'config.pkl')) if debug: print(config) with open(config.save_vocab_path, 'rb') as f: self._vocab = pickle.load(f) with mx.Context(mxnet_prefer_gpu()): self._parser = self.cls_parser(self._vocab, config.word_dims, config.tag_dims, config.dropout_emb, config.lstm_layers, config.lstm_hiddens, config.dropout_lstm_input, config.dropout_lstm_hidden, config.mlp_arc_size, config.mlp_rel_size, config.dropout_mlp, bert=config.bert_dim, debug=True) self._parser.load(config.save_model_path) self._parser.rnn.pret_word_embs.initialize(ctx=mxnet_prefer_gpu()) return self
def __init__(self, model, detach: bool = True, context: mx.Context = None):
    """Contextual string embeddings of words, as proposed in Akbik et al., 2018.

    Parameters
    ----------
    model : str
        model string, one of 'news-forward', 'news-backward', 'mix-forward', 'mix-backward',
        'german-forward', 'german-backward', depending on which character language model is desired
    detach : bool
        if set to False, the gradient will propagate into the language model. This dramatically
        slows down training and often leads to worse results, so it is not recommended.
    context : mx.Context
        device to run on; defaults to the preferred GPU if one is available
    """
    super().__init__()
    self.static_embeddings = detach
    self.context = context if context else mxnet_prefer_gpu()
    self.lm = ContextualStringModel.load_language_model(model, context=self.context)
    self.detach = detach
    if detach:
        self.lm.freeze()
        self.static_embeddings = True

    self.is_forward_lm = self.lm.is_forward_lm

    with self.context:
        dummy_sentence = Sentence()
        dummy_sentence.add_token(Token('hello'))
        embedded_dummy = self.embed(dummy_sentence)
        self.__embedding_length = len(embedded_dummy[0].get_token(1).get_embedding())
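# A quick usage sketch for the contextual string embeddings defined above, mirroring the
# dummy-sentence probe in __init__. The model directory is a placeholder path; Sentence and
# Token come from the same package as the embedding class.
embedding = CharLMEmbeddings('data/model/lm-news-forward', context=mxnet_prefer_gpu())
sentence = Sentence()
for w in ['I', 'love', 'Berlin', '.']:
    sentence.add_token(Token(w))
embedded = embedding.embed(sentence)
print(len(embedded[0].get_token(1).get_embedding()))  # embedding length of the first token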
def load(self, path, debug=False): """Load from disk Parameters ---------- path : str path to the directory which typically contains a config.pkl file and a model.bin file Returns ------- SDPParser parser itself """ config: _Config = _Config.load(os.path.join(path, 'config.pkl')) if debug: print(config) self._vocab = vocab = ParserVocabulary.load(config.save_vocab_path) with mx.Context(mxnet_prefer_gpu()): self._parser = BiaffineDepParser(vocab, config.word_dims, config.tag_dims, config.dropout_emb, config.lstm_layers, config.lstm_hiddens, config.dropout_lstm_input, config.dropout_lstm_hidden, config.mlp_arc_size, config.mlp_rel_size, config.dropout_mlp, bert=config.bert_dim, debug=True) self._parser.initialize() self._parser.load(config.save_model_path) return self
def fill(self, path):
    super().fill(path)
    for i, second_decoder in enumerate(self.arc_biaffines):
        sd_path = os.path.join(path, 'second_decoder{}.bin'.format(i))
        if os.path.isfile(sd_path):
            second_decoder.load_parameters(sd_path, ctx=mxnet_prefer_gpu())
            freeze(second_decoder)
def fill(self, path):
    rnn_path = os.path.join(path, 'rnn.bin')
    if os.path.isfile(rnn_path):
        # print('load rnn')
        self.rnn.load_parameters(rnn_path, ctx=mxnet_prefer_gpu())
        freeze(self.rnn)
    for i, (mlp, decoder) in enumerate(zip(self.mlps, self.decoders)):
        mlp_path = os.path.join(path, 'mlp{}.bin'.format(i))
        if os.path.isfile(mlp_path):
            # print('load mlp')
            mlp.load_parameters(mlp_path, ctx=mxnet_prefer_gpu())
            freeze(mlp)
        decoder_path = os.path.join(path, 'decoder{}.bin'.format(i))
        if os.path.isfile(decoder_path):
            # print('load decoder')
            decoder.load_parameters(decoder_path, ctx=mxnet_prefer_gpu())
            freeze(decoder)
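# `freeze` is imported from elsewhere in the project and not shown in this section.
# A minimal sketch of what such a helper could look like in Gluon, assuming it only
# needs to stop gradients from flowing into the transferred sub-modules (the project's
# own helper may differ in name and behaviour):
from mxnet import gluon


def freeze(block: gluon.Block):
    """Disable gradient computation for every parameter of ``block``."""
    for param in block.collect_params().values():
        param.grad_req = 'null'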
def load(self, load_path, ctx=None):
    """Load model

    Parameters
    ----------
    load_path : str
        path to model file
    ctx : mx.Context
        device to load the parameters onto; defaults to the preferred GPU
    """
    if not ctx:
        ctx = mxnet_prefer_gpu()
    self.load_parameters(load_path, allow_missing=True, ctx=ctx)
def evaluate(self, test_files: List[str], save_dir=None, logger=None, num_buckets_test=10,
             test_batch_size=5000, bert_path=None, debug=False):
    """Run evaluation on test sets

    Parameters
    ----------
    test_files : List[str]
        paths to the test sets, one per task
    save_dir : str
        where to store intermediate results and log
    logger : logging.Logger
        logger for printing results
    num_buckets_test : int
        number of clusters for sentences from test set
    test_batch_size : int
        batch size of test set

    Returns
    -------
    float
        average LF over the test sets
    """
    parser = self._parser
    with mx.Context(mxnet_prefer_gpu()):
        UF, LF, speed = evaluate_joint_official_script(parser, self._vocab, num_buckets_test,
                                                       test_batch_size, test_files, save_dir,
                                                       bert=bert_path, debug=debug)
        score_str = 'Test\n'
        for dataset, uf, lf in zip(test_files, UF, LF):
            dataset = os.path.basename(dataset)
            uf = uf * 100
            lf = lf * 100
            score_str += '{} UF={:0.1f} LF={:0.1f}\n'.format(dataset, uf, lf)
        LF = sum(LF) / len(LF) * 100
        if logger is None:
            logger = init_logger(save_dir, 'test.log')
        logger.info(score_str + '%d sents/s' % speed)
    return LF
def load_from_file(cls, model_folder, context: mx.Context = None, **kwargs):
    if context is None:
        context = mxnet_prefer_gpu()
    config_path = os.path.join(model_folder, 'config.pkl')
    with open(config_path, 'rb') as f:
        config = pickle.load(f)
    with context:
        embedding_types = [
            WordEmbeddings('{}data/embedding/fasttext100.vec.txt'.format(
                kwargs.get('word_embedding_path', ''))),

            # comment in this line to use character embeddings
            # CharacterEmbeddings(),

            # comment in these lines to use contextual string embeddings
            CharLMEmbeddings('{}data/model/lm-news-forward'.format(
                kwargs.get('word_embedding_path', '')), context=context),
            CharLMEmbeddings('{}data/model/lm-news-backward'.format(
                kwargs.get('word_embedding_path', '')), context=context),
        ]
        embeddings = StackedEmbeddings(embeddings=embedding_types)
        model = SequenceTagger(hidden_size=config['hidden_size'],
                               embeddings=embeddings,
                               tag_dictionary=config['tag_dictionary'],
                               tag_type=config['tag_type'],
                               use_crf=config['use_crf'],
                               use_rnn=config['use_rnn'],
                               rnn_layers=config['rnn_layers'])
        model.load_parameters(os.path.join(model_folder, 'model.bin'), ctx=context)
    return model
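# A usage sketch for load_from_file() above. It is assumed to be exposed as a classmethod
# on SequenceTagger (the first parameter is `cls`); the model folder is a placeholder and
# word_embedding_path is the optional prefix used by the embedding paths above.
tagger = SequenceTagger.load_from_file('data/model/conll03-ner', context=mx.cpu(),
                                       word_embedding_path='')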
def evaluate(self, test_file, save_dir=None, logger=None, num_buckets_test=10,
             test_batch_size=5000, bert_path=None):
    parser = self._parser
    vocab = self._vocab
    with mx.Context(mxnet_prefer_gpu()):
        UF, LF, speed = dep_evaluate_official_script(parser, vocab, num_buckets_test, test_batch_size,
                                                     test_file,
                                                     os.path.join(save_dir, 'test.predict.conll'),
                                                     bert=bert_path)
    if logger is None:
        logger = init_logger(save_dir, 'test.log')
    logger.info('Test: UF=%.2f%% LF=%.2f%% %d sents/s' % (UF, LF, speed))
    return LF
def parse(self, sentence):
    """Parse a raw sentence into a ConllSentence

    Parameters
    ----------
    sentence : list
        a list of (word, tag) tuples

    Returns
    -------
    ConllSentence
        ConllSentence object
    """
    words = np.zeros((len(sentence) + 1, 1), np.int32)
    tags = np.zeros((len(sentence) + 1, 1), np.int32)
    words[0, 0] = ParserVocabulary.ROOT
    tags[0, 0] = ParserVocabulary.ROOT
    vocab = self._vocab
    for i, (word, tag) in enumerate(sentence):
        words[i + 1, 0], tags[i + 1, 0] = vocab.word2id(word.lower()), vocab.tag2id(tag)
    with mx.Context(mxnet_prefer_gpu()):
        outputs = self._parser.forward(words, tags)
    words = []
    for arc, rel, (word, tag) in zip(outputs[0][0], outputs[0][1], sentence):
        words.append(ConllWord(id=len(words) + 1, form=word, pos=tag, head=arc,
                               relation=vocab.id2rel(rel)))
    return ConllSentence(words)
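# A quick usage sketch for parse() above. The wrapper class name (DepParser) and the model
# directory are placeholders; the (word, tag) input format follows the docstring.
parser = DepParser()                      # hypothetical wrapper exposing load()/parse()
parser.load('data/model/ptb-dep')         # placeholder model directory
sent = [('Is', 'VBZ'), ('this', 'DT'), ('the', 'DT'), ('future', 'NN'),
        ('of', 'IN'), ('chamber', 'NN'), ('music', 'NN'), ('?', '.')]
print(parser.parse(sent))                 # prints a CoNLL-formatted ConllSentence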
def train(self, train_file: List[str], dev_file: List[str], save_dir, pretrained_embeddings_file=None,
          min_occur_count=2, lstm_layers=3, word_dims=100, tag_dims=100, dropout_emb=0.33,
          lstm_hiddens=400, dropout_lstm_input=0.33, dropout_lstm_hidden=0.33, mlp_arc_size=500,
          mlp_rel_size=100, dropout_mlp=0.33, learning_rate=1e-3, decay=.75, decay_steps=5000,
          beta_1=.9, beta_2=.9, epsilon=1e-12, num_buckets_train=40, num_buckets_valid=10,
          train_iters=50000, train_batch_size=5000, dev_batch_size=5000, validate_every=100,
          save_after=5000, root='root', transfer=None, bert_path=None, debug=False):
    """Train a deep biaffine dependency parser

    Parameters
    ----------
    train_file : List[str]
        paths to the training sets, one per task
    dev_file : List[str]
        paths to the dev sets, one per task
    save_dir : str
        a directory for saving model and related meta-data
    pretrained_embeddings_file : str
        pre-trained embeddings file, plain text format
    min_occur_count : int
        threshold of rare words, which will be replaced with UNKs
    lstm_layers : int
        layers of lstm
    word_dims : int
        dimension of word embedding
    tag_dims : int
        dimension of tag embedding
    dropout_emb : float
        word dropout
    lstm_hiddens : int
        size of lstm hidden states
    dropout_lstm_input : float
        dropout on x in variational RNN
    dropout_lstm_hidden : float
        dropout on h in variational RNN
    mlp_arc_size : int
        output size of MLP for arc feature extraction
    mlp_rel_size : int
        output size of MLP for rel feature extraction
    dropout_mlp : float
        dropout on the output of LSTM
    learning_rate : float
        learning rate
    decay : float
        see ExponentialScheduler
    decay_steps : int
        see ExponentialScheduler
    beta_1 : float
        see Adam optimizer
    beta_2 : float
        see Adam optimizer
    epsilon : float
        see Adam optimizer
    num_buckets_train : int
        number of buckets for training data set
    num_buckets_valid : int
        number of buckets for dev data set
    train_iters : int
        training iterations
    train_batch_size : int
        training batch size
    dev_batch_size : int
        dev batch size
    validate_every : int
        validate on dev set every such number of batches
    save_after : int
        skip saving model in early epochs
    root : str
        token for ROOT
    transfer : str
        path to a pre-trained model directory to transfer parameters from (optional)
    bert_path : list
        paths to pre-computed BERT features for the training and dev sets (optional)
    debug : bool
        debug mode

    Returns
    -------
    DepParser
        parser itself
    """
    logger = init_logger(save_dir)
    config = _Config(train_file, dev_file, None, save_dir, pretrained_embeddings_file, min_occur_count,
                     lstm_layers, word_dims, tag_dims, dropout_emb, lstm_hiddens, dropout_lstm_input,
                     dropout_lstm_hidden, mlp_arc_size, mlp_rel_size, dropout_mlp, learning_rate, decay,
                     decay_steps, beta_1, beta_2, epsilon, num_buckets_train, num_buckets_valid, None,
                     train_iters, train_batch_size, 0, debug)
    if transfer:
        with open(os.path.join(transfer, 'vocab.pkl'), 'rb') as f:
            self._vocab = pickle.load(f)
        self._vocab.append(ParserVocabulary(train_file[-1], pretrained_embeddings_file, min_occur_count,
                                            root=root, shared_vocab=self._vocab[0]))
    else:
        for t, d in zip(train_file, dev_file):
            self._vocab.append(ParserVocabulary(t, pretrained_embeddings_file, min_occur_count, root=root,
                                                shared_vocab=None if len(self._vocab) == 0 else self._vocab[0]))
    with open(config.save_vocab_path, 'wb') as f:
        pickle.dump(self._vocab, f)
    for voc in self._vocab:
        voc.log_info(logger)
    with mx.Context(mxnet_prefer_gpu()):
        data_loaders = [DataLoader(t, num_buckets_train, vocab, bert=bert_path[0] if bert_path else None)
                        for t, vocab in zip(train_file, self._vocab)]
        config.bert_dim = data_loaders[0].bert_dim
        config.save()
        self._parser = parser = self.cls_parser(self._vocab, word_dims, tag_dims, dropout_emb, lstm_layers,
                                                lstm_hiddens, dropout_lstm_input, dropout_lstm_hidden,
                                                mlp_arc_size, mlp_rel_size, dropout_mlp,
                                                bert=data_loaders[0].bert_dim, debug=debug)
        if transfer:
            parser.transfer = True
            parser.fill(transfer)
        parser.initialize()
        scheduler = ExponentialScheduler(learning_rate, decay, decay_steps)
        optimizer = mx.optimizer.Adam(learning_rate, beta_1, beta_2, epsilon, lr_scheduler=scheduler)
        trainer = gluon.Trainer(parser.collect_params(), optimizer=optimizer)
        global_step = 0
        best_LF = 0.
        batch_id = 0
        epoch = 1
        total_epoch = math.ceil(train_iters / validate_every)
        logger.info("Epoch {} out of {}".format(epoch, total_epoch))
        bar = Progbar(target=min(validate_every, train_iters))
        gs = [dl.get_batches(batch_size=train_batch_size, shuffle=False) for dl in data_loaders]
        while global_step < train_iters:
            arcs_tasks = []
            rels_tasks = []
            bert_tasks = []
            for g in gs:
                words, bert, tags, arcs, rels = next(g, (None, None, None, None, None))
                if words is None:
                    break
                arcs_tasks.append(arcs)
                rels_tasks.append(rels)
                bert_tasks.append(bert)
            if words is None:
                gs = [dl.get_batches(batch_size=train_batch_size, shuffle=False) for dl in data_loaders]
                continue
            with autograd.record():
                arc_accuracy, rel_accuracy, loss = parser.forward(words, bert, tags, arcs_tasks, rels_tasks)
                loss_value = loss.asscalar()
            loss.backward()
            trainer.step(train_batch_size)
            batch_id += 1
            try:
                bar.update(batch_id, exact=[("LR", rel_accuracy, 2), ("loss", loss_value)])
            except OverflowError:
                pass  # sometimes loss can be 0 or infinity, crashes the bar
            global_step += 1
            if global_step % validate_every == 0:
                batch_id = 0
                UF, LF, speed = evaluate_joint_official_script(parser, self._vocab, num_buckets_valid,
                                                               dev_batch_size, dev_file,
                                                               os.path.join(save_dir, 'dev.predict.conllu'),
                                                               bert=None if bert_path is None else bert_path[1])
                score_str = ''
                for dataset, lf in zip(dev_file, LF):
                    dataset = os.path.basename(dataset).replace('.conllu', '')
                    lf = lf * 100
                    score_str += '{}={:0.1f} '.format(dataset, lf)
                if transfer:
                    LF = LF[-1] * 100
                else:
                    LF = sum(LF) / len(LF) * 100
                score_str += '{}={:0.1f} '.format('avg', LF)
                logger.info(score_str + '%d sents/s' % speed)
                epoch += 1
                bar = Progbar(target=min(validate_every, train_iters - global_step))
                if global_step > save_after and LF > best_LF:
                    logger.info('- new best score!')
                    best_LF = LF
                    parser.save(config.save_model_path)
                if global_step < train_iters:
                    logger.info("Epoch {} out of {}".format(epoch, total_epoch))
        # When validate_every is too big
        if not os.path.isfile(config.save_model_path) or best_LF == 0:
            parser.save(config.save_model_path)
    return self
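# A usage sketch for the joint training routine above. The wrapper class name (JointParser)
# and the file paths are placeholders; the argument structure (lists of treebanks, optional
# (train, dev) BERT feature paths) follows the signatures of train() and evaluate() above.
parser = JointParser()                                   # hypothetical wrapper exposing train()/evaluate()
parser.train(train_file=['data/dm.train.conllu', 'data/pas.train.conllu'],
             dev_file=['data/dm.dev.conllu', 'data/pas.dev.conllu'],
             save_dir='data/model/joint-sdp',
             pretrained_embeddings_file='data/embedding/glove.6B.100d.txt',
             train_iters=50000, validate_every=100)
parser.evaluate(test_files=['data/dm.test.conllu', 'data/pas.test.conllu'],
                save_dir='data/model/joint-sdp')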
def train(self, train_file, dev_file, save_dir, pretrained_embeddings_file=None, min_occur_count=2,
          lstm_layers=3, word_dims=100, tag_dims=100, dropout_emb=0.33, lstm_hiddens=400,
          dropout_lstm_input=0.33, dropout_lstm_hidden=0.33, mlp_arc_size=500, mlp_rel_size=100,
          dropout_mlp=0.33, learning_rate=1e-3, decay=.75, decay_steps=5000, beta_1=.9, beta_2=.9,
          epsilon=1e-12, num_buckets_train=40, num_buckets_valid=10, train_iters=50000,
          train_batch_size=5000, dev_batch_size=5000, validate_every=100, save_after=5000,
          root='root', bert_path=None, interpolation=0.5, debug=False):
    """Train a deep biaffine dependency parser; parameters mirror the documented train() further below,
    plus an ``interpolation`` coefficient passed through to BiaffineDepParser."""
    if pretrained_embeddings_file is None:
        word_dims = 0
    logger = init_logger(save_dir)
    config = _Config(train_file, dev_file, None, save_dir, pretrained_embeddings_file, min_occur_count,
                     lstm_layers, word_dims, tag_dims, dropout_emb, lstm_hiddens, dropout_lstm_input,
                     dropout_lstm_hidden, mlp_arc_size, mlp_rel_size, dropout_mlp, learning_rate, decay,
                     decay_steps, beta_1, beta_2, epsilon, num_buckets_train, num_buckets_valid, None,
                     train_iters, train_batch_size, debug)
    self._vocab = vocab = ParserVocabulary(train_file, pretrained_embeddings_file, min_occur_count, root=root)
    vocab.save(config.save_vocab_path)
    vocab.log_info(logger)
    with mx.Context(mxnet_prefer_gpu()):
        data_loader = DepDataLoader(train_file, num_buckets_train, vocab,
                                    bert=bert_path[0] if bert_path else None)
        config.bert_dim = data_loader.bert_dim
        config.save()
        self._parser = parser = BiaffineDepParser(vocab, word_dims, tag_dims, dropout_emb, lstm_layers,
                                                  lstm_hiddens, dropout_lstm_input, dropout_lstm_hidden,
                                                  mlp_arc_size, mlp_rel_size, dropout_mlp,
                                                  bert=data_loader.bert_dim, interpolation=interpolation,
                                                  debug=debug)
        parser.initialize()
        scheduler = ExponentialScheduler(learning_rate, decay, decay_steps)
        optimizer = mx.optimizer.Adam(learning_rate, beta_1, beta_2, epsilon, lr_scheduler=scheduler)
        trainer = gluon.Trainer(parser.collect_params(), optimizer=optimizer)
        global_step = 0
        best_LF = 0.
        batch_id = 0
        epoch = 1
        total_epoch = math.ceil(train_iters / validate_every)
        logger.info("Epoch {} out of {}".format(epoch, total_epoch))
        bar = Progbar(target=min(validate_every, train_iters))
        while global_step < train_iters:
            for words, bert, tags, arcs, rels in data_loader.get_batches(batch_size=train_batch_size,
                                                                         shuffle=False):
                with autograd.record():
                    arc_accuracy, rel_accuracy, loss = parser.forward(words, bert, tags, arcs, rels)
                    loss_value = loss.asscalar()
                loss.backward()
                trainer.step(train_batch_size)
                batch_id += 1
                try:
                    bar.update(batch_id, exact=[("LR", rel_accuracy, 2), ("loss", loss_value)])
                except OverflowError:
                    pass  # sometimes loss can be 0 or infinity, crashes the bar
                global_step += 1
                if global_step % validate_every == 0:
                    batch_id = 0
                    UF, LF, speed = dep_evaluate_official_script(parser, vocab, num_buckets_valid,
                                                                 dev_batch_size, dev_file,
                                                                 os.path.join(save_dir, 'dev.predict.conllu'),
                                                                 bert=None if bert_path is None else bert_path[1])
                    logger.info('Dev: LF=%.1f%% %d sents/s' % (LF, speed))
                    epoch += 1
                    bar = Progbar(target=min(validate_every, train_iters - global_step))
                    if global_step > save_after and LF > best_LF:
                        logger.info('- new best score!')
                        best_LF = LF
                        parser.save(config.save_model_path)
                    if global_step < train_iters:
                        logger.info("Epoch {} out of {}".format(epoch, total_epoch))
        # When validate_every is too big
        if not os.path.isfile(config.save_model_path):
            parser.save(config.save_model_path)
    return self
def evaluate(self, test_file, save_dir=None, logger=None, num_buckets_test=10, test_batch_size=5000,
             bert_path=None, chinese=False, debug=False):
    """Run evaluation on test set

    Parameters
    ----------
    test_file : str
        path to test set
    save_dir : str
        where to store intermediate results and log
    logger : logging.Logger
        logger for printing results
    num_buckets_test : int
        number of clusters for sentences from test set
    test_batch_size : int
        batch size of test set
    chinese : bool
        use the Chinese SDP evaluation script

    Returns
    -------
    dict or float
        score dictionary for Chinese SDP, otherwise LF
    """
    parser = self._parser
    vocab = self._vocab
    if logger is None:
        logger = init_logger(save_dir, 'test.log')
    with mx.Context(mxnet_prefer_gpu()):
        if chinese:
            result, speed = evaluate_chinese_sdp(parser, vocab, num_buckets_test, test_batch_size,
                                                 test_file,
                                                 os.path.join(save_dir, 'test.predict.conllu'),
                                                 bert=bert_path, debug=debug)
            logger.info(test_file)
            for k, v in result.items():
                logger.info('%s=%.2f%%' % (k, v))
            return result
        else:
            UF, LF, speed = evaluate_official_script(parser, vocab, num_buckets_test, test_batch_size,
                                                     test_file,
                                                     os.path.join(save_dir, os.path.basename(test_file)),
                                                     bert=bert_path, debug=debug)
            UF = UF * 100
            LF = LF * 100
            logger.info('Test: UF=%.2f%% LF=%.2f%% %d sents/s' % (UF, LF, speed))
            return LF
def train(self, base_path: str, sequence_length: int, learning_rate: float = 20, mini_batch_size: int = 100,
          anneal_factor: float = 0.25, patience: int = 10, clip=0.25, max_epochs: int = 10000):
    number_of_splits = len(self.corpus.train_files)
    val_data = self._batchify(self.corpus.valid, mini_batch_size)

    os.makedirs(base_path, exist_ok=True)
    loss_txt = os.path.join(base_path, 'loss.txt')
    savefile = os.path.join(base_path, 'best-lm.pt')

    try:
        with mx.Context(mxnet_prefer_gpu()):
            self.model.initialize()
            best_val_loss = 100000000
            scheduler = ReduceLROnPlateau(lr=learning_rate, verbose=True, factor=anneal_factor,
                                          patience=patience)
            optimizer = mx.optimizer.SGD(learning_rate=learning_rate, lr_scheduler=scheduler)
            trainer = gluon.Trainer(self.model.collect_params(), optimizer=optimizer)

            for epoch in range(1, max_epochs + 1):
                print('Split %d' % epoch + '\t - ({:%H:%M:%S})'.format(datetime.datetime.now()))

                # for group in optimizer.param_groups:
                #     learning_rate = group['lr']

                train_slice = self.corpus.get_next_train_slice()
                train_data = self._batchify(train_slice, mini_batch_size)
                print('\t({:%H:%M:%S})'.format(datetime.datetime.now()))

                # go into train mode
                # self.model.train()

                # reset variables
                epoch_start_time = time.time()
                total_loss = 0
                start_time = time.time()
                hidden = self.model.init_hidden(mini_batch_size)
                cell = hidden.copy()  # not really sure what this does

                ntokens = len(self.corpus.dictionary)

                # do batches
                for batch, i in enumerate(range(0, len(train_data) - 1, sequence_length)):
                    data, targets = self._get_batch(train_data, i, sequence_length)

                    # Starting each batch, we detach the hidden state from how it was previously produced.
                    # If we didn't, the model would try backpropagating all the way to start of the dataset.
                    hidden = self._repackage_hidden(hidden)
                    cell = self._repackage_hidden(cell)
                    # self.model.zero_grad()
                    # optimizer.zero_grad()

                    # do the forward pass in the model
                    with autograd.record():
                        output, rnn_output, hidden, cell = self.model.forward(data, hidden, cell)
                        # try to predict the targets
                        loss = self.loss_function(output.reshape(-1, ntokens), targets).mean()
                    loss.backward()

                    # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
                    # torch.nn.utils.clip_grad_norm_(self.model.parameters(), clip)
                    trainer.step(mini_batch_size)

                    total_loss += loss.asscalar()

                    if batch % self.log_interval == 0 and batch > 0:
                        cur_loss = total_loss / self.log_interval
                        elapsed = time.time() - start_time
                        print('| split {:3d} /{:3d} | {:5d}/{:5d} batches | ms/batch {:5.2f} | '
                              'loss {:5.2f} | ppl {:8.2f}'.format(
                                  epoch, number_of_splits, batch,
                                  len(train_data) // sequence_length,
                                  elapsed * 1000 / self.log_interval, cur_loss,
                                  self._safe_exp(cur_loss)))
                        total_loss = 0
                        start_time = time.time()

                print('epoch {} done! \t({:%H:%M:%S})'.format(epoch, datetime.datetime.now()))
                scheduler.step(cur_loss)

                ###############################################################################
                # TEST
                ###############################################################################
                # skip evaluation
                # val_loss = self.evaluate(val_data, mini_batch_size, sequence_length)
                # scheduler.step(val_loss)
                #
                # Save the model if the validation loss is the best we've seen so far.
                # if val_loss < best_val_loss:
                #     self.model.save(savefile)
                #     best_val_loss = val_loss
                #     print('best loss so far {:5.2f}'.format(best_val_loss))
                val_loss = cur_loss
                if (self.corpus.current_train_file_index + 1) % 100 == 0 or self.corpus.is_last_slice:
                    self.model.save(savefile)

                ###############################################################################
                # print info
                ###############################################################################
                print('-' * 89)
                local_split_number = epoch % number_of_splits
                if local_split_number == 0:
                    local_split_number = number_of_splits

                summary = '| end of split {:3d} /{:3d} | epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | ' \
                          'valid ppl {:8.2f} | learning rate {:3.2f}'.format(
                              local_split_number, number_of_splits, epoch,
                              (time.time() - epoch_start_time), val_loss,
                              self._safe_exp(val_loss), learning_rate)

                with open(loss_txt, "a") as myfile:
                    myfile.write('%s\n' % summary)

                print(summary)
                print('-' * 89)

    except KeyboardInterrupt:
        print('-' * 89)
        print('Exiting from training early')
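# _repackage_hidden and _get_batch are referenced in the loop above but defined elsewhere in
# the project. A minimal sketch of what a Gluon version of _repackage_hidden could look like,
# assuming it only needs to detach the recurrent state from the previous graph so gradients
# do not flow across truncated-BPTT boundaries (the project's own helper may differ):
import mxnet as mx


def _repackage_hidden(h):
    """Detach hidden state(s) from the computation graph."""
    if isinstance(h, mx.nd.NDArray):
        return h.detach()
    return [_repackage_hidden(v) for v in h]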
                                                train_file='train.short.tsv',
                                                test_file='test.tsv',
                                                dev_file='dev.tsv'
                                                # train_file='debug.tsv',
                                                # test_file='debug.tsv',
                                                # dev_file='debug.tsv'
                                                )

# 2. what tag do we want to predict?
tag_type = 'pos'

# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary.idx2item)

# 4. initialize embeddings
with mx.Context(mxnet_prefer_gpu()):
    embedding_types = [
        # WordEmbeddings('data/embedding/glove/glove.6B.100d.txt'),
        # WordEmbeddings('data/embedding/glove/glove.6B.100d.debug.txt'),
        # CharLMEmbeddings('data/model/lm-news-forward'),
        # CharLMEmbeddings('data/model/lm-news-backward'),
        BERTEmbeddings(['data/embedding/bert_large_cased/wsj.train.short.bert',
                        'data/embedding/bert_large_cased/wsj.dev.bert',
                        'data/embedding/bert_large_cased/wsj.test.bert']),
    ]

    embeddings = StackedEmbeddings(embeddings=embedding_types)

    # 5. initialize sequence tagger
    tagger = SequenceTagger(hidden_size=256,
                            embeddings=embeddings,
# -*- coding:utf-8 -*-
# Author: hankcs
# Date: 2019-01-13 15:01
import os
import pickle

import numpy as np
from bert_embedding import BertEmbedding

from bertsota.common.utils import mxnet_prefer_gpu

bert = BertEmbedding(model='bert_24_1024_16', dataset_name='book_corpus_wiki_en_cased',
                     max_seq_length=270, ctx=mxnet_prefer_gpu())


def embed_bert(sents):
    result = bert.embedding(sents)
    return [np.stack(s[2]) for s in result]


def make_bert_for(path, output):
    print(output)
    os.makedirs(os.path.dirname(output), exist_ok=True)
    total = 0
    with open(path) as src:
        batch = []
        tensor = []
        for line in src:
            line = line.strip()
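# A separate usage sketch for embed_bert() defined above (make_bert_for is truncated by the
# section boundary): feed a small batch of sentences and inspect the per-token vectors. This
# assumes, as the code above does, that bert_embedding returns one entry per sentence with
# the per-token vectors at index 2 of each result tuple.
if __name__ == '__main__':
    sentences = ['Is this the future of chamber music ?',
                 'The quick brown fox jumps over the lazy dog .']
    tensors = embed_bert(sentences)
    for sent, tensor in zip(sentences, tensors):
        # each tensor should be (num_tokens, 1024) for the bert_24_1024_16 model
        print(len(sent.split()), tensor.shape)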
def train(self, train_file, dev_file, save_dir, pretrained_embeddings_file=None, min_occur_count=2, lstm_layers=3, word_dims=100, tag_dims=100, dropout_emb=0.33, lstm_hiddens=400, dropout_lstm_input=0.33, dropout_lstm_hidden=0.33, mlp_arc_size=500, mlp_rel_size=100, dropout_mlp=0.33, learning_rate=1e-3, decay=.75, decay_steps=5000, beta_1=.9, beta_2=.9, epsilon=1e-12, num_buckets_train=40, num_buckets_valid=10, train_iters=50000, train_batch_size=5000, dev_batch_size=5000, validate_every=100, save_after=5000, root='root', bert_path=None, debug=False): """Train a deep biaffine dependency parser Parameters ---------- train_file : str path to training set dev_file : str path to dev set save_dir : str a directory for saving model and related meta-data pretrained_embeddings_file : str pre-trained embeddings file, plain text format min_occur_count : int threshold of rare words, which will be replaced with UNKs, lstm_layers : int layers of lstm word_dims : int dimension of word embedding tag_dims : int dimension of tag embedding dropout_emb : float word dropout lstm_hiddens : int size of lstm hidden states dropout_lstm_input : int dropout on x in variational RNN dropout_lstm_hidden : int dropout on h in variational RNN mlp_arc_size : int output size of MLP for arc feature extraction mlp_rel_size : int output size of MLP for rel feature extraction dropout_mlp : float dropout on the output of LSTM learning_rate : float learning rate decay : float see ExponentialScheduler decay_steps : int see ExponentialScheduler beta_1 : float see ExponentialScheduler beta_2 : float see ExponentialScheduler epsilon : float see ExponentialScheduler num_buckets_train : int number of buckets for training data set num_buckets_valid : int number of buckets for dev data set train_iters : int training iterations train_batch_size : int training batch size dev_batch_size : int test batch size validate_every : int validate on dev set every such number of batches save_after : int skip saving model in early epochs root : str token for ROOT debug : bool debug mode Returns ------- SDPParser parser itself """ if pretrained_embeddings_file is None: word_dims = 0 logger = init_logger(save_dir) config = _Config(train_file, dev_file, None, save_dir, pretrained_embeddings_file, min_occur_count, lstm_layers, word_dims, tag_dims, dropout_emb, lstm_hiddens, dropout_lstm_input, dropout_lstm_hidden, mlp_arc_size, mlp_rel_size, dropout_mlp, learning_rate, decay, decay_steps, beta_1, beta_2, epsilon, num_buckets_train, num_buckets_valid, None, train_iters, train_batch_size, debug) self._vocab = vocab = ParserVocabulary(train_file, pretrained_embeddings_file, min_occur_count, root=root) vocab.save(config.save_vocab_path) vocab.log_info(logger) with mx.Context(mxnet_prefer_gpu()): data_loader = DataLoader(train_file, num_buckets_train, vocab, bert=bert_path[0] if bert_path else None) config.bert_dim = data_loader.bert_dim config.save() self._parser = parser = BiaffineParser(vocab, word_dims, tag_dims, dropout_emb, lstm_layers, lstm_hiddens, dropout_lstm_input, dropout_lstm_hidden, mlp_arc_size, mlp_rel_size, dropout_mlp, bert=data_loader.bert_dim, debug=debug) parser.initialize() scheduler = ExponentialScheduler(learning_rate, decay, decay_steps) optimizer = mx.optimizer.Adam(learning_rate, beta_1, beta_2, epsilon, lr_scheduler=scheduler) trainer = gluon.Trainer(parser.collect_params(), optimizer=optimizer) global_step = 0 best_LF = 0. 
batch_id = 0 epoch = 1 total_epoch = math.ceil(train_iters / validate_every) logger.info("Epoch {} out of {}".format(epoch, total_epoch)) bar = Progbar(target=min(validate_every, train_iters)) while global_step < train_iters: for words, bert, tags, arcs, rels in data_loader.get_batches( batch_size=train_batch_size, shuffle=False): with autograd.record(): arc_accuracy, rel_accuracy, loss = parser.forward( words, bert, tags, arcs, rels) loss_value = loss.asscalar() loss.backward() trainer.step(train_batch_size) batch_id += 1 try: bar.update(batch_id, exact=[("LR", rel_accuracy, 2), ("loss", loss_value)]) except OverflowError: pass # sometimes loss can be 0 or infinity, crashes the bar global_step += 1 if global_step % validate_every == 0: batch_id = 0 UF, LF, speed = evaluate_official_script( parser, vocab, num_buckets_valid, dev_batch_size, dev_file, os.path.join(save_dir, 'dev.predict.conllu'), bert=None if bert_path is None else bert_path[1]) LF = LF * 100 logger.info('Dev: LF=%.1f%% %d sents/s' % (LF, speed)) epoch += 1 bar = Progbar(target=min(validate_every, train_iters - global_step)) if global_step > save_after and LF > best_LF: logger.info('- new best score!') best_LF = LF parser.save(config.save_model_path) if global_step < train_iters: logger.info("Epoch {} out of {}".format( epoch, total_epoch)) # When validate_every is too big if not os.path.isfile(config.save_model_path): parser.save(config.save_model_path) return self
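# A usage sketch for the training routine above. SDPParser is the wrapper class named in the
# docstrings; a no-argument constructor is assumed and the file paths are placeholders.
parser = SDPParser()
parser.train(train_file='data/semeval15/en.dm.train.conllu',
             dev_file='data/semeval15/en.dm.dev.conllu',
             save_dir='data/model/dm',
             pretrained_embeddings_file='data/embedding/glove.6B.100d.txt')
parser.evaluate(test_file='data/semeval15/en.id.dm.conllu', save_dir='data/model/dm')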
corpus = NLPTaskDataFetcher.fetch_column_corpus('data/conll03', columns,
                                                train_file='train.tsv',
                                                test_file='test.tsv',
                                                dev_file='dev.tsv',
                                                tag_to_biloes='ner')

# 2. what tag do we want to predict?
tag_type = 'ner'

# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary.idx2item)

# 4. initialize embeddings
with mx.Context(mxnet_prefer_gpu()):
    embedding_types = [
        WordEmbeddings('data/embedding/glove/glove.6B.100d.txt'),
        BERTEmbeddings(['data/embedding/bert_large_sum/conll03.train.bert',
                        'data/embedding/bert_large_sum/conll03.dev.bert',
                        'data/embedding/bert_large_sum/conll03.test.bert']),

        # comment in this line to use character embeddings
        # CharacterEmbeddings(),

        # comment in these lines to use contextual string embeddings
        CharLMEmbeddings('data/model/lm-news-forward'),
        CharLMEmbeddings('data/model/lm-news-backward'),
    ]
def train(self, base_path: str, learning_rate: float = 0.1, mini_batch_size: int = 32, max_epochs: int = 100,
          anneal_factor: float = 0.5, patience: int = 2, save_model: bool = True,
          embeddings_in_memory: bool = True, train_with_dev: bool = False, context: mx.Context = None,
          show_test=False, cn=False) -> float:
    """
    :param base_path: a folder to store model, log etc.
    :param learning_rate:
    :param mini_batch_size:
    :param max_epochs:
    :param anneal_factor:
    :param patience:
    :param save_model:
    :param embeddings_in_memory:
    :param train_with_dev:
    :return: best dev f1
    """
    evaluation_method = 'F1'
    if self.model.tag_type in ['ner', 'np', 'srl']:
        evaluation_method = 'span-F1'
    if self.model.tag_type in ['pos', 'upos']:
        evaluation_method = 'accuracy'
    print(evaluation_method)

    os.makedirs(base_path, exist_ok=True)
    loss_txt = os.path.join(base_path, "loss.txt")
    open(loss_txt, "w", encoding='utf-8').close()

    anneal_mode = 'min' if train_with_dev else 'max'
    train_data = self.corpus.train

    # if training also uses dev data, include in training set
    if train_with_dev:
        train_data.extend(self.corpus.dev)

    # At any point you can hit Ctrl + C to break out of training early.
    try:
        with mx.Context(context if context else mxnet_prefer_gpu()):
            self.model.initialize()
            scheduler = ReduceLROnPlateau(lr=learning_rate, verbose=True, factor=anneal_factor,
                                          patience=patience, mode=anneal_mode)
            optimizer = mx.optimizer.SGD(learning_rate=learning_rate, lr_scheduler=scheduler,
                                         clip_gradient=5.0)
            trainer = gluon.Trainer(self.model.collect_params(), optimizer=optimizer)
            for epoch in range(0, max_epochs):
                current_loss = 0

                if not self.test_mode:
                    random.shuffle(train_data)

                batches = [train_data[x:x + mini_batch_size]
                           for x in range(0, len(train_data), mini_batch_size)]

                batch_no = 0

                for batch in batches:
                    batch_no += 1

                    # if batch_no % 100 == 0:
                    #     print("%d of %d (%f)" % (batch_no, len(batches), float(batch_no / len(batches))))

                    # Step 4. Compute the loss, gradients, and update the parameters by calling optimizer.step()
                    batch.sort(key=lambda x: len(x), reverse=True)

                    with autograd.record():
                        self.model.embeddings.embed(batch)
                        loss = self.model.neg_log_likelihood(batch, self.model.tag_type)

                    current_loss += loss.sum().asscalar()

                    loss.backward()

                    # torch.nn.utils.clip_grad_norm_(self.model.parameters(), 5.0)
                    # optimizer.step()
                    trainer.step(len(batch))

                    sys.stdout.write("\r%.2f%%" % (batch_no / float(len(batches)) * 100))
                    sys.stdout.flush()

                    if not embeddings_in_memory:
                        self.clear_embeddings_in_batch(batch)

                current_loss /= len(train_data)

                if not train_with_dev:
                    print('.. evaluating... dev... ')
                    dev_score, dev_fp, dev_result = self.evaluate(self.corpus.dev, base_path,
                                                                  evaluation_method=evaluation_method,
                                                                  embeddings_in_memory=embeddings_in_memory,
                                                                  cn=cn)
                else:
                    dev_fp = 0
                    dev_result = '_'

                # anneal against train loss if training with dev, otherwise anneal against dev score
                scheduler.step(current_loss) if train_with_dev else scheduler.step(dev_score)

                # save if model is current best and we use dev data for model selection
                if save_model and not train_with_dev and dev_score == scheduler.best:
                    self.model.save(base_path)

                summary = '%d' % epoch + '\t({:%H:%M:%S})'.format(datetime.datetime.now()) \
                          + '\t%f\t%d\t%f\tDEV %d\t' % (
                              current_loss, scheduler.num_bad_epochs, learning_rate, dev_fp) + dev_result
                summary = summary.replace('\n', '')

                if self.corpus.test and len(self.corpus.test) and show_test:
                    print('test... ')
                    test_score, test_fp, test_result = self.evaluate(self.corpus.test, base_path,
                                                                     evaluation_method=evaluation_method,
                                                                     embeddings_in_memory=embeddings_in_memory,
                                                                     cn=cn)
                    summary += '\tTEST \t%d\t' % test_fp + test_result

                with open(loss_txt, "a") as loss_file:
                    loss_file.write('%s\n' % summary)
                    loss_file.close()

                print(summary)

            # if we do not use dev data for model selection, save final model
            if save_model and train_with_dev:
                self.model.save(base_path)

            return scheduler.best  # return maximum dev f1

    except KeyboardInterrupt:
        print('-' * 89)
        print('Exiting from training early')
        print('saving model')
        self.model.save(base_path + "/final-model")
        print('done')
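# A usage sketch tying the tagger setup snippets earlier in this section to the train() method
# above. SequenceTaggerTrainer is the assumed name of the class that owns train(), and its
# constructor signature (tagger, corpus, test_mode) is an assumption; the save directory is a
# placeholder.
trainer = SequenceTaggerTrainer(tagger, corpus, test_mode=False)
trainer.train('data/model/conll03-ner',
              learning_rate=0.1,
              mini_batch_size=32,
              max_epochs=100,
              embeddings_in_memory=True)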