def __init__(self, args=None, feature_config=None):
    """Load a trained parser model and keep what is needed to run it.

    Args:
        args: Dict of runtime options. When ``None`` a copy of
            ``DEFAULT_PARSER_ARGS`` is used. The keys ``save_dir``,
            ``save_name``, ``shorthand``, ``cuda``, ``cpu`` and
            ``batch_size`` are read here.
        feature_config: Feature configuration stored on the instance. When
            ``None`` a copy of ``FEATURE_CONFIG`` is used.
    """
    if args is None:
        args = DEFAULT_PARSER_ARGS.copy()

    # Previously the attribute was only assigned in the ``None`` case, so a
    # caller-supplied config was dropped and ``self.feature_config`` was
    # never set by this method.
    self.feature_config = (
        FEATURE_CONFIG.copy() if feature_config is None else feature_config)

    if args['save_name'] is not None:
        model_file = args['save_dir'] + '/' + args['save_name']
    else:
        model_file = '{}/{}_parser.pt'.format(args['save_dir'], args['shorthand'])

    # load pretrain; note that we allow the pretrain_file to be non-existent
    pretrain_file = '{}/{}.pretrain.pt'.format(args['save_dir'], args['shorthand'])
    self.pretrain = Pretrain(pretrain_file)

    # load model
    print("Loading model from: {}".format(model_file))
    use_cuda = args['cuda'] and not args['cpu']
    self.trainer = Trainer(pretrain=self.pretrain, model_file=model_file,
                           use_cuda=use_cuda)
    self.loaded_args, self.vocab = self.trainer.args, self.trainer.vocab
    self.batch_size = args['batch_size']

    # load config: path-like options, the shorthand and the mode given by
    # the caller override whatever was stored with the model
    for k in args:
        if k.endswith(('_dir', '_file')) or k in ('shorthand', 'mode'):
            self.loaded_args[k] = args[k]
def _set_up_model(self, config, use_gpu):
    """Create the pretrained vectors and the trainer described by ``config``.

    Reads ``config['pretrain_path']`` and ``config['model_path']``;
    ``use_gpu`` is forwarded to ``Trainer`` as ``use_cuda``.
    """
    # get pretrained word vectors
    pretrain_path = config['pretrain_path']
    self._pretrain = Pretrain(pretrain_path)

    # set up trainer
    # NOTE(review): ``self.pretrain`` (no underscore) is presumably an
    # accessor for ``_pretrain`` defined outside this view -- confirm.
    trainer_kwargs = {
        'pretrain': self.pretrain,
        'model_file': config['model_path'],
        'use_cuda': use_gpu,
    }
    self._trainer = Trainer(**trainer_kwargs)
def _set_up_model(self, config, use_gpu):
    """Create the optional pretrained vectors, the trainer and the tqdm flag.

    ``config['pretrain_path']`` and ``config['tqdm']`` are optional;
    ``config['model_path']`` is required. ``use_gpu`` is forwarded to
    ``Trainer`` as ``use_cuda``.
    """
    # get pretrained word vectors
    if 'pretrain_path' in config:
        self._pretrain = Pretrain(config['pretrain_path'])
    else:
        self._pretrain = None

    # set up trainer
    # NOTE(review): ``self.pretrain`` (no underscore) is presumably an
    # accessor for ``_pretrain`` defined outside this view -- confirm.
    trainer_kwargs = {
        'pretrain': self.pretrain,
        'model_file': config['model_path'],
        'use_cuda': use_gpu,
    }
    self._trainer = Trainer(**trainer_kwargs)

    # keep the configured value as-is when present, otherwise False
    if 'tqdm' in config:
        self._tqdm = config['tqdm']
    else:
        self._tqdm = False
def _set_up_model(self, config, use_gpu):
    """Create the optional pretrained vectors and a trainer with ``n_pred``.

    ``config['pretrain_path']`` is optional, ``config['n_pred']`` defaults
    to 1 when absent, and ``config['model_path']`` is required. ``use_gpu``
    is forwarded to ``Trainer`` as ``use_cuda``.
    """
    # get pretrained word vectors
    if 'pretrain_path' in config:
        self._pretrain = Pretrain(config['pretrain_path'])
    else:
        self._pretrain = None

    if 'n_pred' in config:
        self._n_pred = config['n_pred']
    else:
        self._n_pred = 1

    # set up trainer
    # NOTE(review): ``self.pretrain`` (no underscore) is presumably an
    # accessor for ``_pretrain`` defined outside this view -- confirm.
    trainer_kwargs = {
        'pretrain': self.pretrain,
        'model_file': config['model_path'],
        'use_cuda': use_gpu,
        'n_pred': self._n_pred,
    }
    self._trainer = Trainer(**trainer_kwargs)
def evaluate(args):
    """Run a saved tagger over ``args['eval_file']`` and write the result.

    Predictions are written in CoNLL form to ``args['output_file']``. When
    ``args['gold_file']`` is not ``None`` the output is scored against it and
    the score is printed.

    Args:
        args: Dict of options; path-like entries, ``shorthand`` and ``mode``
            are copied over the arguments stored with the model.
    """
    # file paths
    system_pred_file = args['output_file']
    gold_file = args['gold_file']
    if args['save_name'] is not None:
        model_file = args['save_dir'] + '/' + args['save_name']
    else:
        model_file = '{}/{}_tagger.pt'.format(args['save_dir'], args['shorthand'])

    # load pretrain; note that we allow the pretrain_file to be non-existent
    pretrain_file = '{}/{}.pretrain.pt'.format(args['save_dir'], args['shorthand'])
    pretrain = Pretrain(pretrain_file)

    # load model
    print("Loading model from: {}".format(model_file))
    use_cuda = args['cuda'] and not args['cpu']
    trainer = Trainer(pretrain=pretrain, model_file=model_file, use_cuda=use_cuda)
    loaded_args, vocab = trainer.args, trainer.vocab

    # load config
    for key in args:
        is_path_option = key.endswith('_dir') or key.endswith('_file')
        if is_path_option or key in ['shorthand'] or key == 'mode':
            loaded_args[key] = args[key]

    # load data
    print("Loading data with batch size {}...".format(args['batch_size']))
    doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    batch = DataLoader(doc, args['batch_size'], loaded_args, pretrain,
                       vocab=vocab, evaluation=True, sort_during_eval=True)

    # predictions stay empty when the dev data does not exist
    preds = []
    if len(batch) > 0:
        print("Start evaluation...")
        for chunk in batch:
            preds.extend(trainer.predict(chunk))
    preds = utils.unsort(preds, batch.data_orig_idx)

    # write to file and score
    flat_preds = [tag for sentence in preds for tag in sentence]
    batch.doc.set([UPOS, XPOS, FEATS], flat_preds)
    CoNLL.dict2conll(batch.doc.to_dict(), system_pred_file)

    if gold_file is None:
        return
    _, _, score = scorer.score(system_pred_file, gold_file)
    print("Tagger score:")
    print("{} {:.2f}".format(args['shorthand'], score * 100))
def _set_up_model(self, config, use_gpu):
    """Load the pretrained vectors and the classifier model from ``config``.

    Reads ``config['pretrain_path']`` and ``config['model_path']``;
    ``config['batch_size']`` is optional and stored as ``None`` when absent.
    The model's ``cuda()`` is called when ``use_gpu`` is truthy.
    """
    # get pretrained word vectors
    pretrain_path = config['pretrain_path']
    self._pretrain = Pretrain(pretrain_path)

    # set up model
    model_path = config['model_path']
    self._model = cnn_classifier.load(filename=model_path,
                                      pretrain=self._pretrain)
    self._batch_size = config.get('batch_size')

    # TODO: move this call to load()
    if not use_gpu:
        return
    self._model.cuda()
def load_pretrain(args):
    """Build the ``Pretrain`` object requested by ``args``, if any.

    Args:
        args: Dict of options. ``args['pretrain']`` switches the feature on;
            ``wordvec_pretrain_file``, ``save_dir``, ``shorthand``,
            ``wordvec_file``, ``wordvec_dir`` and ``pretrain_max_vocab`` are
            read only when it is truthy.

    Returns:
        A ``Pretrain`` instance, or ``None`` when ``args['pretrain']`` is
        falsy.
    """
    if not args['pretrain']:
        return None

    pretrain_file = args['wordvec_pretrain_file']
    if not pretrain_file:
        pretrain_file = '{}/{}.pretrain.pt'.format(args['save_dir'], args['shorthand'])

    # the raw vector file is only looked up when the pretrain file is missing
    vec_file = None
    if not os.path.exists(pretrain_file):
        if args['wordvec_file']:
            vec_file = args['wordvec_file']
        else:
            vec_file = utils.get_wordvec_file(args['wordvec_dir'], args['shorthand'])

    return Pretrain(pretrain_file, vec_file, args['pretrain_max_vocab'])
def _set_up_model(self, config, use_gpu):
    """Create the optional pretrained vectors and a multi-parse trainer.

    ``config['model_path']`` is required. Optional keys and their fallbacks:
    ``pretrain_path`` (no pretrain), ``n_parses`` (3), ``kalm_shuffle``
    (False) and ``automatic_n_parses`` (False). ``use_gpu`` is forwarded to
    ``Trainer`` as ``use_cuda``.
    """
    if 'pretrain_path' in config:
        self._pretrain = Pretrain(config['pretrain_path'])
    else:
        self._pretrain = None

    # optional settings, read in the same order as before
    optional_defaults = (
        ('n_parses', 3),
        ('kalm_shuffle', False),
        ('automatic_n_parses', False),
    )
    extras = {}
    for option, fallback in optional_defaults:
        extras[option] = config[option] if option in config else fallback

    # NOTE(review): ``self.pretrain`` (no underscore) is presumably an
    # accessor for ``_pretrain`` defined outside this view -- confirm.
    pretrain = self.pretrain
    model_path = config['model_path']
    self._trainer = Trainer(pretrain=pretrain, model_file=model_path,
                            use_cuda=use_gpu, **extras)
def _set_up_model(self, config, use_gpu):
    """Load the optional pretrained vectors, charlms and the saved model.

    ``config['model_path']`` is required. ``pretrain_path``,
    ``forward_charlm_path`` and ``backward_charlm_path`` are optional, and
    ``batch_size`` falls back to ``ConstituencyProcessor.DEFAULT_BATCH_SIZE``.
    """
    # get pretrained word vectors
    pretrain_path = config.get('pretrain_path')
    if pretrain_path:
        self._pretrain = Pretrain(pretrain_path)
    else:
        self._pretrain = None

    # set up model
    forward_path = config.get('forward_charlm_path')
    backward_path = config.get('backward_charlm_path')
    model_path = config['model_path']
    forward_charlm = trainer.load_charlm(forward_path)
    backward_charlm = trainer.load_charlm(backward_path)
    self._model = trainer.Trainer.load(filename=model_path,
                                       pt=self._pretrain,
                                       forward_charlm=forward_charlm,
                                       backward_charlm=backward_charlm,
                                       use_gpu=use_gpu)

    # batch size counted as sentences
    default_batch_size = ConstituencyProcessor.DEFAULT_BATCH_SIZE
    self._batch_size = config.get('batch_size', default_batch_size)
def _set_up_model(self, config, use_gpu):
    """Load the optional pretrain, optional charlms and the classifier.

    ``config['model_path']`` is required. ``pretrain_path``,
    ``forward_charlm_path`` and ``backward_charlm_path`` are optional, and
    ``batch_size`` falls back to ``SentimentProcessor.DEFAULT_BATCH_SIZE``.
    The model's ``cuda()`` is called when ``use_gpu`` is truthy.
    """
    def _load_charlm(path_key):
        # a missing or empty path means no character language model
        charlm_path = config.get(path_key, None)
        if not charlm_path:
            return None
        return CharacterLanguageModel.load(charlm_path, finetune=False)

    # get pretrained word vectors
    pretrain_path = config.get('pretrain_path')
    if pretrain_path:
        self._pretrain = Pretrain(pretrain_path)
    else:
        self._pretrain = None

    forward_model = _load_charlm('forward_charlm_path')
    backward_model = _load_charlm('backward_charlm_path')

    # set up model
    self._model = cnn_classifier.load(filename=config['model_path'],
                                      pretrain=self._pretrain,
                                      charmodel_forward=forward_model,
                                      charmodel_backward=backward_model)

    # batch size counted as words
    default_batch_size = SentimentProcessor.DEFAULT_BATCH_SIZE
    self._batch_size = config.get('batch_size', default_batch_size)

    # TODO: move this call to load()
    if use_gpu:
        self._model.cuda()
def load_pretrain(args):
    """Resolve the pretrain file named by ``args`` and build a ``Pretrain``.

    Args:
        args: Object with the attributes ``wordvec_pretrain_file``,
            ``wordvec_type``, ``save_dir``, ``shorthand``,
            ``wordvec_raw_file``, ``wordvec_dir`` and ``pretrain_max_vocab``.

    Returns:
        The constructed ``Pretrain`` object.

    Raises:
        Exception: When neither ``wordvec_pretrain_file`` nor
            ``wordvec_type`` is set.
    """
    pretrain_file = args.wordvec_pretrain_file
    if not pretrain_file:
        if not args.wordvec_type:
            raise Exception(
                "TODO: need to get the wv type back from get_wordvec_file")
        pretrain_file = '{}/{}.{}.pretrain.pt'.format(
            args.save_dir, args.shorthand, args.wordvec_type.name.lower())

    logger.info("Looking for pretrained vectors in {}".format(pretrain_file))

    # the raw vector file is only looked up when the pretrain file is missing
    vec_file = None
    if not os.path.exists(pretrain_file):
        if args.wordvec_raw_file:
            vec_file = args.wordvec_raw_file
        else:
            vec_file = utils.get_wordvec_file(args.wordvec_dir,
                                              args.shorthand,
                                              args.wordvec_type.name.lower())
        logger.info("Pretrain not found. Looking in {}".format(vec_file))

    pretrain = Pretrain(pretrain_file, vec_file, args.pretrain_max_vocab)
    logger.info("Embedding shape: %s" % str(pretrain.emb.shape))
    return pretrain
def train(args):
    """Train an NER tagger and save the model that scores best on dev.

    The loop runs until ``args['max_steps']`` is reached or the learning
    rate drops to ``args['min_lr']``. Every ``args['eval_interval']`` steps
    the dev set is scored and the model is saved when the score is a new
    best.

    Args:
        args: Dict of training options (paths, batch size, step limits,
            learning-rate schedule, charlm settings). When ``args['charlm']``
            is set, ``charlm_forward_file`` and ``charlm_backward_file`` are
            written into it.

    Side effects:
        Calls ``sys.exit(0)`` when the charlm shorthand is missing or when
        the train or dev data is empty.
    """
    utils.ensure_dir(args['save_dir'])
    if args['save_name'] is not None:
        model_file = args['save_dir'] + '/' + args['save_name']
    else:
        model_file = '{}/{}_nertagger.pt'.format(args['save_dir'], args['shorthand'])

    # load pretrained vectors; a truthiness test also copes with None,
    # where the former len() check raised TypeError
    if not args['wordvec_file']:
        vec_file = utils.get_wordvec_file(args['wordvec_dir'], args['shorthand'])
    else:
        vec_file = args['wordvec_file']
    # do not save pretrained embeddings individually
    pretrain = Pretrain(None, vec_file, args['pretrain_max_vocab'],
                        save_to_file=False)

    if args['charlm']:
        if args['charlm_shorthand'] is None:
            logger.info(
                "CharLM Shorthand is required for loading pretrained CharLM model..."
            )
            sys.exit(0)
        logger.info('Use pretrained contextualized char embedding')
        args['charlm_forward_file'] = '{}/{}_forward_charlm.pt'.format(
            args['charlm_save_dir'], args['charlm_shorthand'])
        args['charlm_backward_file'] = '{}/{}_backward_charlm.pt'.format(
            args['charlm_save_dir'], args['charlm_shorthand'])

    # load data; context managers close the files, which used to leak
    logger.info("Loading data with batch size {}...".format(args['batch_size']))
    with open(args['train_file']) as train_in:
        train_doc = Document(json.load(train_in))
    train_batch = DataLoader(train_doc, args['batch_size'], args, pretrain,
                             evaluation=False)
    vocab = train_batch.vocab
    with open(args['eval_file']) as dev_in:
        dev_doc = Document(json.load(dev_in))
    dev_batch = DataLoader(dev_doc, args['batch_size'], args, pretrain,
                           vocab=vocab, evaluation=True)
    dev_gold_tags = dev_batch.tags

    # skip training if the language does not have training or dev data
    if len(train_batch) == 0 or len(dev_batch) == 0:
        logger.info("Skip training because no data available...")
        sys.exit(0)

    logger.info("Training tagger...")
    trainer = Trainer(args=args, vocab=vocab, pretrain=pretrain,
                      use_cuda=args['cuda'])
    logger.info(trainer.model)

    global_step = 0
    max_steps = args['max_steps']
    dev_score_history = []
    current_lr = trainer.optimizer.param_groups[0]['lr']
    format_str = '{}: step {}/{}, loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'

    # LR scheduling
    if args['lr_decay'] > 0:
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            trainer.optimizer, mode='max', factor=args['lr_decay'],
            patience=args['patience'], verbose=True, min_lr=args['min_lr'])
    else:
        scheduler = None

    # start training
    train_loss = 0
    while True:
        should_stop = False
        for batch in train_batch:
            start_time = time.time()
            global_step += 1
            loss = trainer.update(batch, eval=False)  # update step
            train_loss += loss
            if global_step % args['log_step'] == 0:
                duration = time.time() - start_time
                logger.info(format_str.format(
                    datetime.now().strftime("%Y-%m-%d %H:%M:%S"), global_step,
                    max_steps, loss, duration, current_lr))

            if global_step % args['eval_interval'] == 0:
                # eval on dev
                logger.info("Evaluating on dev set...")
                dev_preds = []
                for dev_chunk in dev_batch:
                    dev_preds += trainer.predict(dev_chunk)
                _, _, dev_score = scorer.score_by_entity(dev_preds, dev_gold_tags)

                train_loss = train_loss / args['eval_interval']  # avg loss per batch
                logger.info(
                    "step {}: train_loss = {:.6f}, dev_score = {:.4f}".format(
                        global_step, train_loss, dev_score))
                train_loss = 0

                # save best model
                if not dev_score_history or dev_score > max(dev_score_history):
                    trainer.save(model_file)
                    logger.info("New best model saved.")
                dev_score_history.append(dev_score)
                logger.info("")

                # lr schedule
                if scheduler is not None:
                    scheduler.step(dev_score)

            # check stopping
            current_lr = trainer.optimizer.param_groups[0]['lr']
            if global_step >= args['max_steps'] or current_lr <= args['min_lr']:
                should_stop = True
                break

        if should_stop:
            break

        train_batch.reshuffle()

    logger.info("Training ended with {} steps.".format(global_step))

    # Training can stop before the first evaluation (for example when
    # max_steps < eval_interval); max() on an empty history raised ValueError.
    if dev_score_history:
        best_f = max(dev_score_history) * 100
        best_eval = np.argmax(dev_score_history) + 1
        logger.info("Best dev F1 = {:.2f}, at iteration = {}".format(
            best_f, best_eval * args['eval_interval']))
def train(args):
    """Train a dependency parser and save the model that scores best on dev.

    Every ``args['eval_interval']`` steps the dev set is parsed, written to
    ``args['output_file']`` and scored against ``args['gold_file']``. After
    ``args['max_steps_before_stop']`` steps without a new best score the
    optimizer is replaced once by Adam with AMSGrad; the second time the
    limit is hit, training stops. Training also stops at
    ``args['max_steps']``.

    Args:
        args: Dict of training options (paths, batch size, step limits,
            optimizer settings).

    Side effects:
        Calls ``sys.exit(0)`` when the train or dev data is empty.
    """
    utils.ensure_dir(args['save_dir'])
    if args['save_name'] is not None:
        model_file = args['save_dir'] + '/' + args['save_name']
    else:
        model_file = '{}/{}_parser.pt'.format(args['save_dir'], args['shorthand'])

    # load pretrained vectors if needed
    pretrain = None
    if args['pretrain']:
        if args['wordvec_file']:
            vec_file = args['wordvec_file']
        else:
            vec_file = utils.get_wordvec_file(args['wordvec_dir'], args['shorthand'])
        pretrain_file = '{}/{}.pretrain.pt'.format(args['save_dir'], args['shorthand'])
        pretrain = Pretrain(pretrain_file, vec_file, args['pretrain_max_vocab'])

    # load data
    print("Loading data with batch size {}...".format(args['batch_size']))
    train_doc = Document(CoNLL.conll2dict(input_file=args['train_file']))
    train_batch = DataLoader(train_doc, args['batch_size'], args, pretrain,
                             evaluation=False)
    vocab = train_batch.vocab
    dev_doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    dev_batch = DataLoader(dev_doc, args['batch_size'], args, pretrain,
                           vocab=vocab, evaluation=True, sort_during_eval=True)

    # pred and gold path
    system_pred_file = args['output_file']
    gold_file = args['gold_file']

    # skip training if the language does not have training or dev data
    if len(train_batch) == 0 or len(dev_batch) == 0:
        print("Skip training because no data available...")
        sys.exit(0)

    print("Training parser...")
    trainer = Trainer(args=args, vocab=vocab, pretrain=pretrain,
                      use_cuda=args['cuda'])

    global_step = 0
    max_steps = args['max_steps']
    dev_score_history = []
    current_lr = args['lr']
    format_str = '{}: step {}/{}, loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'

    using_amsgrad = False
    last_best_step = 0
    # start training
    train_loss = 0
    while True:
        do_break = False
        for batch in train_batch:
            start_time = time.time()
            global_step += 1
            loss = trainer.update(batch, eval=False)  # update step
            train_loss += loss
            if global_step % args['log_step'] == 0:
                duration = time.time() - start_time
                print(format_str.format(
                    datetime.now().strftime("%Y-%m-%d %H:%M:%S"), global_step,
                    max_steps, loss, duration, current_lr))

            if global_step % args['eval_interval'] == 0:
                # eval on dev
                print("Evaluating on dev set...")
                dev_preds = []
                for dev_chunk in dev_batch:
                    dev_preds += trainer.predict(dev_chunk)
                dev_preds = utils.unsort(dev_preds, dev_batch.data_orig_idx)

                dev_batch.doc.set([HEAD, DEPREL],
                                  [y for x in dev_preds for y in x])
                CoNLL.dict2conll(dev_batch.doc.to_dict(), system_pred_file)
                _, _, dev_score = scorer.score(system_pred_file, gold_file)

                train_loss = train_loss / args['eval_interval']  # avg loss per batch
                print("step {}: train_loss = {:.6f}, dev_score = {:.4f}".format(
                    global_step, train_loss, dev_score))
                train_loss = 0

                # save best model
                if not dev_score_history or dev_score > max(dev_score_history):
                    last_best_step = global_step
                    trainer.save(model_file)
                    print("new best model saved.")
                dev_score_history.append(dev_score)
                print("")

            if global_step - last_best_step >= args['max_steps_before_stop']:
                if not using_amsgrad:
                    # first stall: restart the patience window with AMSGrad
                    print("Switching to AMSGrad")
                    last_best_step = global_step
                    using_amsgrad = True
                    trainer.optimizer = optim.Adam(
                        trainer.model.parameters(), amsgrad=True,
                        lr=args['lr'], betas=(.9, args['beta2']), eps=1e-6)
                else:
                    do_break = True
                    break

            if global_step >= args['max_steps']:
                do_break = True
                break

        if do_break:
            break

        train_batch.reshuffle()

    print("Training ended with {} steps.".format(global_step))

    # Training can stop before the first evaluation (for example when
    # max_steps < eval_interval); max() on an empty history raised ValueError.
    if dev_score_history:
        best_f = max(dev_score_history) * 100
        best_eval = np.argmax(dev_score_history) + 1
        print("Best dev F1 = {:.2f}, at iteration = {}".format(
            best_f, best_eval * args['eval_interval']))
def _set_up_model(self, config, use_gpu):
    """Create the optional pretrained vectors and the trainer from ``config``.

    ``config['pretrain_path']`` is optional; ``config['model_path']`` is
    required. ``use_gpu`` is forwarded to ``Trainer`` as ``use_cuda``.
    """
    has_pretrain = 'pretrain_path' in config
    self._pretrain = Pretrain(config['pretrain_path']) if has_pretrain else None

    # NOTE(review): ``self.pretrain`` (no underscore) is presumably an
    # accessor for ``_pretrain`` defined outside this view -- confirm.
    trainer_kwargs = dict(pretrain=self.pretrain,
                          model_file=config['model_path'],
                          use_cuda=use_gpu)
    self._trainer = Trainer(**trainer_kwargs)
def train(args):
    """Train or finetune an NER tagger and save the best model on dev.

    With ``args['finetune']`` set, an existing model is loaded from
    ``args['finetune_load_name']`` or, failing that, from the default model
    path. Otherwise pretrained vectors are loaded and a new ``Trainer`` is
    built. The loop runs until ``args['max_steps']`` is reached or the
    learning rate drops to ``args['min_lr']``.

    Args:
        args: Dict of training options. ``word_emb_dim`` and, when
            ``args['charlm']`` is set, the charlm file entries may be
            rewritten here.

    Raises:
        FileNotFoundError: Finetuning was requested but no model file exists.
        ValueError: ``args['charlm']`` is set without ``charlm_shorthand``.

    Side effects:
        Calls ``sys.exit(0)`` when the train or dev data is empty.
    """
    # NOTE(review): the source reached review with its indentation collapsed;
    # the nesting of the charlm section and of the ``word_emb_dim`` assignment
    # below was reconstructed -- confirm against the original file.
    utils.ensure_dir(args['save_dir'])
    if args['save_name'] is not None:
        model_file = os.path.join(args['save_dir'], args['save_name'])
    else:
        model_file = '{}/{}_nertagger.pt'.format(args['save_dir'], args['shorthand'])

    pretrain = None
    vocab = None
    trainer = None

    if args['finetune'] and args['finetune_load_name']:
        logger.warning('Finetune is ON. Using model from "{}"'.format(
            args['finetune_load_name']))
        _, trainer, vocab = load_model(args, args['finetune_load_name'])
    elif args['finetune'] and os.path.exists(model_file):
        logger.warning('Finetune is ON. Using model from "{}"'.format(model_file))
        _, trainer, vocab = load_model(args, model_file)
    else:
        if args['finetune']:
            raise FileNotFoundError(
                'Finetune is set to true but model file is not found: {}'.format(model_file))

        # load pretrained vectors
        if args['wordvec_pretrain_file']:
            pretrain_file = args['wordvec_pretrain_file']
            pretrain = Pretrain(pretrain_file, None, args['pretrain_max_vocab'],
                                save_to_file=False)
        else:
            # a truthiness test also copes with None, where the former
            # len() check raised TypeError
            if not args['wordvec_file']:
                vec_file = utils.get_wordvec_file(args['wordvec_dir'],
                                                  args['shorthand'])
            else:
                vec_file = args['wordvec_file']
            # do not save pretrained embeddings individually
            pretrain = Pretrain(None, vec_file, args['pretrain_max_vocab'],
                                save_to_file=False)

        if pretrain is not None:
            word_emb_dim = pretrain.emb.shape[1]
            if args['word_emb_dim'] and args['word_emb_dim'] != word_emb_dim:
                logger.warning("Embedding file has a dimension of {}. Model will be built with that size instead of {}".format(word_emb_dim, args['word_emb_dim']))
            args['word_emb_dim'] = word_emb_dim

        if args['charlm']:
            if args['charlm_shorthand'] is None:
                raise ValueError("CharLM Shorthand is required for loading pretrained CharLM model...")
            logger.info('Using pretrained contextualized char embedding')
            if not args['charlm_forward_file']:
                args['charlm_forward_file'] = '{}/{}_forward_charlm.pt'.format(
                    args['charlm_save_dir'], args['charlm_shorthand'])
            if not args['charlm_backward_file']:
                args['charlm_backward_file'] = '{}/{}_backward_charlm.pt'.format(
                    args['charlm_save_dir'], args['charlm_shorthand'])

    # load data; context managers close the files, which used to leak
    logger.info("Loading data with batch size {}...".format(args['batch_size']))
    with open(args['train_file']) as train_in:
        train_doc = Document(json.load(train_in))
    train_batch = DataLoader(train_doc, args['batch_size'], args, pretrain,
                             vocab=vocab, evaluation=False)
    vocab = train_batch.vocab
    with open(args['eval_file']) as dev_in:
        dev_doc = Document(json.load(dev_in))
    dev_batch = DataLoader(dev_doc, args['batch_size'], args, pretrain,
                           vocab=vocab, evaluation=True)
    dev_gold_tags = dev_batch.tags

    if args['finetune']:
        utils.warn_missing_tags(list(trainer.vocab['tag']), train_batch.tags,
                                "training set")
    utils.warn_missing_tags(train_batch.tags, dev_batch.tags, "dev set")

    # skip training if the language does not have training or dev data
    if len(train_batch) == 0 or len(dev_batch) == 0:
        logger.info("Skip training because no data available...")
        sys.exit(0)

    logger.info("Training tagger...")
    if trainer is None:  # init if model was not loaded previously from file
        trainer = Trainer(args=args, vocab=vocab, pretrain=pretrain,
                          use_cuda=args['cuda'],
                          train_classifier_only=args['train_classifier_only'])
    logger.info(trainer.model)

    global_step = 0
    max_steps = args['max_steps']
    dev_score_history = []
    current_lr = trainer.optimizer.param_groups[0]['lr']
    format_str = '{}: step {}/{}, loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'

    # LR scheduling
    if args['lr_decay'] > 0:
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            trainer.optimizer, mode='max', factor=args['lr_decay'],
            patience=args['patience'], verbose=True, min_lr=args['min_lr'])
    else:
        scheduler = None

    # start training
    train_loss = 0
    while True:
        should_stop = False
        for batch in train_batch:
            start_time = time.time()
            global_step += 1
            loss = trainer.update(batch, eval=False)  # update step
            train_loss += loss
            if global_step % args['log_step'] == 0:
                duration = time.time() - start_time
                logger.info(format_str.format(
                    datetime.now().strftime("%Y-%m-%d %H:%M:%S"), global_step,
                    max_steps, loss, duration, current_lr))

            if global_step % args['eval_interval'] == 0:
                # eval on dev
                logger.info("Evaluating on dev set...")
                dev_preds = []
                for dev_chunk in dev_batch:
                    dev_preds += trainer.predict(dev_chunk)
                _, _, dev_score = scorer.score_by_entity(dev_preds, dev_gold_tags)

                train_loss = train_loss / args['eval_interval']  # avg loss per batch
                logger.info("step {}: train_loss = {:.6f}, dev_score = {:.4f}".format(
                    global_step, train_loss, dev_score))
                train_loss = 0

                # save best model
                if not dev_score_history or dev_score > max(dev_score_history):
                    trainer.save(model_file)
                    logger.info("New best model saved.")
                dev_score_history.append(dev_score)
                logger.info("")

                # lr schedule
                if scheduler is not None:
                    scheduler.step(dev_score)

            # check stopping
            current_lr = trainer.optimizer.param_groups[0]['lr']
            if global_step >= args['max_steps'] or current_lr <= args['min_lr']:
                should_stop = True
                break

        if should_stop:
            break

        train_batch.reshuffle()

    logger.info("Training ended with {} steps.".format(global_step))

    if dev_score_history:
        best_f = max(dev_score_history) * 100
        best_eval = np.argmax(dev_score_history) + 1
        logger.info("Best dev F1 = {:.2f}, at iteration = {}".format(
            best_f, best_eval * args['eval_interval']))
def _set_up_model(self, config, use_gpu):
    """Store the pretagged flag and build the pretrain and the trainer.

    ``config['pretagged']`` is optional and stored as ``None`` when absent;
    ``config['pretrain_path']`` and ``config['model_path']`` are required.
    ``use_gpu`` is forwarded to ``Trainer`` as ``use_cuda``.
    """
    self._pretagged = config.get('pretagged', None)

    pretrain_path = config['pretrain_path']
    self._pretrain = Pretrain(pretrain_path)

    # NOTE(review): ``self.pretrain`` (no underscore) is presumably an
    # accessor for ``_pretrain`` defined outside this view -- confirm.
    trainer_kwargs = {
        'pretrain': self.pretrain,
        'model_file': config['model_path'],
        'use_cuda': use_gpu,
    }
    self._trainer = Trainer(**trainer_kwargs)