def train(args, model_save_file, model_load_file, model_save_latest_file, retag_pipeline):
    """
    Build a model, train it using the requested train & dev files
    """
    print_args(args)
    utils.ensure_dir(args['save_dir'])

    # Read both treebanks up front so any file problem surfaces immediately
    training_set = read_treebank(args['train_file'])
    logger.info("Read %d trees for the training set", len(training_set))
    development_set = read_treebank(args['eval_file'])
    logger.info("Read %d trees for the dev set", len(development_set))

    # Optionally replace the treebank tags with tags predicted by another pipeline
    if retag_pipeline is not None:
        logger.info("Retagging trees using the %s tags from the %s package...", args['retag_method'], args['retag_package'])
        training_set = retag_trees(training_set, retag_pipeline, args['retag_xpos'])
        development_set = retag_trees(development_set, retag_pipeline, args['retag_xpos'])
        logger.info("Retagging finished")

    # Embeddings: the two character LMs plus the static word vectors
    forward_charlm = load_charlm(args['charlm_forward_file'])
    backward_charlm = load_charlm(args['charlm_backward_file'])
    pretrain_emb = load_pretrain(args)

    trainer, sequences, transitions = build_trainer(args, training_set, development_set, pretrain_emb, forward_charlm, backward_charlm)
    iterate_training(trainer, training_set, sequences, transitions, development_set, args, model_save_file, model_save_latest_file)
def run_dev_set(model, dev_trees, args):
    """
    This reparses a treebank and executes the CoreNLP Java EvalB code.

    It only works if CoreNLP 4.3.0 or higher is in the classpath.
    """
    logger.info("Processing %d trees from %s", len(dev_trees), args['eval_file'])
    model.eval()

    # Reparse the whole dev set in batches; tqdm provides a progress bar
    treebank = parse_sentences(iter(tqdm(dev_trees)), build_batch_from_trees, args['eval_batch_size'], model)
    if len(treebank) < len(dev_trees):
        logger.warning("Only evaluating %d trees instead of %d", len(treebank), len(dev_trees))

    if args['mode'] == 'predict' and args['predict_file']:
        utils.ensure_dir(args['predict_dir'], verbose=False)
        pred_file = os.path.join(args['predict_dir'], args['predict_file'] + ".pred.mrg")
        orig_file = os.path.join(args['predict_dir'], args['predict_file'] + ".orig.mrg")
        # Refuse to clobber existing output files rather than silently overwriting
        if os.path.exists(pred_file):
            logger.warning("Cowardly refusing to overwrite {}".format(pred_file))
        elif os.path.exists(orig_file):
            logger.warning("Cowardly refusing to overwrite {}".format(orig_file))
        else:
            with open(pred_file, 'w') as predicted_out:
                for tree in treebank:
                    predicted_out.write(str(tree[1][0][0]))
                    predicted_out.write("\n")
            with open(orig_file, 'w') as original_out:
                for tree in treebank:
                    original_out.write(str(tree[0]))
                    original_out.write("\n")

    # Hand the (gold, predicted) pairs to the CoreNLP evaluator and report F1
    with EvaluateParser() as evaluator:
        return evaluator.process(treebank).f1
def main(args=None):
    """Entry point: parse options, seed RNGs, then train or evaluate the tokenizer."""
    args = parse_args(args=args)
    if args.cpu:
        args.cuda = False
    utils.set_random_seed(args.seed, args.cuda)

    args = vars(args)
    logger.info("Running tokenizer in {} mode".format(args['mode']))

    # Feature extractors used by the tokenizer; feat_dim must match their count
    args['feat_funcs'] = ['space_before', 'capitalized', 'all_caps', 'numeric']
    args['feat_dim'] = len(args['feat_funcs'])

    # Default the save name from the treebank shorthand when none was given
    if args['save_name'] is not None:
        args['save_name'] = "{}/{}".format(args['save_dir'], args['save_name'])
    else:
        args['save_name'] = '{}/{}_tokenizer.pt'.format(args['save_dir'], args['shorthand'])
    utils.ensure_dir(args['save_dir'])

    if args['mode'] == 'train':
        train(args)
    else:
        evaluate(args)
def main():
    """Entry point: parse options, seed all RNGs, then train or evaluate the charlm."""
    args = parse_args()

    # Seed every source of randomness for reproducibility
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)
    if args.cpu:
        args.cuda = False
    elif args.cuda:
        torch.cuda.manual_seed(args.seed)

    args = vars(args)
    logger.info("Running {} character-level language model in {} mode".format(args['direction'], args['mode']))
    utils.ensure_dir(args['save_dir'])

    # Dispatch on the requested mode
    action = train if args['mode'] == 'train' else evaluate
    action(args)
def main():
    """Entry point: parse options, seed RNGs, then train or evaluate the tokenizer."""
    args = parse_args()

    # Seed every source of randomness for reproducibility
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)
    if args.cpu:
        args.cuda = False
    elif args.cuda:
        torch.cuda.manual_seed(args.seed)

    args = vars(args)
    print("Running tokenizer in {} mode".format(args['mode']))

    # Feature extractors used by the tokenizer; feat_dim must match their count
    args['feat_funcs'] = ['space_before', 'capitalized', 'all_caps', 'numeric']
    args['feat_dim'] = len(args['feat_funcs'])

    # Default the save name from the treebank shorthand when none was given
    if args['save_name'] is not None:
        args['save_name'] = "{}/{}".format(args['save_dir'], args['save_name'])
    else:
        args['save_name'] = '{}/{}_tokenizer.pt'.format(args['save_dir'], args['shorthand'])
    utils.ensure_dir(args['save_dir'])

    if args['mode'] == 'train':
        train(args)
    else:
        evaluate(args)
def main(args=None):
    """Entry point: parse options, seed RNGs, then train or evaluate the tokenizer."""
    args = parse_args(args=args)
    if args.cpu:
        args.cuda = False
    utils.set_random_seed(args.seed, args.cuda)

    args = vars(args)
    logger.info("Running tokenizer in {} mode".format(args['mode']))

    # Feature extractors used by the tokenizer; feat_dim must match their count
    args['feat_funcs'] = ['space_before', 'capitalized', 'numeric', 'end_of_para', 'start_of_para']
    args['feat_dim'] = len(args['feat_funcs'])

    # Default the save name from the treebank shorthand when none was given
    if args['save_name']:
        save_name = args['save_name']
    else:
        save_name = '{}_tokenizer.pt'.format(args['shorthand'])
    args['save_name'] = os.path.join(args['save_dir'], save_name)
    utils.ensure_dir(args['save_dir'])

    if args['mode'] == 'train':
        train(args)
    else:
        evaluate(args)
def train(args):
    """Train an MWT expander: always a dictionary-based one, and unless
    dict_only is set, a seq2seq model on top of it.

    Saves the best model (by dev F1) to save_dir/save_name and logs dev scores.
    Returns early (no model saved) when either train or dev data is empty.
    """
    # load data
    logger.debug('max_dec_len: %d' % args['max_dec_len'])
    logger.debug("Loading data with batch size {}...".format(args['batch_size']))
    train_doc = CoNLL.conll2doc(input_file=args['train_file'])
    train_batch = DataLoader(train_doc, args['batch_size'], args, evaluation=False)
    vocab = train_batch.vocab
    args['vocab_size'] = vocab.size
    dev_doc = CoNLL.conll2doc(input_file=args['eval_file'])
    dev_batch = DataLoader(dev_doc, args['batch_size'], args, vocab=vocab, evaluation=True)

    utils.ensure_dir(args['save_dir'])
    save_name = args['save_name'] if args['save_name'] else '{}_mwt_expander.pt'.format(args['shorthand'])
    model_file = os.path.join(args['save_dir'], save_name)

    # pred and gold path
    system_pred_file = args['output_file']
    gold_file = args['gold_file']

    # skip training if the language does not have training or dev data
    if len(train_batch) == 0 or len(dev_batch) == 0:
        logger.warning("Skip training because no data available...")
        return

    # train a dictionary-based MWT expander
    trainer = Trainer(args=args, vocab=vocab, use_cuda=args['cuda'])
    logger.info("Training dictionary-based MWT expander...")
    trainer.train_dict(train_batch.doc.get_mwt_expansions(evaluation=False))
    logger.info("Evaluating on dev set...")
    dev_preds = trainer.predict_dict(dev_batch.doc.get_mwt_expansions(evaluation=True))
    doc = copy.deepcopy(dev_batch.doc)
    doc.set_mwt_expansions(dev_preds)
    CoNLL.write_doc2conll(doc, system_pred_file)
    _, _, dev_f = scorer.score(system_pred_file, gold_file)
    logger.info("Dev F1 = {:.2f}".format(dev_f * 100))

    if args.get('dict_only', False):
        # save dictionaries
        trainer.save(model_file)
    else:
        # train a seq2seq model
        logger.info("Training seq2seq-based MWT expander...")
        global_step = 0
        max_steps = len(train_batch) * args['num_epoch']
        dev_score_history = []
        best_dev_preds = []
        current_lr = args['lr']
        global_start_time = time.time()
        format_str = '{}: step {}/{} (epoch {}/{}), loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'

        # start training
        for epoch in range(1, args['num_epoch'] + 1):
            train_loss = 0
            for i, batch in enumerate(train_batch):
                start_time = time.time()
                global_step += 1
                loss = trainer.update(batch, eval=False)  # update step
                train_loss += loss
                if global_step % args['log_step'] == 0:
                    duration = time.time() - start_time
                    logger.info(format_str.format(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), global_step,\
                            max_steps, epoch, args['num_epoch'], loss, duration, current_lr))

            # eval on dev
            logger.info("Evaluating on dev set...")
            dev_preds = []
            for i, batch in enumerate(dev_batch):
                preds = trainer.predict(batch)
                dev_preds += preds
            if args.get('ensemble_dict', False) and args.get('ensemble_early_stop', False):
                logger.info("[Ensembling dict with seq2seq model...]")
                dev_preds = trainer.ensemble(dev_batch.doc.get_mwt_expansions(evaluation=True), dev_preds)
            doc = copy.deepcopy(dev_batch.doc)
            doc.set_mwt_expansions(dev_preds)
            CoNLL.write_doc2conll(doc, system_pred_file)
            _, _, dev_score = scorer.score(system_pred_file, gold_file)

            train_loss = train_loss / train_batch.num_examples * args['batch_size']  # avg loss per batch
            logger.info("epoch {}: train_loss = {:.6f}, dev_score = {:.4f}".format(epoch, train_loss, dev_score))

            # save best model
            if epoch == 1 or dev_score > max(dev_score_history):
                trainer.save(model_file)
                logger.info("new best model saved.")
                best_dev_preds = dev_preds

            # lr schedule: decay when dev score fails to improve over the last epoch
            if epoch > args['decay_epoch'] and dev_score <= dev_score_history[-1]:
                current_lr *= args['lr_decay']
                trainer.change_lr(current_lr)

            dev_score_history += [dev_score]

        logger.info("Training ended with {} epochs.".format(epoch))

        best_f, best_epoch = max(dev_score_history) * 100, np.argmax(dev_score_history) + 1
        logger.info("Best dev F1 = {:.2f}, at epoch = {}".format(best_f, best_epoch))

        # try ensembling with dict if necessary
        if args.get('ensemble_dict', False):
            logger.info("[Ensembling dict with seq2seq model...]")
            dev_preds = trainer.ensemble(dev_batch.doc.get_mwt_expansions(evaluation=True), best_dev_preds)
            doc = copy.deepcopy(dev_batch.doc)
            doc.set_mwt_expansions(dev_preds)
            CoNLL.write_doc2conll(doc, system_pred_file)
            _, _, dev_score = scorer.score(system_pred_file, gold_file)
            logger.info("Ensemble dev F1 = {:.2f}".format(dev_score * 100))
            # BUGFIX: best_f is a percentage (max(history) * 100) while dev_score
            # is a 0-1 fraction; scale dev_score before comparing so the units match.
            best_f = max(best_f, dev_score * 100)
def train(args):
    """Train an NER tagger.

    Loads pretrained word vectors (and optionally charlm paths), builds
    train/dev DataLoaders from JSON documents, then runs a step-based training
    loop with periodic dev evaluation, keeping the best model (by entity F1)
    at model_file.
    """
    utils.ensure_dir(args['save_dir'])
    model_file = args['save_dir'] + '/' + args['save_name'] if args['save_name'] is not None \
        else '{}/{}_nertagger.pt'.format(args['save_dir'], args['shorthand'])

    # load pretrained vectors
    if len(args['wordvec_file']) == 0:
        vec_file = utils.get_wordvec_file(args['wordvec_dir'], args['shorthand'])
    else:
        vec_file = args['wordvec_file']
    # do not save pretrained embeddings individually
    pretrain = Pretrain(None, vec_file, args['pretrain_max_vocab'], save_to_file=False)

    if args['charlm']:
        if args['charlm_shorthand'] is None:
            # NOTE(review): this is a configuration error but logs at INFO level and
            # exits with status 0 — confirm whether an error exit/raise was intended
            logger.info("CharLM Shorthand is required for loading pretrained CharLM model...")
            sys.exit(0)
        logger.info('Use pretrained contextualized char embedding')
        args['charlm_forward_file'] = '{}/{}_forward_charlm.pt'.format(args['charlm_save_dir'], args['charlm_shorthand'])
        args['charlm_backward_file'] = '{}/{}_backward_charlm.pt'.format(args['charlm_save_dir'], args['charlm_shorthand'])

    # load data
    logger.info("Loading data with batch size {}...".format(args['batch_size']))
    train_doc = Document(json.load(open(args['train_file'])))
    train_batch = DataLoader(train_doc, args['batch_size'], args, pretrain, evaluation=False)
    vocab = train_batch.vocab
    # dev loader reuses the training vocab so indices stay consistent
    dev_doc = Document(json.load(open(args['eval_file'])))
    dev_batch = DataLoader(dev_doc, args['batch_size'], args, pretrain, vocab=vocab, evaluation=True)
    dev_gold_tags = dev_batch.tags

    # skip training if the language does not have training or dev data
    if len(train_batch) == 0 or len(dev_batch) == 0:
        logger.info("Skip training because no data available...")
        sys.exit(0)

    logger.info("Training tagger...")
    trainer = Trainer(args=args, vocab=vocab, pretrain=pretrain, use_cuda=args['cuda'])
    logger.info(trainer.model)

    # bookkeeping for the step-based training loop
    global_step = 0
    max_steps = args['max_steps']
    dev_score_history = []
    best_dev_preds = []
    current_lr = trainer.optimizer.param_groups[0]['lr']
    global_start_time = time.time()
    format_str = '{}: step {}/{}, loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'

    # LR scheduling: reduce LR when dev score (mode='max') plateaus
    if args['lr_decay'] > 0:
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(trainer.optimizer, mode='max', factor=args['lr_decay'], \
            patience=args['patience'], verbose=True, min_lr=args['min_lr'])
    else:
        scheduler = None

    # start training
    train_loss = 0
    while True:
        should_stop = False
        for i, batch in enumerate(train_batch):
            start_time = time.time()
            global_step += 1
            loss = trainer.update(batch, eval=False)  # update step
            train_loss += loss
            if global_step % args['log_step'] == 0:
                duration = time.time() - start_time
                logger.info(format_str.format(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), global_step,\
                        max_steps, loss, duration, current_lr))

            if global_step % args['eval_interval'] == 0:
                # eval on dev
                logger.info("Evaluating on dev set...")
                dev_preds = []
                for batch in dev_batch:
                    preds = trainer.predict(batch)
                    dev_preds += preds
                _, _, dev_score = scorer.score_by_entity(dev_preds, dev_gold_tags)

                train_loss = train_loss / args['eval_interval']  # avg loss per batch
                logger.info("step {}: train_loss = {:.6f}, dev_score = {:.4f}".format(global_step, train_loss, dev_score))
                train_loss = 0

                # save best model
                if len(dev_score_history) == 0 or dev_score > max(dev_score_history):
                    trainer.save(model_file)
                    logger.info("New best model saved.")
                    best_dev_preds = dev_preds

                dev_score_history += [dev_score]
                logger.info("")

                # lr schedule
                if scheduler is not None:
                    scheduler.step(dev_score)

            # check stopping: step budget exhausted or LR decayed to the floor
            current_lr = trainer.optimizer.param_groups[0]['lr']
            if global_step >= args['max_steps'] or current_lr <= args['min_lr']:
                should_stop = True
                break

        if should_stop:
            break

        train_batch.reshuffle()

    logger.info("Training ended with {} steps.".format(global_step))
    # NOTE(review): raises ValueError if training stopped before the first eval
    # (empty dev_score_history) — presumably max_steps >= eval_interval; confirm
    best_f, best_eval = max(dev_score_history) * 100, np.argmax(dev_score_history) + 1
    logger.info("Best dev F1 = {:.2f}, at iteration = {}".format(best_f, best_eval * args['eval_interval']))
def train(args):
    """Train a lemmatizer: always a dictionary-based one, and unless dict_only
    is set, a seq2seq model on top of it.

    Saves the best model (by dev F1 on the scorer output) to
    model_dir/{lang}_lemmatizer.pt.
    """
    # load data
    print("[Loading data with batch size {}...]".format(args['batch_size']))
    train_doc = Document(CoNLL.conll2dict(input_file=args['train_file']))
    train_batch = DataLoader(train_doc, args['batch_size'], args, evaluation=False)
    vocab = train_batch.vocab
    args['vocab_size'] = vocab['char'].size
    args['pos_vocab_size'] = vocab['pos'].size
    # dev loader reuses the training vocab so indices stay consistent
    dev_doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    dev_batch = DataLoader(dev_doc, args['batch_size'], args, vocab=vocab, evaluation=True)

    utils.ensure_dir(args['model_dir'])
    model_file = '{}/{}_lemmatizer.pt'.format(args['model_dir'], args['lang'])

    # pred and gold path
    system_pred_file = args['output_file']
    gold_file = args['gold_file']

    utils.print_config(args)

    # skip training if the language does not have training or dev data
    if len(train_batch) == 0 or len(dev_batch) == 0:
        print("[Skip training because no data available...]")
        sys.exit(0)

    # start training
    # train a dictionary-based lemmatizer
    trainer = Trainer(args=args, vocab=vocab, use_cuda=args['cuda'])
    print("[Training dictionary-based lemmatizer...]")
    trainer.train_dict(train_batch.doc.get([TEXT, UPOS, LEMMA]))
    print("Evaluating on dev set...")
    dev_preds = trainer.predict_dict(dev_batch.doc.get([TEXT, UPOS]))
    dev_batch.doc.set([LEMMA], dev_preds)
    CoNLL.dict2conll(dev_batch.doc.to_dict(), system_pred_file)
    _, _, dev_f = scorer.score(system_pred_file, gold_file)
    print("Dev F1 = {:.2f}".format(dev_f * 100))

    if args.get('dict_only', False):
        # save dictionaries
        trainer.save(model_file)
    else:
        # train a seq2seq model
        print("[Training seq2seq-based lemmatizer...]")
        global_step = 0
        max_steps = len(train_batch) * args['num_epoch']
        dev_score_history = []
        best_dev_preds = []
        current_lr = args['lr']
        global_start_time = time.time()
        format_str = '{}: step {}/{} (epoch {}/{}), loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'

        # start training
        for epoch in range(1, args['num_epoch'] + 1):
            train_loss = 0
            for i, batch in enumerate(train_batch):
                start_time = time.time()
                global_step += 1
                loss = trainer.update(batch, eval=False)  # update step
                train_loss += loss
                if global_step % args['log_step'] == 0:
                    duration = time.time() - start_time
                    print(format_str.format(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), global_step,\
                            max_steps, epoch, args['num_epoch'], loss, duration, current_lr))

            # eval on dev
            print("Evaluating on dev set...")
            dev_preds = []
            dev_edits = []
            for i, batch in enumerate(dev_batch):
                preds, edits = trainer.predict(batch, args['beam_size'])
                dev_preds += preds
                if edits is not None:
                    dev_edits += edits
            dev_preds = trainer.postprocess(dev_batch.doc.get([TEXT]), dev_preds, edits=dev_edits)

            # try ensembling with dict if necessary
            if args.get('ensemble_dict', False):
                print("[Ensembling dict with seq2seq model...]")
                dev_preds = trainer.ensemble(dev_batch.doc.get([TEXT, UPOS]), dev_preds)
            dev_batch.doc.set([LEMMA], dev_preds)
            CoNLL.dict2conll(dev_batch.doc.to_dict(), system_pred_file)
            _, _, dev_score = scorer.score(system_pred_file, gold_file)

            train_loss = train_loss / train_batch.num_examples * args['batch_size']  # avg loss per batch
            print("epoch {}: train_loss = {:.6f}, dev_score = {:.4f}".format(epoch, train_loss, dev_score))

            # save best model
            if epoch == 1 or dev_score > max(dev_score_history):
                trainer.save(model_file)
                print("new best model saved.")
                best_dev_preds = dev_preds

            # lr schedule: decay only for optimizers without adaptive step sizes
            if epoch > args['decay_epoch'] and dev_score <= dev_score_history[-1] and \
                args['optim'] in ['sgd', 'adagrad']:
                current_lr *= args['lr_decay']
                trainer.update_lr(current_lr)

            dev_score_history += [dev_score]
            print("")

        print("Training ended with {} epochs.".format(epoch))

        best_f, best_epoch = max(dev_score_history) * 100, np.argmax(dev_score_history) + 1
        print("Best dev F1 = {:.2f}, at epoch = {}".format(best_f, best_epoch))
def train(args):
    """Train a dependency parser.

    Augments the training data with punctuation variants, then runs a
    step-based training loop with periodic dev evaluation. Keeps the best
    model (by dev score) at model_file; switches the optimizer to AMSGrad
    once after max_steps_before_stop steps without improvement, and stops
    the second time that happens (or at max_steps).
    """
    model_file = model_file_name(args)
    utils.ensure_dir(os.path.split(model_file)[0])

    # load pretrained vectors if needed
    pretrain = load_pretrain(args)

    # load data
    logger.info("Loading data with batch size {}...".format(args['batch_size']))
    train_data, _ = CoNLL.conll2dict(input_file=args['train_file'])

    # possibly augment the training data with some amount of fake data
    # based on the options chosen
    logger.info("Original data size: {}".format(len(train_data)))
    train_data.extend(augment_punct(train_data, args['augment_nopunct'], keep_original_sentences=False))
    logger.info("Augmented data size: {}".format(len(train_data)))
    train_doc = Document(train_data)

    train_batch = DataLoader(train_doc, args['batch_size'], args, pretrain, evaluation=False)
    vocab = train_batch.vocab
    # dev loader reuses the training vocab; sort_during_eval lets batches be
    # length-sorted, with predictions unsorted back afterwards
    dev_doc = CoNLL.conll2doc(input_file=args['eval_file'])
    dev_batch = DataLoader(dev_doc, args['batch_size'], args, pretrain, vocab=vocab, evaluation=True, sort_during_eval=True)

    # pred and gold path
    system_pred_file = args['output_file']
    gold_file = args['gold_file']

    # skip training if the language does not have training or dev data
    if len(train_batch) == 0 or len(dev_batch) == 0:
        logger.info("Skip training because no data available...")
        sys.exit(0)

    logger.info("Training parser...")
    trainer = Trainer(args=args, vocab=vocab, pretrain=pretrain, use_cuda=args['cuda'])

    # bookkeeping for the step-based training loop
    global_step = 0
    max_steps = args['max_steps']
    dev_score_history = []
    best_dev_preds = []
    current_lr = args['lr']
    global_start_time = time.time()
    format_str = 'Finished STEP {}/{}, loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'

    using_amsgrad = False
    last_best_step = 0
    # start training
    train_loss = 0
    while True:
        do_break = False
        for i, batch in enumerate(train_batch):
            start_time = time.time()
            global_step += 1
            loss = trainer.update(batch, eval=False)  # update step
            train_loss += loss
            if global_step % args['log_step'] == 0:
                duration = time.time() - start_time
                logger.info(format_str.format(global_step, max_steps, loss, duration, current_lr))

            if global_step % args['eval_interval'] == 0:
                # eval on dev
                logger.info("Evaluating on dev set...")
                dev_preds = []
                for batch in dev_batch:
                    preds = trainer.predict(batch)
                    dev_preds += preds
                # restore original sentence order before writing predictions
                dev_preds = utils.unsort(dev_preds, dev_batch.data_orig_idx)
                dev_batch.doc.set([HEAD, DEPREL], [y for x in dev_preds for y in x])
                CoNLL.write_doc2conll(dev_batch.doc, system_pred_file)
                _, _, dev_score = scorer.score(system_pred_file, gold_file)

                train_loss = train_loss / args['eval_interval']  # avg loss per batch
                logger.info("step {}: train_loss = {:.6f}, dev_score = {:.4f}".format(global_step, train_loss, dev_score))
                train_loss = 0

                # save best model
                if len(dev_score_history) == 0 or dev_score > max(dev_score_history):
                    last_best_step = global_step
                    trainer.save(model_file)
                    logger.info("new best model saved.")
                    best_dev_preds = dev_preds

                dev_score_history += [dev_score]

            if global_step - last_best_step >= args['max_steps_before_stop']:
                if not using_amsgrad:
                    # first stall: restart optimization with AMSGrad instead of stopping
                    logger.info("Switching to AMSGrad")
                    last_best_step = global_step
                    using_amsgrad = True
                    trainer.optimizer = optim.Adam(trainer.model.parameters(), amsgrad=True, lr=args['lr'], betas=(.9, args['beta2']), eps=1e-6)
                else:
                    do_break = True
                    break

            if global_step >= args['max_steps']:
                do_break = True
                break

        if do_break:
            break

        train_batch.reshuffle()

    logger.info("Training ended with {} steps.".format(global_step))

    best_f, best_eval = max(dev_score_history) * 100, np.argmax(dev_score_history) + 1
    logger.info("Best dev F1 = {:.2f}, at iteration = {}".format(best_f, best_eval * args['eval_interval']))
def train(args):
    """Train a dependency parser (print-based variant).

    Runs a step-based training loop with periodic dev evaluation; keeps the
    best model (by dev score) at model_file, switches the optimizer to AMSGrad
    once after max_steps_before_stop steps without improvement, and stops the
    second time that happens (or at max_steps).
    """
    utils.ensure_dir(args['save_dir'])
    model_file = args['save_dir'] + '/' + args['save_name'] if args['save_name'] is not None \
        else '{}/{}_parser.pt'.format(args['save_dir'], args['shorthand'])

    # load pretrained vectors if needed
    pretrain = None
    if args['pretrain']:
        vec_file = args['wordvec_file'] if args['wordvec_file'] else utils.get_wordvec_file(args['wordvec_dir'], args['shorthand'])
        pretrain_file = '{}/{}.pretrain.pt'.format(args['save_dir'], args['shorthand'])
        pretrain = Pretrain(pretrain_file, vec_file, args['pretrain_max_vocab'])

    # load data
    print("Loading data with batch size {}...".format(args['batch_size']))
    train_doc = Document(CoNLL.conll2dict(input_file=args['train_file']))
    train_batch = DataLoader(train_doc, args['batch_size'], args, pretrain, evaluation=False)
    vocab = train_batch.vocab
    # dev loader reuses the training vocab; sort_during_eval lets batches be
    # length-sorted, with predictions unsorted back afterwards
    dev_doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    dev_batch = DataLoader(dev_doc, args['batch_size'], args, pretrain, vocab=vocab, evaluation=True, sort_during_eval=True)

    # pred and gold path
    system_pred_file = args['output_file']
    gold_file = args['gold_file']

    # skip training if the language does not have training or dev data
    if len(train_batch) == 0 or len(dev_batch) == 0:
        print("Skip training because no data available...")
        sys.exit(0)

    print("Training parser...")
    trainer = Trainer(args=args, vocab=vocab, pretrain=pretrain, use_cuda=args['cuda'])

    # bookkeeping for the step-based training loop
    global_step = 0
    max_steps = args['max_steps']
    dev_score_history = []
    best_dev_preds = []
    current_lr = args['lr']
    global_start_time = time.time()
    format_str = '{}: step {}/{}, loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'

    using_amsgrad = False
    last_best_step = 0
    # start training
    train_loss = 0
    while True:
        do_break = False
        for i, batch in enumerate(train_batch):
            start_time = time.time()
            global_step += 1
            loss = trainer.update(batch, eval=False)  # update step
            train_loss += loss
            if global_step % args['log_step'] == 0:
                duration = time.time() - start_time
                print(format_str.format(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), global_step,\
                        max_steps, loss, duration, current_lr))

            if global_step % args['eval_interval'] == 0:
                # eval on dev
                print("Evaluating on dev set...")
                dev_preds = []
                for batch in dev_batch:
                    preds = trainer.predict(batch)
                    dev_preds += preds
                # restore original sentence order before writing predictions
                dev_preds = utils.unsort(dev_preds, dev_batch.data_orig_idx)
                dev_batch.doc.set([HEAD, DEPREL], [y for x in dev_preds for y in x])
                CoNLL.dict2conll(dev_batch.doc.to_dict(), system_pred_file)
                _, _, dev_score = scorer.score(system_pred_file, gold_file)

                train_loss = train_loss / args['eval_interval']  # avg loss per batch
                print("step {}: train_loss = {:.6f}, dev_score = {:.4f}".format(global_step, train_loss, dev_score))
                train_loss = 0

                # save best model
                if len(dev_score_history) == 0 or dev_score > max(dev_score_history):
                    last_best_step = global_step
                    trainer.save(model_file)
                    print("new best model saved.")
                    best_dev_preds = dev_preds

                dev_score_history += [dev_score]
                print("")

            if global_step - last_best_step >= args['max_steps_before_stop']:
                if not using_amsgrad:
                    # first stall: restart optimization with AMSGrad instead of stopping
                    print("Switching to AMSGrad")
                    last_best_step = global_step
                    using_amsgrad = True
                    trainer.optimizer = optim.Adam(trainer.model.parameters(), amsgrad=True, lr=args['lr'], betas=(.9, args['beta2']), eps=1e-6)
                else:
                    do_break = True
                    break

            if global_step >= args['max_steps']:
                do_break = True
                break

        if do_break:
            break

        train_batch.reshuffle()

    print("Training ended with {} steps.".format(global_step))

    best_f, best_eval = max(dev_score_history)*100, np.argmax(dev_score_history)+1
    print("Best dev F1 = {:.2f}, at iteration = {}".format(best_f, best_eval * args['eval_interval']))
def train(args):
    """Train an NER tagger, optionally finetuning from an existing model.

    When finetune is set, loads trainer and vocab from finetune_load_name or
    from model_file; otherwise builds embeddings from scratch. Then runs a
    step-based training loop with periodic dev evaluation, keeping the best
    model (by entity F1) at model_file.
    """
    utils.ensure_dir(args['save_dir'])
    model_file = os.path.join(args['save_dir'], args['save_name']) if args['save_name'] is not None \
        else '{}/{}_nertagger.pt'.format(args['save_dir'], args['shorthand'])

    pretrain = None
    vocab = None
    trainer = None

    # Finetune path: reuse an existing model's trainer and vocab
    if args['finetune'] and args['finetune_load_name']:
        logger.warning('Finetune is ON. Using model from "{}"'.format(args['finetune_load_name']))
        _, trainer, vocab = load_model(args, args['finetune_load_name'])
    elif args['finetune'] and os.path.exists(model_file):
        logger.warning('Finetune is ON. Using model from "{}"'.format(model_file))
        _, trainer, vocab = load_model(args, model_file)
    else:
        if args['finetune']:
            raise FileNotFoundError('Finetune is set to true but model file is not found: {}'.format(model_file))

        # load pretrained vectors
        if args['wordvec_pretrain_file']:
            pretrain_file = args['wordvec_pretrain_file']
            pretrain = Pretrain(pretrain_file, None, args['pretrain_max_vocab'], save_to_file=False)
        else:
            if len(args['wordvec_file']) == 0:
                vec_file = utils.get_wordvec_file(args['wordvec_dir'], args['shorthand'])
            else:
                vec_file = args['wordvec_file']
            # do not save pretrained embeddings individually
            pretrain = Pretrain(None, vec_file, args['pretrain_max_vocab'], save_to_file=False)

        if pretrain is not None:
            # the embedding file's width wins over the configured word_emb_dim
            word_emb_dim = pretrain.emb.shape[1]
            if args['word_emb_dim'] and args['word_emb_dim'] != word_emb_dim:
                logger.warning("Embedding file has a dimension of {}. Model will be built with that size instead of {}".format(word_emb_dim, args['word_emb_dim']))
            args['word_emb_dim'] = word_emb_dim

    if args['charlm']:
        if args['charlm_shorthand'] is None:
            raise ValueError("CharLM Shorthand is required for loading pretrained CharLM model...")
        logger.info('Using pretrained contextualized char embedding')
        # default the charlm paths from the shorthand if not given explicitly
        if not args['charlm_forward_file']:
            args['charlm_forward_file'] = '{}/{}_forward_charlm.pt'.format(args['charlm_save_dir'], args['charlm_shorthand'])
        if not args['charlm_backward_file']:
            args['charlm_backward_file'] = '{}/{}_backward_charlm.pt'.format(args['charlm_save_dir'], args['charlm_shorthand'])

    # load data
    logger.info("Loading data with batch size {}...".format(args['batch_size']))
    train_doc = Document(json.load(open(args['train_file'])))
    train_batch = DataLoader(train_doc, args['batch_size'], args, pretrain, vocab=vocab, evaluation=False)
    vocab = train_batch.vocab
    dev_doc = Document(json.load(open(args['eval_file'])))
    dev_batch = DataLoader(dev_doc, args['batch_size'], args, pretrain, vocab=vocab, evaluation=True)
    dev_gold_tags = dev_batch.tags

    # warn about tags present in one dataset but missing from another
    if args['finetune']:
        utils.warn_missing_tags([i for i in trainer.vocab['tag']], train_batch.tags, "training set")
    utils.warn_missing_tags(train_batch.tags, dev_batch.tags, "dev set")

    # skip training if the language does not have training or dev data
    if len(train_batch) == 0 or len(dev_batch) == 0:
        logger.info("Skip training because no data available...")
        sys.exit(0)

    logger.info("Training tagger...")
    if trainer is None:  # init if model was not loaded previously from file
        trainer = Trainer(args=args, vocab=vocab, pretrain=pretrain, use_cuda=args['cuda'], train_classifier_only=args['train_classifier_only'])
    logger.info(trainer.model)

    # bookkeeping for the step-based training loop
    global_step = 0
    max_steps = args['max_steps']
    dev_score_history = []
    best_dev_preds = []
    current_lr = trainer.optimizer.param_groups[0]['lr']
    global_start_time = time.time()
    format_str = '{}: step {}/{}, loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'

    # LR scheduling: reduce LR when dev score (mode='max') plateaus
    if args['lr_decay'] > 0:
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(trainer.optimizer, mode='max', factor=args['lr_decay'], \
            patience=args['patience'], verbose=True, min_lr=args['min_lr'])
    else:
        scheduler = None

    # start training
    train_loss = 0
    while True:
        should_stop = False
        for i, batch in enumerate(train_batch):
            start_time = time.time()
            global_step += 1
            loss = trainer.update(batch, eval=False)  # update step
            train_loss += loss
            if global_step % args['log_step'] == 0:
                duration = time.time() - start_time
                logger.info(format_str.format(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), global_step,\
                        max_steps, loss, duration, current_lr))

            if global_step % args['eval_interval'] == 0:
                # eval on dev
                logger.info("Evaluating on dev set...")
                dev_preds = []
                for batch in dev_batch:
                    preds = trainer.predict(batch)
                    dev_preds += preds
                _, _, dev_score = scorer.score_by_entity(dev_preds, dev_gold_tags)

                train_loss = train_loss / args['eval_interval']  # avg loss per batch
                logger.info("step {}: train_loss = {:.6f}, dev_score = {:.4f}".format(global_step, train_loss, dev_score))
                train_loss = 0

                # save best model
                if len(dev_score_history) == 0 or dev_score > max(dev_score_history):
                    trainer.save(model_file)
                    logger.info("New best model saved.")
                    best_dev_preds = dev_preds

                dev_score_history += [dev_score]
                logger.info("")

                # lr schedule
                if scheduler is not None:
                    scheduler.step(dev_score)

            # check stopping: step budget exhausted or LR decayed to the floor
            current_lr = trainer.optimizer.param_groups[0]['lr']
            if global_step >= args['max_steps'] or current_lr <= args['min_lr']:
                should_stop = True
                break

        if should_stop:
            break

        train_batch.reshuffle()

    logger.info("Training ended with {} steps.".format(global_step))

    if len(dev_score_history) > 0:
        best_f, best_eval = max(dev_score_history)*100, np.argmax(dev_score_history)+1
        logger.info("Best dev F1 = {:.2f}, at iteration = {}".format(best_f, best_eval * args['eval_interval']))
def main():
    """Entry point for the CNN classifier: optionally train a model (or load
    an existing one), then score it on the test set.

    A model must come from --load_name or be trained via --train; asking for
    neither is an error.
    """
    args = parse_args()
    seed = utils.set_random_seed(args.seed, args.cuda)
    logger.info("Using random seed: %d" % seed)
    utils.ensure_dir(args.save_dir)

    # TODO: maybe the dataset needs to be in a torch data loader in order to
    # make cuda operations faster
    if args.train:
        train_set = read_dataset(args.train_file, args.wordvec_type, args.min_train_len)
        logger.info("Using training set: %s" % args.train_file)
        logger.info("Training set has %d labels" % len(dataset_labels(train_set)))
    elif not args.load_name:
        raise ValueError("No model provided and not asked to train a model. This makes no sense")
    else:
        train_set = None

    pretrain = load_pretrain(args)

    # either reload a saved classifier or build a fresh one from the train set
    if args.load_name:
        model = cnn_classifier.load(args.load_name, pretrain)
    else:
        assert train_set is not None
        labels = dataset_labels(train_set)
        extra_vocab = dataset_vocab(train_set)
        model = cnn_classifier.CNNClassifier(pretrain.emb, pretrain.vocab, extra_vocab, labels, args)

    if args.cuda:
        model.cuda()

    logger.info("Filter sizes: %s" % str(model.config.filter_sizes))
    logger.info("Filter channels: %s" % str(model.config.filter_channels))
    logger.info("Intermediate layers: %s" % str(model.config.fc_shapes))

    # derive a default save name from the model architecture when none is given
    save_name = args.save_name
    if not (save_name):
        save_name = args.base_name + "_" + args.shorthand + "_"
        save_name = save_name + "FS_%s_" % "_".join([str(x) for x in model.config.filter_sizes])
        save_name = save_name + "C_%d_" % model.config.filter_channels
        if model.config.fc_shapes:
            save_name = save_name + "FC_%s_" % "_".join([str(x) for x in model.config.fc_shapes])
        save_name = save_name + "classifier.pt"
    model_file = os.path.join(args.save_dir, save_name)

    if args.train:
        print_args(args)
        dev_set = read_dataset(args.dev_file, args.wordvec_type, min_len=None)
        logger.info("Using dev set: %s" % args.dev_file)
        check_labels(model.labels, dev_set)
        train_model(model, model_file, args, train_set, dev_set, model.labels)

    # always evaluate on the test set, whether the model was trained or loaded
    test_set = read_dataset(args.test_file, args.wordvec_type, min_len=None)
    logger.info("Using test set: %s" % args.test_file)
    check_labels(model.labels, test_set)

    if args.test_remap_labels is None:
        confusion = confusion_dataset(model, test_set)
        logger.info("Confusion matrix:\n{}".format(format_confusion(confusion, model.labels)))
        correct, total = confusion_to_accuracy(confusion)
        logger.info("Macro f1: {}".format(confusion_to_macro_f1(confusion)))
    else:
        # score against a remapped label set instead of the raw confusion matrix
        correct = score_dataset(model, test_set, remap_labels=args.test_remap_labels, forgive_unmapped_labels=args.forgive_unmapped_labels)
        total = len(test_set)
    logger.info("Test set: %d correct of %d examples. Accuracy: %f" % (correct, total, correct / total))