def get_factory(sh, fn):
    print('Resolving vocab option for {}...'.format(sh))
    train_file = 'data/pos/{}.train.in.conllu'.format(sh)
    if not os.path.exists(train_file):
        # without the training file, there's not much we can do: warn (rather
        # than raise, so that the WordVocab fallback below is actually reached)
        # and return the default vocab
        import warnings
        warnings.warn(
            'Training data for {} not found in the data directory, falling back to using WordVocab. To generate the '
            'XPOS vocabulary for this treebank properly, please run the following command first:\n'
            '\tstanza/utils/datasets/prepare_pos_treebank.py {}'.format(fn, fn))
        return 'WordVocab(data, shorthand, idx=2)'

    doc = CoNLL.conll2doc(input_file=train_file)
    data = doc.get([TEXT, UPOS, XPOS, FEATS], as_sentences=True)
    print(f'Original length = {len(data)}')
    data = filter_data(data, idx=2)
    print(f'Filtered length = {len(data)}')

    vocab = WordVocab(data, sh, idx=2, ignore=["_"])
    key = 'WordVocab(data, shorthand, idx=2, ignore=["_"])'
    best_size = len(vocab) - len(VOCAB_PREFIX)
    if best_size > 20:
        # try each candidate separator for composite XPOS tags and keep the
        # most compact vocabulary representation
        for sep in ['', '-', '+', '|', ',', ':']:
            vocab = XPOSVocab(data, sh, idx=2, sep=sep)
            length = sum(len(x) - len(VOCAB_PREFIX) for x in vocab._id2unit.values())
            if length < best_size:
                key = 'XPOSVocab(data, shorthand, idx=2, sep="{}")'.format(sep)
                best_size = length
    return key

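# Usage sketch (hypothetical shorthand/treebank names, not from the original
# script): the returned key is a fragment of Python source, intended to be
# interpolated into a generated vocab-factory module rather than executed here.
key = get_factory('en_ewt', 'UD_English-EWT')
print('    if shorthand == "en_ewt":')
print('        return {}'.format(key))
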
def check_mwt(filename):
    """ Checks whether or not there are MWTs in the given conll file """
    doc = CoNLL.conll2doc(filename)
    data = doc.get_mwt_expansions(False)
    return len(data) > 0

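# Usage sketch (hypothetical file path): a treebank whose training data
# contains no multi-word tokens can skip training an MWT expander entirely.
if check_mwt('data/mwt/en_ewt-ud-train.conllu'):
    print('Treebank contains MWTs; an MWT expander should be trained.')
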
def main(args=None):
    args = parse_args(args=args)
    random.seed(args.seed)

    args = vars(args)
    logger.info("[Launching identity lemmatizer...]")

    if args['mode'] == 'train':
        logger.info("[No training is required; will only generate evaluation output...]")

    document = CoNLL.conll2doc(input_file=args['eval_file'])
    batch = DataLoader(document, args['batch_size'], args, evaluation=True, conll_only=True)
    system_pred_file = args['output_file']
    gold_file = args['gold_file']

    # use identity mapping for prediction
    preds = batch.doc.get([TEXT])

    # write to file and score
    batch.doc.set([LEMMA], preds)
    CoNLL.write_doc2conll(batch.doc, system_pred_file)

    if gold_file is not None:
        _, _, score = scorer.score(system_pred_file, gold_file)
        logger.info("Lemma score:")
        logger.info("{} {:.2f}".format(args['lang'], score * 100))

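# Minimal sketch of the get/set idiom the identity lemmatizer relies on (the
# inline one-word sentence is a hypothetical example): doc.get([TEXT]) returns
# one value per word, and doc.set([LEMMA], values) writes a parallel list back,
# so copying TEXT into LEMMA is exactly the identity mapping.
demo_doc = CoNLL.conll2doc(input_str="1\tcats\t_\tNOUN\t_\t_\t0\troot\t_\t_\n")
demo_doc.set([LEMMA], demo_doc.get([TEXT]))
assert demo_doc.sentences[0].words[0].lemma == "cats"
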
def evaluate(args):
    # file paths
    system_pred_file = args['output_file']
    gold_file = args['gold_file']
    save_name = args['save_name'] if args['save_name'] else '{}_mwt_expander.pt'.format(args['shorthand'])
    model_file = os.path.join(args['save_dir'], save_name)

    # load model
    use_cuda = args['cuda'] and not args['cpu']
    trainer = Trainer(model_file=model_file, use_cuda=use_cuda)
    loaded_args, vocab = trainer.args, trainer.vocab
    for k in args:
        if k.endswith('_dir') or k.endswith('_file') or k in ['shorthand']:
            loaded_args[k] = args[k]
    logger.debug('max_dec_len: %d' % loaded_args['max_dec_len'])

    # load data
    logger.debug("Loading data with batch size {}...".format(args['batch_size']))
    doc = CoNLL.conll2doc(input_file=args['eval_file'])
    batch = DataLoader(doc, args['batch_size'], loaded_args, vocab=vocab, evaluation=True)

    if len(batch) > 0:
        dict_preds = trainer.predict_dict(batch.doc.get_mwt_expansions(evaluation=True))
        # decide trainer type and run eval
        if loaded_args['dict_only']:
            preds = dict_preds
        else:
            logger.info("Running the seq2seq model...")
            preds = []
            for i, b in enumerate(batch):
                preds += trainer.predict(b)
            if loaded_args.get('ensemble_dict', False):
                preds = trainer.ensemble(batch.doc.get_mwt_expansions(evaluation=True), preds)
    else:
        # skip eval if dev data does not exist
        preds = []

    # write to file and score
    doc = copy.deepcopy(batch.doc)
    doc.set_mwt_expansions(preds)
    CoNLL.write_doc2conll(doc, system_pred_file)
    if gold_file is not None:
        _, _, score = scorer.score(system_pred_file, gold_file)
        logger.info("MWT expansion score: {} {:.2f}".format(args['shorthand'], score * 100))

def evaluate(args):
    # file paths
    system_pred_file = args['output_file']
    gold_file = args['gold_file']
    model_file = model_file_name(args)

    # load pretrained vectors if needed
    pretrain = load_pretrain(args)

    # load model
    logger.info("Loading model from: {}".format(model_file))
    use_cuda = args['cuda'] and not args['cpu']
    trainer = Trainer(pretrain=pretrain, model_file=model_file, use_cuda=use_cuda)
    loaded_args, vocab = trainer.args, trainer.vocab

    # load config
    for k in args:
        if k.endswith('_dir') or k.endswith('_file') or k in ['shorthand'] or k == 'mode':
            loaded_args[k] = args[k]

    # load data
    logger.info("Loading data with batch size {}...".format(args['batch_size']))
    doc = CoNLL.conll2doc(input_file=args['eval_file'])
    batch = DataLoader(doc, args['batch_size'], loaded_args, pretrain, vocab=vocab,
                       evaluation=True, sort_during_eval=True)

    if len(batch) > 0:
        logger.info("Start evaluation...")
        preds = []
        for i, b in enumerate(batch):
            preds += trainer.predict(b)
    else:
        # skip eval if dev data does not exist
        preds = []
    preds = utils.unsort(preds, batch.data_orig_idx)

    # write to file and score
    batch.doc.set([HEAD, DEPREL], [y for x in preds for y in x])
    CoNLL.write_doc2conll(batch.doc, system_pred_file)

    if gold_file is not None:
        _, _, score = scorer.score(system_pred_file, gold_file)
        logger.info("Parser score:")
        logger.info("{} {:.2f}".format(args['shorthand'], score * 100))

def evaluate(args):
    # file paths
    system_pred_file = args['output_file']
    gold_file = args['gold_file']
    model_file = os.path.join(args['save_dir'], '{}_lemmatizer.pt'.format(args['lang']))

    # load model
    use_cuda = args['cuda'] and not args['cpu']
    trainer = Trainer(model_file=model_file, use_cuda=use_cuda)
    loaded_args, vocab = trainer.args, trainer.vocab
    for k in args:
        if k.endswith('_dir') or k.endswith('_file') or k in ['shorthand']:
            loaded_args[k] = args[k]

    # load data
    logger.info("Loading data with batch size {}...".format(args['batch_size']))
    doc = CoNLL.conll2doc(input_file=args['eval_file'])
    batch = DataLoader(doc, args['batch_size'], loaded_args, vocab=vocab, evaluation=True)

    # skip eval if dev data does not exist
    if len(batch) == 0:
        logger.warning("Skip evaluation because no dev data is available...\nLemma score:\n{} ".format(args['lang']))
        return

    dict_preds = trainer.predict_dict(batch.doc.get([TEXT, UPOS]))

    if loaded_args.get('dict_only', False):
        preds = dict_preds
    else:
        logger.info("Running the seq2seq model...")
        preds = []
        edits = []
        for i, b in enumerate(batch):
            ps, es = trainer.predict(b, args['beam_size'])
            preds += ps
            if es is not None:
                edits += es
        preds = trainer.postprocess(batch.doc.get([TEXT]), preds, edits=edits)

        if loaded_args.get('ensemble_dict', False):
            logger.info("[Ensembling dict with seq2seq lemmatizer...]")
            preds = trainer.ensemble(batch.doc.get([TEXT, UPOS]), preds)

    # write to file and score
    batch.doc.set([LEMMA], preds)
    CoNLL.write_doc2conll(batch.doc, system_pred_file)

    if gold_file is not None:
        _, _, score = scorer.score(system_pred_file, gold_file)
        logger.info("Finished evaluation\nLemma score:\n{} {:.2f}".format(args['lang'], score * 100))

def test_depparse_with_pretagged_doc():
    nlp = stanza.Pipeline(**{
        'processors': 'depparse',
        'dir': TEST_MODELS_DIR,
        'lang': 'en',
        'depparse_pretagged': True
    })

    doc = CoNLL.conll2doc(input_str=EN_DOC_CONLLU_PRETAGGED)
    processed_doc = nlp(doc)

    assert EN_DOC_DEPENDENCY_PARSES_GOLD == '\n\n'.join(
        [sent.dependencies_string() for sent in processed_doc.sentences])

def test_unusual_misc():
    """
    The above RUSSIAN_SAMPLE resulted in a blank misc field in one particular
    implementation of the conll code (the below test would fail)
    """
    doc = CoNLL.conll2doc(input_str=RUSSIAN_SAMPLE)
    sentences = CoNLL.doc2conll(doc)
    assert len(sentences) == 1
    assert len(sentences[0]) == 14

    for word in sentences[0]:
        pieces = word.split("\t")
        assert len(pieces) == 1 or len(pieces) == 10
        if len(pieces) == 10:
            assert all(piece for piece in pieces)

def test_doc_with_comments():
    """
    Test that a doc with comments gets converted back with comments
    """
    lines = RUSSIAN_SAMPLE.split("\n")

    doc = CoNLL.conll2doc(input_str=RUSSIAN_SAMPLE)
    assert len(doc.sentences) == 1
    assert len(doc.sentences[0].comments) == 3
    assert lines[0] == doc.sentences[0].comments[0]
    assert lines[1] == doc.sentences[0].comments[1]
    assert lines[2] == doc.sentences[0].comments[2]

    sentences = CoNLL.doc2conll(doc)
    assert len(sentences) == 1

    sentence = sentences[0]
    assert len(sentence) == 14
    assert lines[0] == sentence[0]
    assert lines[1] == sentence[1]
    assert lines[2] == sentence[2]

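# Round-trip sketch behind the two tests above (the inline two-word sentence is
# a hypothetical sample, not the RUSSIAN_SAMPLE the tests use): comments are
# preserved and come back first, followed by one 10-column line per word.
demo_lines = ["# text = Hi there",
              "1\tHi\thi\tINTJ\tUH\t_\t0\troot\t_\t_",
              "2\tthere\tthere\tADV\tRB\t_\t1\tadvmod\t_\t_"]
demo_doc = CoNLL.conll2doc(input_str="\n".join(demo_lines) + "\n")
demo_sentences = CoNLL.doc2conll(demo_doc)
assert demo_sentences[0][0] == "# text = Hi there"
assert len(demo_sentences[0]) == 3
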
def train(args):
    # load data
    logger.debug('max_dec_len: %d' % args['max_dec_len'])
    logger.debug("Loading data with batch size {}...".format(args['batch_size']))
    train_doc = CoNLL.conll2doc(input_file=args['train_file'])
    train_batch = DataLoader(train_doc, args['batch_size'], args, evaluation=False)
    vocab = train_batch.vocab
    args['vocab_size'] = vocab.size
    dev_doc = CoNLL.conll2doc(input_file=args['eval_file'])
    dev_batch = DataLoader(dev_doc, args['batch_size'], args, vocab=vocab, evaluation=True)

    utils.ensure_dir(args['save_dir'])
    save_name = args['save_name'] if args['save_name'] else '{}_mwt_expander.pt'.format(args['shorthand'])
    model_file = os.path.join(args['save_dir'], save_name)

    # pred and gold path
    system_pred_file = args['output_file']
    gold_file = args['gold_file']

    # skip training if the language does not have training or dev data
    if len(train_batch) == 0 or len(dev_batch) == 0:
        logger.warning("Skip training because no data available...")
        return

    # train a dictionary-based MWT expander
    trainer = Trainer(args=args, vocab=vocab, use_cuda=args['cuda'])
    logger.info("Training dictionary-based MWT expander...")
    trainer.train_dict(train_batch.doc.get_mwt_expansions(evaluation=False))
    logger.info("Evaluating on dev set...")
    dev_preds = trainer.predict_dict(dev_batch.doc.get_mwt_expansions(evaluation=True))
    doc = copy.deepcopy(dev_batch.doc)
    doc.set_mwt_expansions(dev_preds)
    CoNLL.write_doc2conll(doc, system_pred_file)
    _, _, dev_f = scorer.score(system_pred_file, gold_file)
    logger.info("Dev F1 = {:.2f}".format(dev_f * 100))

    if args.get('dict_only', False):
        # save dictionaries
        trainer.save(model_file)
    else:
        # train a seq2seq model
        logger.info("Training seq2seq-based MWT expander...")
        global_step = 0
        max_steps = len(train_batch) * args['num_epoch']
        dev_score_history = []
        best_dev_preds = []
        current_lr = args['lr']
        global_start_time = time.time()
        format_str = '{}: step {}/{} (epoch {}/{}), loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'

        # start training
        for epoch in range(1, args['num_epoch'] + 1):
            train_loss = 0
            for i, batch in enumerate(train_batch):
                start_time = time.time()
                global_step += 1
                loss = trainer.update(batch, eval=False)  # update step
                train_loss += loss
                if global_step % args['log_step'] == 0:
                    duration = time.time() - start_time
                    logger.info(format_str.format(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), global_step,
                                                  max_steps, epoch, args['num_epoch'], loss, duration, current_lr))

            # eval on dev
            logger.info("Evaluating on dev set...")
            dev_preds = []
            for i, batch in enumerate(dev_batch):
                preds = trainer.predict(batch)
                dev_preds += preds
            if args.get('ensemble_dict', False) and args.get('ensemble_early_stop', False):
                logger.info("[Ensembling dict with seq2seq model...]")
                dev_preds = trainer.ensemble(dev_batch.doc.get_mwt_expansions(evaluation=True), dev_preds)
            doc = copy.deepcopy(dev_batch.doc)
            doc.set_mwt_expansions(dev_preds)
            CoNLL.write_doc2conll(doc, system_pred_file)
            _, _, dev_score = scorer.score(system_pred_file, gold_file)

            train_loss = train_loss / train_batch.num_examples * args['batch_size']  # avg loss per batch
            logger.info("epoch {}: train_loss = {:.6f}, dev_score = {:.4f}".format(epoch, train_loss, dev_score))

            # save best model
            if epoch == 1 or dev_score > max(dev_score_history):
                trainer.save(model_file)
                logger.info("new best model saved.")
                best_dev_preds = dev_preds

            # lr schedule
            if epoch > args['decay_epoch'] and dev_score <= dev_score_history[-1]:
                current_lr *= args['lr_decay']
                trainer.change_lr(current_lr)

            dev_score_history += [dev_score]

        logger.info("Training ended with {} epochs.".format(epoch))

        best_f, best_epoch = max(dev_score_history) * 100, np.argmax(dev_score_history) + 1
        logger.info("Best dev F1 = {:.2f}, at epoch = {}".format(best_f, best_epoch))

        # try ensembling with dict if necessary
        if args.get('ensemble_dict', False):
            logger.info("[Ensembling dict with seq2seq model...]")
            dev_preds = trainer.ensemble(dev_batch.doc.get_mwt_expansions(evaluation=True), best_dev_preds)
            doc = copy.deepcopy(dev_batch.doc)
            doc.set_mwt_expansions(dev_preds)
            CoNLL.write_doc2conll(doc, system_pred_file)
            _, _, dev_score = scorer.score(system_pred_file, gold_file)
            logger.info("Ensemble dev F1 = {:.2f}".format(dev_score * 100))
            best_f = max(best_f, dev_score * 100)  # keep both scores on the 0-100 scale

def train(args):
    model_file = model_file_name(args)
    utils.ensure_dir(os.path.split(model_file)[0])

    # load pretrained vectors if needed
    pretrain = load_pretrain(args)

    # load data
    logger.info("Loading data with batch size {}...".format(args['batch_size']))
    train_data, _ = CoNLL.conll2dict(input_file=args['train_file'])

    # possibly augment the training data with some amount of fake data
    # based on the options chosen
    logger.info("Original data size: {}".format(len(train_data)))
    train_data.extend(augment_punct(train_data, args['augment_nopunct'],
                                    keep_original_sentences=False))
    logger.info("Augmented data size: {}".format(len(train_data)))
    train_doc = Document(train_data)

    train_batch = DataLoader(train_doc, args['batch_size'], args, pretrain, evaluation=False)
    vocab = train_batch.vocab
    dev_doc = CoNLL.conll2doc(input_file=args['eval_file'])
    dev_batch = DataLoader(dev_doc, args['batch_size'], args, pretrain, vocab=vocab,
                           evaluation=True, sort_during_eval=True)

    # pred and gold path
    system_pred_file = args['output_file']
    gold_file = args['gold_file']

    # skip training if the language does not have training or dev data
    if len(train_batch) == 0 or len(dev_batch) == 0:
        logger.info("Skip training because no data available...")
        sys.exit(0)

    logger.info("Training parser...")
    trainer = Trainer(args=args, vocab=vocab, pretrain=pretrain, use_cuda=args['cuda'])

    global_step = 0
    max_steps = args['max_steps']
    dev_score_history = []
    best_dev_preds = []
    current_lr = args['lr']
    global_start_time = time.time()
    format_str = 'Finished STEP {}/{}, loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'

    using_amsgrad = False
    last_best_step = 0
    # start training
    train_loss = 0
    while True:
        do_break = False
        for i, batch in enumerate(train_batch):
            start_time = time.time()
            global_step += 1
            loss = trainer.update(batch, eval=False)  # update step
            train_loss += loss
            if global_step % args['log_step'] == 0:
                duration = time.time() - start_time
                logger.info(format_str.format(global_step, max_steps, loss, duration, current_lr))

            if global_step % args['eval_interval'] == 0:
                # eval on dev
                logger.info("Evaluating on dev set...")
                dev_preds = []
                for batch in dev_batch:
                    preds = trainer.predict(batch)
                    dev_preds += preds
                dev_preds = utils.unsort(dev_preds, dev_batch.data_orig_idx)

                dev_batch.doc.set([HEAD, DEPREL], [y for x in dev_preds for y in x])
                CoNLL.write_doc2conll(dev_batch.doc, system_pred_file)
                _, _, dev_score = scorer.score(system_pred_file, gold_file)

                train_loss = train_loss / args['eval_interval']  # avg loss per batch
                logger.info("step {}: train_loss = {:.6f}, dev_score = {:.4f}".format(global_step, train_loss, dev_score))
                train_loss = 0

                # save best model
                if len(dev_score_history) == 0 or dev_score > max(dev_score_history):
                    last_best_step = global_step
                    trainer.save(model_file)
                    logger.info("new best model saved.")
                    best_dev_preds = dev_preds

                dev_score_history += [dev_score]

            if global_step - last_best_step >= args['max_steps_before_stop']:
                if not using_amsgrad:
                    logger.info("Switching to AMSGrad")
                    last_best_step = global_step
                    using_amsgrad = True
                    trainer.optimizer = optim.Adam(trainer.model.parameters(), amsgrad=True,
                                                   lr=args['lr'], betas=(.9, args['beta2']), eps=1e-6)
                else:
                    do_break = True
                    break

            if global_step >= args['max_steps']:
                do_break = True
                break

        if do_break:
            break

        train_batch.reshuffle()

    logger.info("Training ended with {} steps.".format(global_step))

    best_f, best_eval = max(dev_score_history) * 100, np.argmax(dev_score_history) + 1
    logger.info("Best dev F1 = {:.2f}, at iteration = {}".format(best_f, best_eval * args['eval_interval']))

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--pretrain', type=str,
                        default="/home/john/extern_data/wordvec/glove/armenian.pt",
                        help='Which pretrain to use')
    parser.set_defaults(treebanks=[
        "/home/john/extern_data/ud2/ud-treebanks-v2.7/UD_Western_Armenian-ArmTDP/hyw_armtdp-ud-train.conllu",
        "/home/john/extern_data/ud2/ud-treebanks-v2.7/UD_Armenian-ArmTDP/hy_armtdp-ud-train.conllu"
    ])
    args = parser.parse_args()
    return args

args = parse_args()

pt = pretrain.Pretrain(args.pretrain)
pt.load()
print("Pretrain stats: {} vectors, {} dim".format(len(pt.vocab), pt.emb[0].shape[0]))

# for each treebank, report the fraction of word tokens covered by the pretrain vocab
for treebank in args.treebanks:
    print(treebank)
    found = 0
    total = 0
    doc = CoNLL.conll2doc(treebank)
    for sentence in doc.sentences:
        for word in sentence.words:
            total = total + 1
            if word.text in pt.vocab:
                found = found + 1
    print(found / total)