Пример #1
0
def train(args, model_save_file, model_load_file, model_save_latest_file,
          retag_pipeline):
    """
    Build a model and train it on the requested train & dev treebanks.

    The trees are read from args['train_file'] and args['eval_file'].
    When retag_pipeline is not None, both sets of trees are passed through
    retag_trees before anything else is built.  The results of
    load_pretrain and load_charlm are handed to build_trainer, and the
    resulting trainer is run by iterate_training, which receives
    model_save_file and model_save_latest_file.

    model_load_file is part of the signature but is not read here.
    """
    print_args(args)
    utils.ensure_dir(args['save_dir'])

    train_trees = read_treebank(args['train_file'])
    logger.info("Read %d trees for the training set", len(train_trees))
    dev_trees = read_treebank(args['eval_file'])
    logger.info("Read %d trees for the dev set", len(dev_trees))

    if retag_pipeline is not None:
        logger.info("Retagging trees using the %s tags from the %s package...",
                    args['retag_method'], args['retag_package'])
        # train set first, then dev set, each with the same pipeline
        train_trees, dev_trees = [
            retag_trees(trees, retag_pipeline, args['retag_xpos'])
            for trees in (train_trees, dev_trees)
        ]
        logger.info("Retagging finished")

    pretrain = load_pretrain(args)
    charlm_fwd = load_charlm(args['charlm_forward_file'])
    charlm_bwd = load_charlm(args['charlm_backward_file'])

    built = build_trainer(args, train_trees, dev_trees, pretrain, charlm_fwd,
                          charlm_bwd)
    trainer, train_sequences, train_transitions = built

    iterate_training(trainer, train_trees, train_sequences, train_transitions,
                     dev_trees, args, model_save_file, model_save_latest_file)
Пример #2
0
def run_dev_set(model, dev_trees, args):
    """
    This reparses a treebank and executes the CoreNLP Java EvalB code.

    It only works if CoreNLP 4.3.0 or higher is in the classpath.

    The model is switched to eval mode and dev_trees are reparsed with
    parse_sentences in batches of args['eval_batch_size'].  When
    args['mode'] is 'predict' and args['predict_file'] is set, two files
    are written under args['predict_dir']: <predict_file>.pred.mrg and
    <predict_file>.orig.mrg.  Neither file is written if either one
    already exists.

    Returns the f1 field of the EvaluateParser response.
    """
    logger.info("Processing %d trees from %s", len(dev_trees),
                args['eval_file'])
    model.eval()

    tree_iterator = iter(tqdm(dev_trees))
    treebank = parse_sentences(tree_iterator, build_batch_from_trees,
                               args['eval_batch_size'], model)

    if len(treebank) < len(dev_trees):
        logger.warning("Only evaluating %d trees instead of %d", len(treebank),
                       len(dev_trees))

    if args['mode'] == 'predict' and args['predict_file']:
        utils.ensure_dir(args['predict_dir'], verbose=False)
        pred_file = os.path.join(args['predict_dir'],
                                 args['predict_file'] + ".pred.mrg")
        orig_file = os.path.join(args['predict_dir'],
                                 args['predict_file'] + ".orig.mrg")
        if os.path.exists(pred_file):
            logger.warning("Cowardly refusing to overwrite %s", pred_file)
        elif os.path.exists(orig_file):
            logger.warning("Cowardly refusing to overwrite %s", orig_file)
        else:
            # Trees can contain arbitrary Unicode words, so the encoding is
            # pinned to UTF-8 instead of relying on the platform default.
            with open(pred_file, 'w', encoding='utf-8') as fout:
                for tree in treebank:
                    # presumably tree[1][0][0] is the top prediction for
                    # this sentence -- confirm against parse_sentences
                    fout.write(str(tree[1][0][0]))
                    fout.write("\n")

            with open(orig_file, 'w', encoding='utf-8') as fout:
                for tree in treebank:
                    # presumably tree[0] is the original (gold) tree
                    fout.write(str(tree[0]))
                    fout.write("\n")

    with EvaluateParser() as evaluator:
        response = evaluator.process(treebank)
        return response.f1
Пример #3
0
def main(args=None):
    """
    Parse the command line options, then train or evaluate the tokenizer.

    args is forwarded to parse_args.  After parsing, the options are turned
    into a dict which gains 'feat_funcs' and 'feat_dim', and whose
    'save_name' is replaced by a path under 'save_dir'.  train(args) runs
    when the mode is 'train'; evaluate(args) runs for any other mode.
    """
    parsed = parse_args(args=args)

    # the cpu flag overrides cuda before the random seed is set
    if parsed.cpu:
        parsed.cuda = False
    utils.set_random_seed(parsed.seed, parsed.cuda)

    args = vars(parsed)
    logger.info("Running tokenizer in {} mode".format(args['mode']))

    feature_names = ['space_before', 'capitalized', 'all_caps', 'numeric']
    args['feat_funcs'] = feature_names
    args['feat_dim'] = len(feature_names)

    # an explicit save name wins; otherwise derive one from the shorthand
    if args['save_name'] is not None:
        save_path = "{}/{}".format(args['save_dir'], args['save_name'])
    else:
        save_path = '{}/{}_tokenizer.pt'.format(args['save_dir'],
                                                args['shorthand'])
    args['save_name'] = save_path
    utils.ensure_dir(args['save_dir'])

    runner = train if args['mode'] == 'train' else evaluate
    runner(args)
Пример #4
0
def main():
    """
    Seed the random number generators, then train or evaluate a
    character-level language model according to args['mode'].

    torch, numpy and random are all seeded with args.seed.  The cpu flag
    forces cuda off; the CUDA generator is only seeded when cuda is still
    enabled afterwards.
    """
    parsed = parse_args()
    seed = parsed.seed

    for seed_fn in (torch.manual_seed, np.random.seed, random.seed):
        seed_fn(seed)

    if parsed.cpu:
        parsed.cuda = False
    if parsed.cuda:
        # only reached when the cpu flag did not turn cuda off above
        torch.cuda.manual_seed(seed)

    args = vars(parsed)
    logger.info("Running {} character-level language model in {} mode".format(
        args['direction'], args['mode']))

    utils.ensure_dir(args['save_dir'])

    runner = train if args['mode'] == 'train' else evaluate
    runner(args)
Пример #5
0
def main():
    """
    Seed the random number generators, fill in the tokenizer options and
    dispatch to train(args) or evaluate(args) based on args['mode'].

    The options dict gains 'feat_funcs' and 'feat_dim', and 'save_name' is
    rewritten as a path inside 'save_dir' (falling back to
    '<shorthand>_tokenizer.pt' when no save name was given).
    """
    options = parse_args()
    seed = options.seed

    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    if options.cpu:
        options.cuda = False
    if options.cuda:
        # skipped whenever the cpu flag disabled cuda just above
        torch.cuda.manual_seed(seed)

    args = vars(options)
    print("Running tokenizer in {} mode".format(args['mode']))

    feature_names = ['space_before', 'capitalized', 'all_caps', 'numeric']
    args['feat_funcs'] = feature_names
    args['feat_dim'] = len(feature_names)

    if args['save_name'] is None:
        save_path = '{}/{}_tokenizer.pt'.format(args['save_dir'],
                                                args['shorthand'])
    else:
        save_path = "{}/{}".format(args['save_dir'], args['save_name'])
    args['save_name'] = save_path
    utils.ensure_dir(args['save_dir'])

    if args['mode'] != 'train':
        evaluate(args)
    else:
        train(args)
Пример #6
0
def main(args=None):
    """
    Parse the options and run the tokenizer in train or evaluate mode.

    args is forwarded to parse_args.  The resulting dict gets the list of
    feature functions, its length as 'feat_dim', and a 'save_name' joined
    onto 'save_dir' (a falsy save name falls back to
    '<shorthand>_tokenizer.pt').
    """
    namespace = parse_args(args=args)

    if namespace.cpu:
        namespace.cuda = False
    utils.set_random_seed(namespace.seed, namespace.cuda)

    args = vars(namespace)
    logger.info("Running tokenizer in {} mode".format(args['mode']))

    feature_names = [
        'space_before',
        'capitalized',
        'numeric',
        'end_of_para',
        'start_of_para',
    ]
    args['feat_funcs'] = feature_names
    args['feat_dim'] = len(feature_names)

    # empty string and None both select the default file name
    file_name = args['save_name'] or '{}_tokenizer.pt'.format(
        args['shorthand'])
    args['save_name'] = os.path.join(args['save_dir'], file_name)
    utils.ensure_dir(args['save_dir'])

    runner = train if args['mode'] == 'train' else evaluate
    runner(args)
Пример #7
0
def train(args):
    """
    Train an MWT expander and save it under args['save_dir'].

    Steps:
      1. Load args['train_file'] and args['eval_file'] as CoNLL documents
         and wrap them in DataLoaders (the dev loader reuses the training
         vocab).  args['vocab_size'] is set from the training vocab.
      2. Train the dictionary-based expander, write its dev predictions to
         args['output_file'] and score them against args['gold_file'].
      3. If args['dict_only'] is set, save the dictionaries and stop.
         Otherwise train the seq2seq model for args['num_epoch'] epochs,
         saving the model whenever the dev score improves and decaying the
         learning rate by args['lr_decay'] after args['decay_epoch'] when
         the dev score does not improve on the previous epoch.
      4. If args['ensemble_dict'] is set, score the ensemble of the
         dictionary with the best seq2seq predictions.

    Returns None.  Returns early, without training, when either data
    loader is empty.
    """
    # load data
    logger.debug('max_dec_len: %d' % args['max_dec_len'])
    logger.debug("Loading data with batch size {}...".format(
        args['batch_size']))
    train_doc = CoNLL.conll2doc(input_file=args['train_file'])
    train_batch = DataLoader(train_doc,
                             args['batch_size'],
                             args,
                             evaluation=False)
    vocab = train_batch.vocab
    args['vocab_size'] = vocab.size
    dev_doc = CoNLL.conll2doc(input_file=args['eval_file'])
    dev_batch = DataLoader(dev_doc,
                           args['batch_size'],
                           args,
                           vocab=vocab,
                           evaluation=True)

    utils.ensure_dir(args['save_dir'])
    save_name = args['save_name'] if args[
        'save_name'] else '{}_mwt_expander.pt'.format(args['shorthand'])
    model_file = os.path.join(args['save_dir'], save_name)

    # pred and gold path
    system_pred_file = args['output_file']
    gold_file = args['gold_file']

    # skip training if the language does not have training or dev data
    if len(train_batch) == 0 or len(dev_batch) == 0:
        logger.warning("Skip training because no data available...")
        return

    # train a dictionary-based MWT expander
    trainer = Trainer(args=args, vocab=vocab, use_cuda=args['cuda'])
    logger.info("Training dictionary-based MWT expander...")
    trainer.train_dict(train_batch.doc.get_mwt_expansions(evaluation=False))
    logger.info("Evaluating on dev set...")
    dev_preds = trainer.predict_dict(
        dev_batch.doc.get_mwt_expansions(evaluation=True))
    # work on a copy so the dev document itself is never modified
    doc = copy.deepcopy(dev_batch.doc)
    doc.set_mwt_expansions(dev_preds)
    CoNLL.write_doc2conll(doc, system_pred_file)
    _, _, dev_f = scorer.score(system_pred_file, gold_file)
    logger.info("Dev F1 = {:.2f}".format(dev_f * 100))

    if args.get('dict_only', False):
        # save dictionaries
        trainer.save(model_file)
    else:
        # train a seq2seq model
        logger.info("Training seq2seq-based MWT expander...")
        global_step = 0
        max_steps = len(train_batch) * args['num_epoch']
        dev_score_history = []
        best_dev_preds = []
        current_lr = args['lr']
        format_str = '{}: step {}/{} (epoch {}/{}), loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'

        # start training
        for epoch in range(1, args['num_epoch'] + 1):
            train_loss = 0
            for batch in train_batch:
                start_time = time.time()
                global_step += 1
                loss = trainer.update(batch, eval=False)  # update step
                train_loss += loss
                if global_step % args['log_step'] == 0:
                    duration = time.time() - start_time
                    logger.info(format_str.format(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), global_step,\
                                                  max_steps, epoch, args['num_epoch'], loss, duration, current_lr))

            # eval on dev
            logger.info("Evaluating on dev set...")
            dev_preds = []
            for batch in dev_batch:
                preds = trainer.predict(batch)
                dev_preds += preds
            if args.get('ensemble_dict', False) and args.get(
                    'ensemble_early_stop', False):
                logger.info("[Ensembling dict with seq2seq model...]")
                dev_preds = trainer.ensemble(
                    dev_batch.doc.get_mwt_expansions(evaluation=True),
                    dev_preds)
            doc = copy.deepcopy(dev_batch.doc)
            doc.set_mwt_expansions(dev_preds)
            CoNLL.write_doc2conll(doc, system_pred_file)
            _, _, dev_score = scorer.score(system_pred_file, gold_file)

            train_loss = train_loss / train_batch.num_examples * args[
                'batch_size']  # avg loss per batch
            logger.info(
                "epoch {}: train_loss = {:.6f}, dev_score = {:.4f}".format(
                    epoch, train_loss, dev_score))

            # save best model
            if epoch == 1 or dev_score > max(dev_score_history):
                trainer.save(model_file)
                logger.info("new best model saved.")
                best_dev_preds = dev_preds

            # lr schedule
            # The history is empty during the first epoch, so there is no
            # previous score to compare against; without this check a
            # decay_epoch below 1 would raise IndexError on [-1].
            if (epoch > args['decay_epoch'] and dev_score_history
                    and dev_score <= dev_score_history[-1]):
                current_lr *= args['lr_decay']
                trainer.change_lr(current_lr)

            dev_score_history += [dev_score]

        logger.info("Training ended with {} epochs.".format(epoch))

        best_f, best_epoch = max(dev_score_history) * 100, np.argmax(
            dev_score_history) + 1
        logger.info("Best dev F1 = {:.2f}, at epoch = {}".format(
            best_f, best_epoch))

        # try ensembling with dict if necessary
        if args.get('ensemble_dict', False):
            logger.info("[Ensembling dict with seq2seq model...]")
            dev_preds = trainer.ensemble(
                dev_batch.doc.get_mwt_expansions(evaluation=True),
                best_dev_preds)
            doc = copy.deepcopy(dev_batch.doc)
            doc.set_mwt_expansions(dev_preds)
            CoNLL.write_doc2conll(doc, system_pred_file)
            _, _, dev_score = scorer.score(system_pred_file, gold_file)
            logger.info("Ensemble dev F1 = {:.2f}".format(dev_score * 100))
            # best_f is a percentage, so scale the raw score before comparing
            best_f = max(best_f, dev_score * 100)
Пример #8
0
def train(args):
    """
    Train a tagger and save the best model (by dev score) to disk.

    The model is written to args['save_dir']/args['save_name'], or to
    '<save_dir>/<shorthand>_nertagger.pt' when no save name is given.
    Training and dev data are read as JSON from args['train_file'] and
    args['eval_file'].  Every args['eval_interval'] steps the dev set is
    scored with scorer.score_by_entity; the model is saved whenever that
    score beats every earlier one.  Training stops once args['max_steps']
    is reached or the learning rate has dropped to args['min_lr'].

    Calls sys.exit(0) when args['charlm'] is set without a
    'charlm_shorthand', and when either data loader is empty.
    """
    utils.ensure_dir(args['save_dir'])
    model_file = args['save_dir'] + '/' + args['save_name'] if args['save_name'] is not None \
            else '{}/{}_nertagger.pt'.format(args['save_dir'], args['shorthand'])

    # load pretrained vectors
    # a missing (None) or empty wordvec_file falls back to the default file
    if not args['wordvec_file']:
        vec_file = utils.get_wordvec_file(args['wordvec_dir'],
                                          args['shorthand'])
    else:
        vec_file = args['wordvec_file']
    # do not save pretrained embeddings individually
    pretrain = Pretrain(None,
                        vec_file,
                        args['pretrain_max_vocab'],
                        save_to_file=False)

    if args['charlm']:
        if args['charlm_shorthand'] is None:
            logger.info(
                "CharLM Shorthand is required for loading pretrained CharLM model..."
            )
            sys.exit(0)
        logger.info('Use pretrained contextualized char embedding')
        args['charlm_forward_file'] = '{}/{}_forward_charlm.pt'.format(
            args['charlm_save_dir'], args['charlm_shorthand'])
        args['charlm_backward_file'] = '{}/{}_backward_charlm.pt'.format(
            args['charlm_save_dir'], args['charlm_shorthand'])

    # load data
    logger.info("Loading data with batch size {}...".format(
        args['batch_size']))
    # context managers close the files as soon as the JSON has been parsed
    with open(args['train_file']) as fin:
        train_doc = Document(json.load(fin))
    train_batch = DataLoader(train_doc,
                             args['batch_size'],
                             args,
                             pretrain,
                             evaluation=False)
    vocab = train_batch.vocab
    with open(args['eval_file']) as fin:
        dev_doc = Document(json.load(fin))
    dev_batch = DataLoader(dev_doc,
                           args['batch_size'],
                           args,
                           pretrain,
                           vocab=vocab,
                           evaluation=True)
    dev_gold_tags = dev_batch.tags

    # skip training if the language does not have training or dev data
    if len(train_batch) == 0 or len(dev_batch) == 0:
        logger.info("Skip training because no data available...")
        sys.exit(0)

    logger.info("Training tagger...")
    trainer = Trainer(args=args,
                      vocab=vocab,
                      pretrain=pretrain,
                      use_cuda=args['cuda'])
    logger.info(trainer.model)

    global_step = 0
    max_steps = args['max_steps']
    dev_score_history = []
    current_lr = trainer.optimizer.param_groups[0]['lr']
    format_str = '{}: step {}/{}, loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'

    # LR scheduling
    if args['lr_decay'] > 0:
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(trainer.optimizer, mode='max', factor=args['lr_decay'], \
            patience=args['patience'], verbose=True, min_lr=args['min_lr'])
    else:
        scheduler = None

    # start training
    train_loss = 0
    while True:
        should_stop = False
        for batch in train_batch:
            start_time = time.time()
            global_step += 1
            loss = trainer.update(batch, eval=False)  # update step
            train_loss += loss
            if global_step % args['log_step'] == 0:
                duration = time.time() - start_time
                logger.info(format_str.format(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), global_step,\
                        max_steps, loss, duration, current_lr))

            if global_step % args['eval_interval'] == 0:
                # eval on dev
                logger.info("Evaluating on dev set...")
                dev_preds = []
                for dev_item in dev_batch:
                    preds = trainer.predict(dev_item)
                    dev_preds += preds
                _, _, dev_score = scorer.score_by_entity(
                    dev_preds, dev_gold_tags)

                train_loss = train_loss / args[
                    'eval_interval']  # avg loss per batch
                logger.info(
                    "step {}: train_loss = {:.6f}, dev_score = {:.4f}".format(
                        global_step, train_loss, dev_score))
                train_loss = 0

                # save best model
                if len(dev_score_history
                       ) == 0 or dev_score > max(dev_score_history):
                    trainer.save(model_file)
                    logger.info("New best model saved.")

                dev_score_history += [dev_score]
                logger.info("")

                # lr schedule
                if scheduler is not None:
                    scheduler.step(dev_score)

            # check stopping
            current_lr = trainer.optimizer.param_groups[0]['lr']
            if global_step >= args['max_steps'] or current_lr <= args['min_lr']:
                should_stop = True
                break

        if should_stop:
            break

        train_batch.reshuffle()

    logger.info("Training ended with {} steps.".format(global_step))

    # Training can stop before the first evaluation (for example when
    # max_steps is smaller than eval_interval); max() of an empty list
    # would raise ValueError in that case.
    if dev_score_history:
        best_f, best_eval = max(dev_score_history) * 100, np.argmax(
            dev_score_history) + 1
        logger.info("Best dev F1 = {:.2f}, at iteration = {}".format(
            best_f, best_eval * args['eval_interval']))
    else:
        logger.warning(
            "Training stopped before any dev evaluation; no model was saved.")
Пример #9
0
def train(args):
    """
    Train a lemmatizer and save it as '<model_dir>/<lang>_lemmatizer.pt'.

    A dictionary-based lemmatizer is trained first and scored on the dev
    set.  If args['dict_only'] is set, only the dictionaries are saved.
    Otherwise a seq2seq model is trained for args['num_epoch'] epochs; the
    model is saved whenever the dev score improves, and for the 'sgd' and
    'adagrad' optimizers the learning rate is multiplied by
    args['lr_decay'] after args['decay_epoch'] whenever the dev score does
    not beat the previous epoch.

    Dev predictions are written to args['output_file'] and scored against
    args['gold_file'].  Note that the dev document held by the dev loader
    has its LEMMA field overwritten with predictions on every evaluation.

    Calls sys.exit(0) when either data loader is empty.
    """
    # load data
    print("[Loading data with batch size {}...]".format(args['batch_size']))
    train_doc = Document(CoNLL.conll2dict(input_file=args['train_file']))
    train_batch = DataLoader(train_doc,
                             args['batch_size'],
                             args,
                             evaluation=False)
    vocab = train_batch.vocab
    args['vocab_size'] = vocab['char'].size
    args['pos_vocab_size'] = vocab['pos'].size
    dev_doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    dev_batch = DataLoader(dev_doc,
                           args['batch_size'],
                           args,
                           vocab=vocab,
                           evaluation=True)

    utils.ensure_dir(args['model_dir'])
    model_file = '{}/{}_lemmatizer.pt'.format(args['model_dir'], args['lang'])

    # pred and gold path
    system_pred_file = args['output_file']
    gold_file = args['gold_file']

    utils.print_config(args)

    # skip training if the language does not have training or dev data
    if len(train_batch) == 0 or len(dev_batch) == 0:
        print("[Skip training because no data available...]")
        sys.exit(0)

    # start training
    # train a dictionary-based lemmatizer
    trainer = Trainer(args=args, vocab=vocab, use_cuda=args['cuda'])
    print("[Training dictionary-based lemmatizer...]")
    trainer.train_dict(train_batch.doc.get([TEXT, UPOS, LEMMA]))
    print("Evaluating on dev set...")
    dev_preds = trainer.predict_dict(dev_batch.doc.get([TEXT, UPOS]))
    dev_batch.doc.set([LEMMA], dev_preds)
    CoNLL.dict2conll(dev_batch.doc.to_dict(), system_pred_file)
    _, _, dev_f = scorer.score(system_pred_file, gold_file)
    print("Dev F1 = {:.2f}".format(dev_f * 100))

    if args.get('dict_only', False):
        # save dictionaries
        trainer.save(model_file)
    else:
        # train a seq2seq model
        print("[Training seq2seq-based lemmatizer...]")
        global_step = 0
        max_steps = len(train_batch) * args['num_epoch']
        dev_score_history = []
        current_lr = args['lr']
        format_str = '{}: step {}/{} (epoch {}/{}), loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'

        # start training
        for epoch in range(1, args['num_epoch'] + 1):
            train_loss = 0
            for batch in train_batch:
                start_time = time.time()
                global_step += 1
                loss = trainer.update(batch, eval=False)  # update step
                train_loss += loss
                if global_step % args['log_step'] == 0:
                    duration = time.time() - start_time
                    print(format_str.format(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), global_step,\
                            max_steps, epoch, args['num_epoch'], loss, duration, current_lr))

            # eval on dev
            print("Evaluating on dev set...")
            dev_preds = []
            dev_edits = []
            for batch in dev_batch:
                preds, edits = trainer.predict(batch, args['beam_size'])
                dev_preds += preds
                if edits is not None:
                    dev_edits += edits
            dev_preds = trainer.postprocess(dev_batch.doc.get([TEXT]),
                                            dev_preds,
                                            edits=dev_edits)

            # try ensembling with dict if necessary
            if args.get('ensemble_dict', False):
                print("[Ensembling dict with seq2seq model...]")
                dev_preds = trainer.ensemble(dev_batch.doc.get([TEXT, UPOS]),
                                             dev_preds)
            dev_batch.doc.set([LEMMA], dev_preds)
            CoNLL.dict2conll(dev_batch.doc.to_dict(), system_pred_file)
            _, _, dev_score = scorer.score(system_pred_file, gold_file)

            train_loss = train_loss / train_batch.num_examples * args[
                'batch_size']  # avg loss per batch
            print("epoch {}: train_loss = {:.6f}, dev_score = {:.4f}".format(
                epoch, train_loss, dev_score))

            # save best model
            if epoch == 1 or dev_score > max(dev_score_history):
                trainer.save(model_file)
                print("new best model saved.")

            # lr schedule
            # The history is empty during the first epoch, so there is no
            # previous score to compare against; without this check a
            # decay_epoch below 1 would raise IndexError on [-1].
            if (epoch > args['decay_epoch'] and dev_score_history
                    and dev_score <= dev_score_history[-1]
                    and args['optim'] in ['sgd', 'adagrad']):
                current_lr *= args['lr_decay']
                trainer.update_lr(current_lr)

            dev_score_history += [dev_score]
            print("")

        print("Training ended with {} epochs.".format(epoch))

        best_f, best_epoch = max(dev_score_history) * 100, np.argmax(
            dev_score_history) + 1
        print("Best dev F1 = {:.2f}, at epoch = {}".format(best_f, best_epoch))
Пример #10
0
def train(args):
    """
    Train a parser and save the best model (by dev score) to the path
    returned by model_file_name(args).

    The training sentences from args['train_file'] are extended with the
    output of augment_punct before batching.  Every args['eval_interval']
    steps the dev predictions (HEAD and DEPREL) are written to
    args['output_file'] and scored against args['gold_file']; the model is
    saved whenever the score beats every earlier one.

    Stopping: once args['max_steps_before_stop'] steps pass without a new
    best score, the optimizer is replaced by Adam with amsgrad=True; if
    the same number of steps passes again without improvement, training
    ends.  Training also ends at args['max_steps'].

    Calls sys.exit(0) when either data loader is empty.
    """
    model_file = model_file_name(args)
    utils.ensure_dir(os.path.split(model_file)[0])

    # load pretrained vectors if needed
    pretrain = load_pretrain(args)

    # load data
    logger.info("Loading data with batch size {}...".format(
        args['batch_size']))
    train_data, _ = CoNLL.conll2dict(input_file=args['train_file'])
    # possibly augment the training data with some amount of fake data
    # based on the options chosen
    logger.info("Original data size: {}".format(len(train_data)))
    train_data.extend(
        augment_punct(train_data,
                      args['augment_nopunct'],
                      keep_original_sentences=False))
    logger.info("Augmented data size: {}".format(len(train_data)))
    train_doc = Document(train_data)
    train_batch = DataLoader(train_doc,
                             args['batch_size'],
                             args,
                             pretrain,
                             evaluation=False)
    vocab = train_batch.vocab
    dev_doc = CoNLL.conll2doc(input_file=args['eval_file'])
    dev_batch = DataLoader(dev_doc,
                           args['batch_size'],
                           args,
                           pretrain,
                           vocab=vocab,
                           evaluation=True,
                           sort_during_eval=True)

    # pred and gold path
    system_pred_file = args['output_file']
    gold_file = args['gold_file']

    # skip training if the language does not have training or dev data
    if len(train_batch) == 0 or len(dev_batch) == 0:
        logger.info("Skip training because no data available...")
        sys.exit(0)

    logger.info("Training parser...")
    trainer = Trainer(args=args,
                      vocab=vocab,
                      pretrain=pretrain,
                      use_cuda=args['cuda'])

    global_step = 0
    max_steps = args['max_steps']
    dev_score_history = []
    current_lr = args['lr']
    format_str = 'Finished STEP {}/{}, loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'

    using_amsgrad = False
    last_best_step = 0
    # start training
    train_loss = 0
    while True:
        do_break = False
        for batch in train_batch:
            start_time = time.time()
            global_step += 1
            loss = trainer.update(batch, eval=False)  # update step
            train_loss += loss
            if global_step % args['log_step'] == 0:
                duration = time.time() - start_time
                logger.info(
                    format_str.format(global_step, max_steps, loss, duration,
                                      current_lr))

            if global_step % args['eval_interval'] == 0:
                # eval on dev
                logger.info("Evaluating on dev set...")
                dev_preds = []
                for dev_item in dev_batch:
                    preds = trainer.predict(dev_item)
                    dev_preds += preds
                # the dev loader sorts during eval; restore the original order
                dev_preds = utils.unsort(dev_preds, dev_batch.data_orig_idx)

                dev_batch.doc.set([HEAD, DEPREL],
                                  [y for x in dev_preds for y in x])
                CoNLL.write_doc2conll(dev_batch.doc, system_pred_file)
                _, _, dev_score = scorer.score(system_pred_file, gold_file)

                train_loss = train_loss / args[
                    'eval_interval']  # avg loss per batch
                logger.info(
                    "step {}: train_loss = {:.6f}, dev_score = {:.4f}".format(
                        global_step, train_loss, dev_score))
                train_loss = 0

                # save best model
                if len(dev_score_history
                       ) == 0 or dev_score > max(dev_score_history):
                    last_best_step = global_step
                    trainer.save(model_file)
                    logger.info("new best model saved.")

                dev_score_history += [dev_score]

            if global_step - last_best_step >= args['max_steps_before_stop']:
                if not using_amsgrad:
                    logger.info("Switching to AMSGrad")
                    # restart the patience counter for the new optimizer
                    last_best_step = global_step
                    using_amsgrad = True
                    trainer.optimizer = optim.Adam(trainer.model.parameters(),
                                                   amsgrad=True,
                                                   lr=args['lr'],
                                                   betas=(.9, args['beta2']),
                                                   eps=1e-6)
                else:
                    do_break = True
                    break

            if global_step >= args['max_steps']:
                do_break = True
                break

        if do_break: break

        train_batch.reshuffle()

    logger.info("Training ended with {} steps.".format(global_step))

    # Training can end before the first evaluation (max_steps or
    # max_steps_before_stop smaller than eval_interval); max() of an empty
    # list would raise ValueError in that case.
    if dev_score_history:
        best_f, best_eval = max(dev_score_history) * 100, np.argmax(
            dev_score_history) + 1
        logger.info("Best dev F1 = {:.2f}, at iteration = {}".format(
            best_f, best_eval * args['eval_interval']))
    else:
        logger.warning(
            "Training stopped before any dev evaluation; no model was saved.")
Пример #11
0
def train(args):
    """
    Train a parser and save the best model (by dev score) to disk.

    The model is written to args['save_dir']/args['save_name'], or to
    '<save_dir>/<shorthand>_parser.pt' when no save name is given.  When
    args['pretrain'] is set, a Pretrain object is built from the word
    vector file and '<save_dir>/<shorthand>.pretrain.pt'.

    Every args['eval_interval'] steps the dev predictions (HEAD and
    DEPREL) are written to args['output_file'] and scored against
    args['gold_file']; the model is saved whenever the score beats every
    earlier one.

    Stopping: once args['max_steps_before_stop'] steps pass without a new
    best score, the optimizer is replaced by Adam with amsgrad=True; if
    the same number of steps passes again without improvement, training
    ends.  Training also ends at args['max_steps'].

    Calls sys.exit(0) when either data loader is empty.
    """
    utils.ensure_dir(args['save_dir'])
    model_file = args['save_dir'] + '/' + args['save_name'] if args['save_name'] is not None \
            else '{}/{}_parser.pt'.format(args['save_dir'], args['shorthand'])

    # load pretrained vectors if needed
    pretrain = None
    if args['pretrain']:
        vec_file = args['wordvec_file'] if args['wordvec_file'] else utils.get_wordvec_file(args['wordvec_dir'], args['shorthand'])
        pretrain_file = '{}/{}.pretrain.pt'.format(args['save_dir'], args['shorthand'])
        pretrain = Pretrain(pretrain_file, vec_file, args['pretrain_max_vocab'])

    # load data
    print("Loading data with batch size {}...".format(args['batch_size']))
    train_doc = Document(CoNLL.conll2dict(input_file=args['train_file']))
    train_batch = DataLoader(train_doc, args['batch_size'], args, pretrain, evaluation=False)
    vocab = train_batch.vocab
    dev_doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    dev_batch = DataLoader(dev_doc, args['batch_size'], args, pretrain, vocab=vocab, evaluation=True, sort_during_eval=True)

    # pred and gold path
    system_pred_file = args['output_file']
    gold_file = args['gold_file']

    # skip training if the language does not have training or dev data
    if len(train_batch) == 0 or len(dev_batch) == 0:
        print("Skip training because no data available...")
        sys.exit(0)

    print("Training parser...")
    trainer = Trainer(args=args, vocab=vocab, pretrain=pretrain, use_cuda=args['cuda'])

    global_step = 0
    max_steps = args['max_steps']
    dev_score_history = []
    current_lr = args['lr']
    format_str = '{}: step {}/{}, loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'

    using_amsgrad = False
    last_best_step = 0
    # start training
    train_loss = 0
    while True:
        do_break = False
        for batch in train_batch:
            start_time = time.time()
            global_step += 1
            loss = trainer.update(batch, eval=False) # update step
            train_loss += loss
            if global_step % args['log_step'] == 0:
                duration = time.time() - start_time
                print(format_str.format(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), global_step,\
                        max_steps, loss, duration, current_lr))

            if global_step % args['eval_interval'] == 0:
                # eval on dev
                print("Evaluating on dev set...")
                dev_preds = []
                for dev_item in dev_batch:
                    preds = trainer.predict(dev_item)
                    dev_preds += preds
                # the dev loader sorts during eval; restore the original order
                dev_preds = utils.unsort(dev_preds, dev_batch.data_orig_idx)

                dev_batch.doc.set([HEAD, DEPREL], [y for x in dev_preds for y in x])
                CoNLL.dict2conll(dev_batch.doc.to_dict(), system_pred_file)
                _, _, dev_score = scorer.score(system_pred_file, gold_file)

                train_loss = train_loss / args['eval_interval'] # avg loss per batch
                print("step {}: train_loss = {:.6f}, dev_score = {:.4f}".format(global_step, train_loss, dev_score))
                train_loss = 0

                # save best model
                if len(dev_score_history) == 0 or dev_score > max(dev_score_history):
                    last_best_step = global_step
                    trainer.save(model_file)
                    print("new best model saved.")

                dev_score_history += [dev_score]
                print("")

            if global_step - last_best_step >= args['max_steps_before_stop']:
                if not using_amsgrad:
                    print("Switching to AMSGrad")
                    # restart the patience counter for the new optimizer
                    last_best_step = global_step
                    using_amsgrad = True
                    trainer.optimizer = optim.Adam(trainer.model.parameters(), amsgrad=True, lr=args['lr'], betas=(.9, args['beta2']), eps=1e-6)
                else:
                    do_break = True
                    break

            if global_step >= args['max_steps']:
                do_break = True
                break

        if do_break: break

        train_batch.reshuffle()

    print("Training ended with {} steps.".format(global_step))

    # Training can end before the first evaluation (max_steps or
    # max_steps_before_stop smaller than eval_interval); max() of an empty
    # list would raise ValueError in that case.
    if dev_score_history:
        best_f, best_eval = max(dev_score_history)*100, np.argmax(dev_score_history)+1
        print("Best dev F1 = {:.2f}, at iteration = {}".format(best_f, best_eval * args['eval_interval']))
    else:
        print("Training stopped before any dev evaluation; no model was saved.")
Пример #12
0
def train(args):
    """
    Train an NER tagger and save the best-scoring model on the dev set.

    args is a dict of options.  A model is either loaded for finetuning
    (from args['finetune_load_name'], or from the default save location if
    it already exists) or built from scratch using pretrained word vectors
    and, optionally, pretrained character language models.

    Training runs until args['max_steps'] steps have been taken or the
    learning rate has decayed to args['min_lr'].  Every
    args['eval_interval'] steps the dev set is scored, and the model is
    saved whenever the dev score beats all previous dev scores.

    Raises FileNotFoundError if finetuning is requested but no model file
    can be found, and ValueError if a charlm is requested without
    args['charlm_shorthand'].  Exits the process with status 0 if either
    the train or dev data is empty.
    """
    utils.ensure_dir(args['save_dir'])
    if args['save_name'] is not None:
        model_file = os.path.join(args['save_dir'], args['save_name'])
    else:
        model_file = '{}/{}_nertagger.pt'.format(args['save_dir'], args['shorthand'])

    pretrain = None
    vocab = None
    trainer = None

    if args['finetune'] and args['finetune_load_name']:
        logger.warning('Finetune is ON. Using model from "{}"'.format(args['finetune_load_name']))
        _, trainer, vocab = load_model(args, args['finetune_load_name'])
    elif args['finetune'] and os.path.exists(model_file):
        logger.warning('Finetune is ON. Using model from "{}"'.format(model_file))
        _, trainer, vocab = load_model(args, model_file)
    else:
        if args['finetune']:
            raise FileNotFoundError('Finetune is set to true but model file is not found: {}'.format(model_file))

        # load pretrained vectors
        if args['wordvec_pretrain_file']:
            pretrain_file = args['wordvec_pretrain_file']
            pretrain = Pretrain(pretrain_file, None, args['pretrain_max_vocab'], save_to_file=False)
        else:
            if len(args['wordvec_file']) == 0:
                vec_file = utils.get_wordvec_file(args['wordvec_dir'], args['shorthand'])
            else:
                vec_file = args['wordvec_file']
            # do not save pretrained embeddings individually
            pretrain = Pretrain(None, vec_file, args['pretrain_max_vocab'], save_to_file=False)

        if pretrain is not None:
            # the embedding file decides the word embedding size, overriding the args
            word_emb_dim = pretrain.emb.shape[1]
            if args['word_emb_dim'] and args['word_emb_dim'] != word_emb_dim:
                logger.warning("Embedding file has a dimension of {}.  Model will be built with that size instead of {}".format(word_emb_dim, args['word_emb_dim']))
            args['word_emb_dim'] = word_emb_dim

        if args['charlm']:
            if args['charlm_shorthand'] is None:
                raise ValueError("CharLM Shorthand is required for loading pretrained CharLM model...")
            logger.info('Using pretrained contextualized char embedding')
            # fill in default charlm paths only where none were given
            if not args['charlm_forward_file']:
                args['charlm_forward_file'] = '{}/{}_forward_charlm.pt'.format(args['charlm_save_dir'], args['charlm_shorthand'])
            if not args['charlm_backward_file']:
                args['charlm_backward_file'] = '{}/{}_backward_charlm.pt'.format(args['charlm_save_dir'], args['charlm_shorthand'])

    # load data
    logger.info("Loading data with batch size {}...".format(args['batch_size']))
    # use context managers so the data files are closed as soon as they are parsed
    with open(args['train_file']) as fin:
        train_doc = Document(json.load(fin))
    train_batch = DataLoader(train_doc, args['batch_size'], args, pretrain, vocab=vocab, evaluation=False)
    vocab = train_batch.vocab
    with open(args['eval_file']) as fin:
        dev_doc = Document(json.load(fin))
    dev_batch = DataLoader(dev_doc, args['batch_size'], args, pretrain, vocab=vocab, evaluation=True)
    dev_gold_tags = dev_batch.tags

    if args['finetune']:
        utils.warn_missing_tags([i for i in trainer.vocab['tag']], train_batch.tags, "training set")
    utils.warn_missing_tags(train_batch.tags, dev_batch.tags, "dev set")

    # skip training if the language does not have training or dev data
    if len(train_batch) == 0 or len(dev_batch) == 0:
        logger.info("Skip training because no data available...")
        sys.exit(0)

    logger.info("Training tagger...")
    if trainer is None: # init if model was not loaded previously from file
        trainer = Trainer(args=args, vocab=vocab, pretrain=pretrain, use_cuda=args['cuda'],
                          train_classifier_only=args['train_classifier_only'])
    logger.info(trainer.model)

    global_step = 0
    max_steps = args['max_steps']
    dev_score_history = []
    current_lr = trainer.optimizer.param_groups[0]['lr']
    format_str = '{}: step {}/{}, loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'

    # LR scheduling
    if args['lr_decay'] > 0:
        # mode='max' because the monitored quantity is the dev score
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            trainer.optimizer, mode='max', factor=args['lr_decay'],
            patience=args['patience'], verbose=True, min_lr=args['min_lr'])
    else:
        scheduler = None

    # start training
    train_loss = 0
    while True:
        should_stop = False
        for batch in train_batch:
            start_time = time.time()
            global_step += 1
            loss = trainer.update(batch, eval=False) # update step
            train_loss += loss
            if global_step % args['log_step'] == 0:
                duration = time.time() - start_time
                logger.info(format_str.format(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), global_step,
                                              max_steps, loss, duration, current_lr))

            if global_step % args['eval_interval'] == 0:
                # eval on dev
                logger.info("Evaluating on dev set...")
                dev_preds = []
                # use a distinct name so the training batch is not shadowed
                for dev_chunk in dev_batch:
                    dev_preds += trainer.predict(dev_chunk)
                _, _, dev_score = scorer.score_by_entity(dev_preds, dev_gold_tags)

                train_loss = train_loss / args['eval_interval'] # avg loss per batch
                logger.info("step {}: train_loss = {:.6f}, dev_score = {:.4f}".format(global_step, train_loss, dev_score))
                train_loss = 0

                # save best model
                if not dev_score_history or dev_score > max(dev_score_history):
                    trainer.save(model_file)
                    logger.info("New best model saved.")

                dev_score_history += [dev_score]
                logger.info("")

                # lr schedule
                if scheduler is not None:
                    scheduler.step(dev_score)

            # check stopping: out of steps, or the scheduler has decayed the lr to its floor
            current_lr = trainer.optimizer.param_groups[0]['lr']
            if global_step >= max_steps or current_lr <= args['min_lr']:
                should_stop = True
                break

        if should_stop:
            break

        train_batch.reshuffle()

    logger.info("Training ended with {} steps.".format(global_step))

    # no summary if training stopped before the first dev evaluation
    if dev_score_history:
        best_f, best_eval = max(dev_score_history)*100, np.argmax(dev_score_history)+1
        logger.info("Best dev F1 = {:.2f}, at iteration = {}".format(best_f, best_eval * args['eval_interval']))
Пример #13
0
def main():
    """
    Train and/or evaluate a CNN classifier based on command line arguments.

    If args.train is set, the training set is read and a model is trained,
    starting either from the model in args.load_name or from a newly built
    CNNClassifier.  Otherwise a model must be supplied via args.load_name.
    In both cases the resulting model is then scored on args.test_file.

    Raises ValueError if neither training nor a model to load is requested.
    """
    args = parse_args()
    seed = utils.set_random_seed(args.seed, args.cuda)
    logger.info("Using random seed: %d", seed)

    utils.ensure_dir(args.save_dir)

    # TODO: maybe the dataset needs to be in a torch data loader in order to
    # make cuda operations faster
    if args.train:
        train_set = read_dataset(args.train_file, args.wordvec_type,
                                 args.min_train_len)
        logger.info("Using training set: %s", args.train_file)
        logger.info("Training set has %d labels",
                    len(dataset_labels(train_set)))
    elif not args.load_name:
        raise ValueError(
            "No model provided and not asked to train a model.  This makes no sense"
        )
    else:
        train_set = None

    pretrain = load_pretrain(args)

    if args.load_name:
        model = cnn_classifier.load(args.load_name, pretrain)
    else:
        assert train_set is not None
        labels = dataset_labels(train_set)
        extra_vocab = dataset_vocab(train_set)
        model = cnn_classifier.CNNClassifier(pretrain.emb, pretrain.vocab,
                                             extra_vocab, labels, args)

    if args.cuda:
        model.cuda()

    logger.info("Filter sizes: %s", str(model.config.filter_sizes))
    logger.info("Filter channels: %s", str(model.config.filter_channels))
    logger.info("Intermediate layers: %s", str(model.config.fc_shapes))

    save_name = args.save_name
    if not save_name:
        # no explicit name given: derive one from the model configuration
        save_name = args.base_name + "_" + args.shorthand + "_"
        save_name = save_name + "FS_%s_" % "_".join(
            [str(x) for x in model.config.filter_sizes])
        save_name = save_name + "C_%d_" % model.config.filter_channels
        if model.config.fc_shapes:
            save_name = save_name + "FC_%s_" % "_".join(
                [str(x) for x in model.config.fc_shapes])
        save_name = save_name + "classifier.pt"
    model_file = os.path.join(args.save_dir, save_name)

    if args.train:
        print_args(args)

        dev_set = read_dataset(args.dev_file, args.wordvec_type, min_len=None)
        logger.info("Using dev set: %s", args.dev_file)
        check_labels(model.labels, dev_set)

        train_model(model, model_file, args, train_set, dev_set, model.labels)

    test_set = read_dataset(args.test_file, args.wordvec_type, min_len=None)
    logger.info("Using test set: %s", args.test_file)
    check_labels(model.labels, test_set)

    if args.test_remap_labels is None:
        confusion = confusion_dataset(model, test_set)
        logger.info("Confusion matrix:\n%s",
                    format_confusion(confusion, model.labels))
        correct, total = confusion_to_accuracy(confusion)
        logger.info("Macro f1: %s", confusion_to_macro_f1(confusion))
    else:
        correct = score_dataset(
            model,
            test_set,
            remap_labels=args.test_remap_labels,
            forgive_unmapped_labels=args.forgive_unmapped_labels)
        total = len(test_set)

    if total:
        logger.info("Test set: %d correct of %d examples.  Accuracy: %f",
                    correct, total, correct / total)
    else:
        # an empty test set has no meaningful accuracy; avoid dividing by zero
        logger.warning("Test set: no examples to score, accuracy is undefined")