Example #1
    def __init__(self, args=None, feature_config=None):
        if args is None:
            args = DEFAULT_PARSER_ARGS.copy()
        if feature_config is None:
            feature_config = FEATURE_CONFIG.copy()
        self.feature_config = feature_config
        model_file = args['save_dir'] + '/' + args['save_name'] if args['save_name'] is not None \
            else '{}/{}_parser.pt'.format(args['save_dir'], args['shorthand'])

        # load pretrain; note that we allow the pretrain_file to be non-existent
        pretrain_file = '{}/{}.pretrain.pt'.format(args['save_dir'],
                                                   args['shorthand'])
        self.pretrain = Pretrain(pretrain_file)

        # load model
        print("Loading model from: {}".format(model_file))
        use_cuda = args['cuda'] and not args['cpu']
        self.trainer = Trainer(pretrain=self.pretrain,
                               model_file=model_file,
                               use_cuda=use_cuda)
        self.loaded_args, self.vocab = self.trainer.args, self.trainer.vocab
        self.batch_size = args['batch_size']

        # load config
        for k in args:
            if k.endswith('_dir') or k.endswith('_file') or k in ('shorthand', 'mode'):
                self.loaded_args[k] = args[k]
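
For reference, a minimal sketch of the kind of args dictionary this constructor reads; the keys mirror the lookups above, but the values and paths are placeholders, not the real DEFAULT_PARSER_ARGS:

args = {
    'save_dir': 'saved_models/depparse',  # placeholder directory
    'save_name': None,                    # None -> fall back to '<shorthand>_parser.pt'
    'shorthand': 'en_ewt',                # treebank shorthand used in file names
    'batch_size': 5000,
    'cuda': True,                         # combined with 'cpu' to decide use_cuda
    'cpu': False,
}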
Example #2
    def _set_up_model(self, config, use_gpu):
        # get pretrained word vectors
        self._pretrain = Pretrain(config['pretrain_path'])
        # set up trainer
        self._trainer = Trainer(pretrain=self.pretrain,
                                model_file=config['model_path'],
                                use_cuda=use_gpu)
Example #3
    def _set_up_model(self, config, use_gpu):
        # get pretrained word vectors
        self._pretrain = Pretrain(config['pretrain_path']) if 'pretrain_path' in config else None
        # set up trainer
        self._trainer = Trainer(pretrain=self.pretrain,
                                model_file=config['model_path'],
                                use_cuda=use_gpu)
        self._tqdm = 'tqdm' in config and config['tqdm']
Example #4
    def _set_up_model(self, config, use_gpu):
        # get pretrained word vectors
        self._pretrain = Pretrain(config['pretrain_path']) if 'pretrain_path' in config else None
        self._n_pred = config['n_pred'] if 'n_pred' in config else 1
        # set up trainer
        self._trainer = Trainer(pretrain=self.pretrain,
                                model_file=config['model_path'],
                                use_cuda=use_gpu,
                                n_pred=self._n_pred)
Example #5
def evaluate(args):
    # file paths
    system_pred_file = args['output_file']
    gold_file = args['gold_file']
    model_file = args['save_dir'] + '/' + args['save_name'] if args['save_name'] is not None \
            else '{}/{}_tagger.pt'.format(args['save_dir'], args['shorthand'])

    # load pretrain; note that we allow the pretrain_file to be non-existent
    pretrain_file = '{}/{}.pretrain.pt'.format(args['save_dir'],
                                               args['shorthand'])
    pretrain = Pretrain(pretrain_file)

    # load model
    print("Loading model from: {}".format(model_file))
    use_cuda = args['cuda'] and not args['cpu']
    trainer = Trainer(pretrain=pretrain,
                      model_file=model_file,
                      use_cuda=use_cuda)
    loaded_args, vocab = trainer.args, trainer.vocab

    # load config
    for k in args:
        if k.endswith('_dir') or k.endswith('_file') or k in ('shorthand', 'mode'):
            loaded_args[k] = args[k]

    # load data
    print("Loading data with batch size {}...".format(args['batch_size']))
    doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    batch = DataLoader(doc,
                       args['batch_size'],
                       loaded_args,
                       pretrain,
                       vocab=vocab,
                       evaluation=True,
                       sort_during_eval=True)
    if len(batch) > 0:
        print("Start evaluation...")
        preds = []
        for i, b in enumerate(batch):
            preds += trainer.predict(b)
    else:
        # skip eval if dev data does not exist
        preds = []
    preds = utils.unsort(preds, batch.data_orig_idx)

    # write to file and score
    batch.doc.set([UPOS, XPOS, FEATS], [y for x in preds for y in x])
    CoNLL.dict2conll(batch.doc.to_dict(), system_pred_file)

    if gold_file is not None:
        _, _, score = scorer.score(system_pred_file, gold_file)

        print("Tagger score:")
        print("{} {:.2f}".format(args['shorthand'], score * 100))
Example #6
    def _set_up_model(self, config, use_gpu):
        # get pretrained word vectors
        self._pretrain = Pretrain(config['pretrain_path'])
        # set up model
        self._model = cnn_classifier.load(filename=config['model_path'],
                                          pretrain=self._pretrain)
        self._batch_size = config.get('batch_size', None)

        # TODO: move this call to load()
        if use_gpu:
            self._model.cuda()
Example #7
def load_pretrain(args):
    pretrain = None
    if args['pretrain']:
        if args['wordvec_pretrain_file']:
            pretrain_file = args['wordvec_pretrain_file']
        else:
            pretrain_file = '{}/{}.pretrain.pt'.format(args['save_dir'], args['shorthand'])
        if os.path.exists(pretrain_file):
            vec_file = None
        else:
            vec_file = args['wordvec_file'] if args['wordvec_file'] else utils.get_wordvec_file(args['wordvec_dir'], args['shorthand'])
        pretrain = Pretrain(pretrain_file, vec_file, args['pretrain_max_vocab'])
    return pretrain
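
A rough usage sketch of the helper above; the dictionary keys are the ones load_pretrain reads, while every value is a placeholder rather than a path taken from this listing:

args = {
    'pretrain': True,
    'wordvec_pretrain_file': '',           # empty -> build the path from save_dir and shorthand
    'save_dir': 'saved_models/pos',        # placeholder
    'shorthand': 'en_ewt',
    'wordvec_file': '',                    # empty -> resolve via utils.get_wordvec_file
    'wordvec_dir': 'extern_data/wordvec',  # placeholder
    'pretrain_max_vocab': 250000,
}
pretrain = load_pretrain(args)
if pretrain is not None:
    print(pretrain.emb.shape)  # embedding matrix, as inspected in Example #11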
Example #8
    def _set_up_model(self, config, use_gpu):
        self._pretrain = Pretrain(config['pretrain_path']) if 'pretrain_path' in config else None
        n_parses = config['n_parses'] if 'n_parses' in config else 3
        kalm_shuffle = config['kalm_shuffle'] if 'kalm_shuffle' in config else False
        automatic_n_parses = config['automatic_n_parses'] if 'automatic_n_parses' in config else False
        self._trainer = Trainer(pretrain=self.pretrain,
                                model_file=config['model_path'],
                                use_cuda=use_gpu,
                                n_parses=n_parses,
                                kalm_shuffle=kalm_shuffle,
                                automatic_n_parses=automatic_n_parses)
Example #9
    def _set_up_model(self, config, use_gpu):
        # get pretrained word vectors
        pretrain_path = config.get('pretrain_path', None)
        self._pretrain = Pretrain(pretrain_path) if pretrain_path else None
        # set up model
        charlm_forward_file = config.get('forward_charlm_path', None)
        charlm_backward_file = config.get('backward_charlm_path', None)
        self._model = trainer.Trainer.load(
            filename=config['model_path'],
            pt=self._pretrain,
            forward_charlm=trainer.load_charlm(charlm_forward_file),
            backward_charlm=trainer.load_charlm(charlm_backward_file),
            use_gpu=use_gpu)
        # batch size counted as sentences
        self._batch_size = config.get('batch_size', ConstituencyProcessor.DEFAULT_BATCH_SIZE)
Example #10
    def _set_up_model(self, config, use_gpu):
        # get pretrained word vectors
        pretrain_path = config.get('pretrain_path', None)
        self._pretrain = Pretrain(pretrain_path) if pretrain_path else None
        forward_charlm_path = config.get('forward_charlm_path', None)
        charmodel_forward = CharacterLanguageModel.load(forward_charlm_path, finetune=False) if forward_charlm_path else None
        backward_charlm_path = config.get('backward_charlm_path', None)
        charmodel_backward = CharacterLanguageModel.load(backward_charlm_path, finetune=False) if backward_charlm_path else None
        # set up model
        self._model = cnn_classifier.load(filename=config['model_path'],
                                          pretrain=self._pretrain,
                                          charmodel_forward=charmodel_forward,
                                          charmodel_backward=charmodel_backward)
        # batch size counted as words
        self._batch_size = config.get('batch_size', SentimentProcessor.DEFAULT_BATCH_SIZE)

        # TODO: move this call to load()
        if use_gpu:
            self._model.cuda()
Example #11
def load_pretrain(args):
    if args.wordvec_pretrain_file:
        pretrain_file = args.wordvec_pretrain_file
    elif args.wordvec_type:
        pretrain_file = '{}/{}.{}.pretrain.pt'.format(
            args.save_dir, args.shorthand, args.wordvec_type.name.lower())
    else:
        raise Exception(
            "TODO: need to get the wv type back from get_wordvec_file")

    logger.info("Looking for pretrained vectors in {}".format(pretrain_file))
    if os.path.exists(pretrain_file):
        vec_file = None
    elif args.wordvec_raw_file:
        vec_file = args.wordvec_raw_file
        logger.info("Pretrain not found.  Looking in {}".format(vec_file))
    else:
        vec_file = utils.get_wordvec_file(args.wordvec_dir, args.shorthand,
                                          args.wordvec_type.name.lower())
        logger.info("Pretrain not found.  Looking in {}".format(vec_file))
    pretrain = Pretrain(pretrain_file, vec_file, args.pretrain_max_vocab)
    logger.info("Embedding shape: %s" % str(pretrain.emb.shape))
    return pretrain
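
Unlike Example #7, this variant expects attribute-style access (an argparse namespace) and an enum-like wordvec_type. A hedged sketch of the inputs it reads, with a stand-in object for the enum and placeholder paths:

from argparse import Namespace

class _FakeWVType:
    # stand-in exposing only the .name attribute read above; not a real stanza enum
    name = 'WORD2VEC'

args = Namespace(
    wordvec_pretrain_file='',              # empty -> derive the path from save_dir/shorthand/type
    wordvec_type=_FakeWVType(),
    save_dir='saved_models/constituency',  # placeholder
    shorthand='en_ewt',
    wordvec_raw_file='',
    wordvec_dir='extern_data/wordvec',     # placeholder
    pretrain_max_vocab=250000,
)
pretrain = load_pretrain(args)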
Example #12
def train(args):
    utils.ensure_dir(args['save_dir'])
    model_file = args['save_dir'] + '/' + args['save_name'] if args['save_name'] is not None \
            else '{}/{}_nertagger.pt'.format(args['save_dir'], args['shorthand'])

    # load pretrained vectors
    if len(args['wordvec_file']) == 0:
        vec_file = utils.get_wordvec_file(args['wordvec_dir'],
                                          args['shorthand'])
    else:
        vec_file = args['wordvec_file']
    # do not save pretrained embeddings individually
    pretrain = Pretrain(None,
                        vec_file,
                        args['pretrain_max_vocab'],
                        save_to_file=False)

    if args['charlm']:
        if args['charlm_shorthand'] is None:
            logger.error("CharLM shorthand is required for loading a pretrained CharLM model.")
            sys.exit(1)
        logger.info('Using pretrained contextualized char embedding')
        args['charlm_forward_file'] = '{}/{}_forward_charlm.pt'.format(
            args['charlm_save_dir'], args['charlm_shorthand'])
        args['charlm_backward_file'] = '{}/{}_backward_charlm.pt'.format(
            args['charlm_save_dir'], args['charlm_shorthand'])

    # load data
    logger.info("Loading data with batch size {}...".format(
        args['batch_size']))
    train_doc = Document(json.load(open(args['train_file'])))
    train_batch = DataLoader(train_doc,
                             args['batch_size'],
                             args,
                             pretrain,
                             evaluation=False)
    vocab = train_batch.vocab
    dev_doc = Document(json.load(open(args['eval_file'])))
    dev_batch = DataLoader(dev_doc,
                           args['batch_size'],
                           args,
                           pretrain,
                           vocab=vocab,
                           evaluation=True)
    dev_gold_tags = dev_batch.tags

    # skip training if the language does not have training or dev data
    if len(train_batch) == 0 or len(dev_batch) == 0:
        logger.info("Skip training because no data available...")
        sys.exit(0)

    logger.info("Training tagger...")
    trainer = Trainer(args=args,
                      vocab=vocab,
                      pretrain=pretrain,
                      use_cuda=args['cuda'])
    logger.info(trainer.model)

    global_step = 0
    max_steps = args['max_steps']
    dev_score_history = []
    best_dev_preds = []
    current_lr = trainer.optimizer.param_groups[0]['lr']
    global_start_time = time.time()
    format_str = '{}: step {}/{}, loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'

    # LR scheduling
    if args['lr_decay'] > 0:
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(trainer.optimizer, mode='max', factor=args['lr_decay'], \
            patience=args['patience'], verbose=True, min_lr=args['min_lr'])
    else:
        scheduler = None

    # start training
    train_loss = 0
    while True:
        should_stop = False
        for i, batch in enumerate(train_batch):
            start_time = time.time()
            global_step += 1
            loss = trainer.update(batch, eval=False)  # update step
            train_loss += loss
            if global_step % args['log_step'] == 0:
                duration = time.time() - start_time
                logger.info(format_str.format(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), global_step,\
                        max_steps, loss, duration, current_lr))

            if global_step % args['eval_interval'] == 0:
                # eval on dev
                logger.info("Evaluating on dev set...")
                dev_preds = []
                for batch in dev_batch:
                    preds = trainer.predict(batch)
                    dev_preds += preds
                _, _, dev_score = scorer.score_by_entity(dev_preds, dev_gold_tags)

                train_loss = train_loss / args['eval_interval']  # avg loss per batch
                logger.info("step {}: train_loss = {:.6f}, dev_score = {:.4f}".format(global_step, train_loss, dev_score))
                train_loss = 0

                # save best model
                if len(dev_score_history) == 0 or dev_score > max(dev_score_history):
                    trainer.save(model_file)
                    logger.info("New best model saved.")
                    best_dev_preds = dev_preds

                dev_score_history += [dev_score]
                logger.info("")

                # lr schedule
                if scheduler is not None:
                    scheduler.step(dev_score)

            # check stopping
            current_lr = trainer.optimizer.param_groups[0]['lr']
            if global_step >= args['max_steps'] or current_lr <= args['min_lr']:
                should_stop = True
                break

        if should_stop:
            break

        train_batch.reshuffle()

    logger.info("Training ended with {} steps.".format(global_step))

    best_f, best_eval = max(dev_score_history) * 100, np.argmax(
        dev_score_history) + 1
    logger.info("Best dev F1 = {:.2f}, at iteration = {}".format(
        best_f, best_eval * args['eval_interval']))
Example #13
def train(args):
    utils.ensure_dir(args['save_dir'])
    model_file = args['save_dir'] + '/' + args['save_name'] if args['save_name'] is not None \
            else '{}/{}_parser.pt'.format(args['save_dir'], args['shorthand'])

    # load pretrained vectors if needed
    pretrain = None
    if args['pretrain']:
        vec_file = args['wordvec_file'] if args['wordvec_file'] else utils.get_wordvec_file(args['wordvec_dir'], args['shorthand'])
        pretrain_file = '{}/{}.pretrain.pt'.format(args['save_dir'], args['shorthand'])
        pretrain = Pretrain(pretrain_file, vec_file, args['pretrain_max_vocab'])

    # load data
    print("Loading data with batch size {}...".format(args['batch_size']))
    train_doc = Document(CoNLL.conll2dict(input_file=args['train_file']))
    train_batch = DataLoader(train_doc, args['batch_size'], args, pretrain, evaluation=False)
    vocab = train_batch.vocab
    dev_doc = Document(CoNLL.conll2dict(input_file=args['eval_file']))
    dev_batch = DataLoader(dev_doc, args['batch_size'], args, pretrain, vocab=vocab, evaluation=True, sort_during_eval=True)

    # pred and gold path
    system_pred_file = args['output_file']
    gold_file = args['gold_file']

    # skip training if the language does not have training or dev data
    if len(train_batch) == 0 or len(dev_batch) == 0:
        print("Skip training because no data available...")
        sys.exit(0)

    print("Training parser...")
    trainer = Trainer(args=args, vocab=vocab, pretrain=pretrain, use_cuda=args['cuda'])

    global_step = 0
    max_steps = args['max_steps']
    dev_score_history = []
    best_dev_preds = []
    current_lr = args['lr']
    global_start_time = time.time()
    format_str = '{}: step {}/{}, loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'

    using_amsgrad = False
    last_best_step = 0
    # start training
    train_loss = 0
    while True:
        do_break = False
        for i, batch in enumerate(train_batch):
            start_time = time.time()
            global_step += 1
            loss = trainer.update(batch, eval=False) # update step
            train_loss += loss
            if global_step % args['log_step'] == 0:
                duration = time.time() - start_time
                print(format_str.format(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), global_step,\
                        max_steps, loss, duration, current_lr))

            if global_step % args['eval_interval'] == 0:
                # eval on dev
                print("Evaluating on dev set...")
                dev_preds = []
                for batch in dev_batch:
                    preds = trainer.predict(batch)
                    dev_preds += preds
                dev_preds = utils.unsort(dev_preds, dev_batch.data_orig_idx)

                dev_batch.doc.set([HEAD, DEPREL], [y for x in dev_preds for y in x])
                CoNLL.dict2conll(dev_batch.doc.to_dict(), system_pred_file)
                _, _, dev_score = scorer.score(system_pred_file, gold_file)

                train_loss = train_loss / args['eval_interval'] # avg loss per batch
                print("step {}: train_loss = {:.6f}, dev_score = {:.4f}".format(global_step, train_loss, dev_score))
                train_loss = 0

                # save best model
                if len(dev_score_history) == 0 or dev_score > max(dev_score_history):
                    last_best_step = global_step
                    trainer.save(model_file)
                    print("new best model saved.")
                    best_dev_preds = dev_preds

                dev_score_history += [dev_score]
                print("")

            if global_step - last_best_step >= args['max_steps_before_stop']:
                if not using_amsgrad:
                    print("Switching to AMSGrad")
                    last_best_step = global_step
                    using_amsgrad = True
                    trainer.optimizer = optim.Adam(trainer.model.parameters(), amsgrad=True, lr=args['lr'], betas=(.9, args['beta2']), eps=1e-6)
                else:
                    do_break = True
                    break

            if global_step >= args['max_steps']:
                do_break = True
                break

        if do_break: break

        train_batch.reshuffle()

    print("Training ended with {} steps.".format(global_step))

    best_f, best_eval = max(dev_score_history)*100, np.argmax(dev_score_history)+1
    print("Best dev F1 = {:.2f}, at iteration = {}".format(best_f, best_eval * args['eval_interval']))
Example #14
    def _set_up_model(self, config, use_gpu):
        self._pretrain = Pretrain(config['pretrain_path']) if 'pretrain_path' in config else None
        self._trainer = Trainer(pretrain=self.pretrain,
                                model_file=config['model_path'],
                                use_cuda=use_gpu)
Example #15
def train(args):
    utils.ensure_dir(args['save_dir'])
    model_file = os.path.join(args['save_dir'], args['save_name']) if args['save_name'] is not None \
        else '{}/{}_nertagger.pt'.format(args['save_dir'], args['shorthand'])

    pretrain = None
    vocab = None
    trainer = None

    if args['finetune'] and args['finetune_load_name']:
        logger.warning('Finetune is ON. Using model from "{}"'.format(args['finetune_load_name']))
        _, trainer, vocab = load_model(args, args['finetune_load_name'])
    elif args['finetune'] and os.path.exists(model_file):
        logger.warning('Finetune is ON. Using model from "{}"'.format(model_file))
        _, trainer, vocab = load_model(args, model_file)
    else:
        if args['finetune']:
            raise FileNotFoundError('Finetune is set to true but model file is not found: {}'.format(model_file))

        # load pretrained vectors
        if args['wordvec_pretrain_file']:
            pretrain_file = args['wordvec_pretrain_file']
            pretrain = Pretrain(pretrain_file, None, args['pretrain_max_vocab'], save_to_file=False)
        else:
            if len(args['wordvec_file']) == 0:
                vec_file = utils.get_wordvec_file(args['wordvec_dir'], args['shorthand'])
            else:
                vec_file = args['wordvec_file']
            # do not save pretrained embeddings individually
            pretrain = Pretrain(None, vec_file, args['pretrain_max_vocab'], save_to_file=False)

        if pretrain is not None:
            word_emb_dim = pretrain.emb.shape[1]
            if args['word_emb_dim'] and args['word_emb_dim'] != word_emb_dim:
                logger.warning("Embedding file has a dimension of {}.  Model will be built with that size instead of {}".format(word_emb_dim, args['word_emb_dim']))
            args['word_emb_dim'] = word_emb_dim

        if args['charlm']:
            if args['charlm_shorthand'] is None:
                raise ValueError("CharLM Shorthand is required for loading pretrained CharLM model...")
            logger.info('Using pretrained contextualized char embedding')
            if not args['charlm_forward_file']:
                args['charlm_forward_file'] = '{}/{}_forward_charlm.pt'.format(args['charlm_save_dir'], args['charlm_shorthand'])
            if not args['charlm_backward_file']:
                args['charlm_backward_file'] = '{}/{}_backward_charlm.pt'.format(args['charlm_save_dir'], args['charlm_shorthand'])

    # load data
    logger.info("Loading data with batch size {}...".format(args['batch_size']))
    train_doc = Document(json.load(open(args['train_file'])))
    train_batch = DataLoader(train_doc, args['batch_size'], args, pretrain, vocab=vocab, evaluation=False)
    vocab = train_batch.vocab
    dev_doc = Document(json.load(open(args['eval_file'])))
    dev_batch = DataLoader(dev_doc, args['batch_size'], args, pretrain, vocab=vocab, evaluation=True)
    dev_gold_tags = dev_batch.tags

    if args['finetune']:
        utils.warn_missing_tags([i for i in trainer.vocab['tag']], train_batch.tags, "training set")
    utils.warn_missing_tags(train_batch.tags, dev_batch.tags, "dev set")

    # skip training if the language does not have training or dev data
    if len(train_batch) == 0 or len(dev_batch) == 0:
        logger.info("Skip training because no data available...")
        sys.exit(0)

    logger.info("Training tagger...")
    if trainer is None: # init if model was not loaded previously from file
        trainer = Trainer(args=args, vocab=vocab, pretrain=pretrain, use_cuda=args['cuda'],
                          train_classifier_only=args['train_classifier_only'])
    logger.info(trainer.model)

    global_step = 0
    max_steps = args['max_steps']
    dev_score_history = []
    best_dev_preds = []
    current_lr = trainer.optimizer.param_groups[0]['lr']
    global_start_time = time.time()
    format_str = '{}: step {}/{}, loss = {:.6f} ({:.3f} sec/batch), lr: {:.6f}'

    # LR scheduling
    if args['lr_decay'] > 0:
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(trainer.optimizer, mode='max', factor=args['lr_decay'], \
            patience=args['patience'], verbose=True, min_lr=args['min_lr'])
    else:
        scheduler = None

    # start training
    train_loss = 0
    while True:
        should_stop = False
        for i, batch in enumerate(train_batch):
            start_time = time.time()
            global_step += 1
            loss = trainer.update(batch, eval=False) # update step
            train_loss += loss
            if global_step % args['log_step'] == 0:
                duration = time.time() - start_time
                logger.info(format_str.format(datetime.now().strftime("%Y-%m-%d %H:%M:%S"), global_step,\
                        max_steps, loss, duration, current_lr))

            if global_step % args['eval_interval'] == 0:
                # eval on dev
                logger.info("Evaluating on dev set...")
                dev_preds = []
                for batch in dev_batch:
                    preds = trainer.predict(batch)
                    dev_preds += preds
                _, _, dev_score = scorer.score_by_entity(dev_preds, dev_gold_tags)

                train_loss = train_loss / args['eval_interval'] # avg loss per batch
                logger.info("step {}: train_loss = {:.6f}, dev_score = {:.4f}".format(global_step, train_loss, dev_score))
                train_loss = 0

                # save best model
                if len(dev_score_history) == 0 or dev_score > max(dev_score_history):
                    trainer.save(model_file)
                    logger.info("New best model saved.")
                    best_dev_preds = dev_preds

                dev_score_history += [dev_score]
                logger.info("")

                # lr schedule
                if scheduler is not None:
                    scheduler.step(dev_score)
            
            # check stopping
            current_lr = trainer.optimizer.param_groups[0]['lr']
            if global_step >= args['max_steps'] or current_lr <= args['min_lr']:
                should_stop = True
                break

        if should_stop:
            break

        train_batch.reshuffle()

    logger.info("Training ended with {} steps.".format(global_step))

    if len(dev_score_history) > 0:
        best_f, best_eval = max(dev_score_history)*100, np.argmax(dev_score_history)+1
        logger.info("Best dev F1 = {:.2f}, at iteration = {}".format(best_f, best_eval * args['eval_interval']))
Example #16
    def _set_up_model(self, config, use_gpu):
        self._pretagged = config.get('pretagged')
        self._pretrain = Pretrain(config['pretrain_path'])
        self._trainer = Trainer(pretrain=self.pretrain,
                                model_file=config['model_path'],
                                use_cuda=use_gpu)
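
One detail that recurs in the _set_up_model examples: the vectors are stored in self._pretrain but passed on as self.pretrain. That only works because the processor base class exposes the private field through a read-only property. A minimal sketch of that pattern, using a simplified stand-in class rather than the real base class:

from stanza.models.common.pretrain import Pretrain  # import path assumed (stanza layout)

class ProcessorSketch:
    """Simplified stand-in illustrating the _pretrain / pretrain property pattern."""

    def __init__(self, config, use_gpu=False):
        self._pretrain = None
        self._set_up_model(config, use_gpu)

    @property
    def pretrain(self):
        # read-only view of the private field, so self.pretrain inside
        # _set_up_model returns whatever was stored in self._pretrain
        return self._pretrain

    def _set_up_model(self, config, use_gpu):
        self._pretrain = Pretrain(config['pretrain_path']) if 'pretrain_path' in config else None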