Example No. 1
def create_model(embeddings, d_model, d_ff, num_heads, num_layers, rpr_k, d_k,
                 activation, checkpoint_name, device):
    if len(rpr_k) == 0 or rpr_k[0] < 1:
        rpr_k = [None]
    else:
        rpr_k = listify(rpr_k)
    logger.info("Creating tied encoder decoder model")
    hps = {
        "dsz": d_model,
        "hsz": d_model,
        "d_ff": d_ff,
        "dropout": 0.0,
        "num_heads": num_heads,
        "layers": num_layers,
        "encoder_type": "transformer",
        "decoder_type": "transformer",
        "src_lengths_key": "x_lengths",
        "d_k": d_k,
        "activation": activation,
        "rpr_k": rpr_k
    }
    model = TiedEmbeddingsSeq2SeqModel({'x': embeddings}, None, **hps)
    if checkpoint_name.endswith('npz'):
        load_transformer_seq2seq_npz(model, checkpoint_name)
    else:
        model.load_state_dict(
            torch.load(checkpoint_name, map_location=torch.device(device)))
    print(model)
    return model
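All of these examples lean on `listify`, which normalizes a scalar argument into a list and passes sequences through unchanged. A minimal sketch of that behavior (a hypothetical stand-in, not the library's exact implementation):

def listify(x):
    """Wrap a scalar in a list; pass lists, tuples and sets through as lists."""
    if x is None:
        return []
    if isinstance(x, (list, tuple, set)):
        return list(x)
    return [x]

# listify(3) -> [3], listify([3, 5]) -> [3, 5], so rpr_k above can be supplied
# either as a single relative-position window or as one window per layer.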
Example No. 2
def register_model(cls, task, name=None):
    """Register a function as a plug-in"""
    if name is None:
        name = cls.__name__

    names = listify(name)

    if task not in BASELINE_MODELS:
        BASELINE_MODELS[task] = {}

    if task not in BASELINE_LOADERS:
        BASELINE_LOADERS[task] = {}

    if hasattr(cls, 'create'):

        def create(*args, **kwargs):
            return cls.create(*args, **kwargs)
    else:

        def create(*args, **kwargs):
            return cls(*args, **kwargs)

    for alias in names:
        if alias in BASELINE_MODELS[task]:
            raise Exception(
                'Error: attempt to re-define previously registered handler {} (old: {}, new: {}) for task {} in registry'
                .format(alias, BASELINE_MODELS[task], cls, task))

        BASELINE_MODELS[task][alias] = create

        if hasattr(cls, 'load'):
            BASELINE_LOADERS[task][alias] = cls.load
    return cls
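A hedged usage sketch for the registry above; the task name, aliases, and ConvClassifier class are hypothetical, the point being that `name` may be a single alias or a list of aliases, which `listify` normalizes before registration:

class ConvClassifier:
    @classmethod
    def create(cls, *args, **kwargs):
        return cls(*args, **kwargs)

    @classmethod
    def load(cls, filename, **kwargs):
        raise NotImplementedError

# Register the same factory under two aliases for the 'classify' task
register_model(ConvClassifier, task='classify', name=['conv', 'default'])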
Example No. 3
def fit(model_params, ts, vs, es=None, **kwargs):

    do_early_stopping = bool(kwargs.get('do_early_stopping', True))
    epochs = int(kwargs.get('epochs', 20))
    model_file = get_model_file('seq2seq', 'pytorch', kwargs.get('basedir'))


    num_loader_workers = int(kwargs.get('num_loader_workers', 0))
    pin_memory = bool(kwargs.get('pin_memory', True))
    ts = DataLoader(ts, num_workers=num_loader_workers, batch_size=None, pin_memory=pin_memory)
    vs = DataLoader(vs, batch_size=None, pin_memory=pin_memory)
    es = DataLoader(es, batch_size=None, pin_memory=pin_memory) if es is not None else None

    best_metric = 0
    if do_early_stopping:
        early_stopping_metric = kwargs.get('early_stopping_metric', 'perplexity')
        early_stopping_cmp, best_metric = get_metric_cmp(early_stopping_metric, kwargs.get('early_stopping_cmp'))
        patience = kwargs.get('patience', epochs)
        logger.info('Doing early stopping on [%s] with patience [%d]', early_stopping_metric, patience)

    reporting_fns = listify(kwargs.get('reporting', []))
    logger.info('reporting %s', reporting_fns)

    after_train_fn = kwargs.get('after_train_fn', None)
    trainer = create_trainer(model_params, **kwargs)

    last_improved = 0
    for epoch in range(epochs):
        trainer.train(ts, reporting_fns)

        if after_train_fn is not None:
            after_train_fn(trainer.model)

        test_metrics = trainer.test(vs, reporting_fns, phase='Valid')

        if do_early_stopping is False:
            trainer.save(model_file)

        elif early_stopping_cmp(test_metrics[early_stopping_metric], best_metric):
            last_improved = epoch
            best_metric = test_metrics[early_stopping_metric]
            logger.info('New best %.3f', best_metric)
            trainer.save(model_file)

        elif (epoch - last_improved) > patience:
            logger.info('Stopping due to persistent failures to improve')
            break

    if do_early_stopping is True:
        logger.info('Best performance on %s: %.3f at epoch %d', early_stopping_metric, best_metric, last_improved)

    if es is not None:
        model = torch.load(model_file)
        trainer = Seq2SeqTrainerPyTorch(model, **kwargs)
        test_metrics = trainer.test(es, reporting_fns, phase='Test')
    return test_metrics
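A sketch of how this seq2seq fit function might be driven; the reporting callback signature and hyper-parameter values below are illustrative assumptions rather than library defaults:

def console_reporting(metrics, tick, phase, tick_type=None, **kwargs):
    print(phase, tick, metrics)

test_metrics = fit(model_params, ts, vs, es,
                   epochs=32,
                   patience=5,
                   early_stopping_metric='perplexity',
                   reporting=[console_reporting])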
Example No. 4
def register_remote(cls, name=None):
    """Register a class as a plug-in"""
    if name is None:
        name = cls.__name__
    names = listify(name)
    for alias in names:
        if alias in BASELINE_REMOTES:
            raise Exception(
                'Error: attempt to re-define previously registered handler {} (old: {}, new: {}) in registry'
                .format(alias, BASELINE_REMOTES, cls))
        BASELINE_REMOTES[alias] = cls
    return cls
Example No. 5
    def init_stacked(self, input_dim: int, **kwargs) -> BaseLayer:
        """Produce a stacking operation that will be used in the model

        :param input_dim: The input dimension size
        :param kwargs: See below

        :Keyword Arguments:

        * *hsz* (``list``) -- Hidden sizes for the stacked layers; defaults to nothing, in which case a pass-through layer is returned
        * *stacked_name* (``str``) -- Optional override for the stacking layer's name

        :return: A stacking operation (a ``PassThru`` over the input dimension when no ``hsz`` is given)
        """
        hszs = listify(kwargs.get('hsz', []))
        if not hszs:
            return PassThru(input_dim)
        name = kwargs.get('stacked_name')
        return DenseStack(input_dim,
                          hszs,
                          pdrop_value=self.pdrop_value,
                          name=name)
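Because `hsz` goes through `listify`, a single integer and a list behave the same way. A hypothetical call, where `model` is an instance of the class defining `init_stacked`:

# One hidden layer of 512 units; hsz=[512, 256] would stack two layers,
# and omitting hsz returns a PassThru over the input dimension.
stacking = model.init_stacked(768, hsz=512, stacked_name='stacked')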
Example No. 6
def create_lr_scheduler(**kwargs):
    """Create a learning rate scheduler.

    :Keyword Arguments:
      * *lr_scheduler_type* (`str` or `list`) -- The name of the learning rate scheduler;
          if a list is given, the first entry must be a warmup scheduler.
    """
    sched_type = kwargs.get("lr_scheduler_type")
    if sched_type is None:
        return None
    sched_type = listify(sched_type)
    if len(sched_type) == 2:
        warm = MEAD_LAYERS_LR_SCHEDULERS.get(sched_type[0])(**kwargs)
        assert isinstance(warm, WarmupLearningRateScheduler
                          ), "First LR Scheduler must be a warmup scheduler."
        rest = MEAD_LAYERS_LR_SCHEDULERS.get(sched_type[1])(**kwargs)
        return MEAD_LAYERS_LR_SCHEDULERS.get("composite")(warm=warm,
                                                          rest=rest,
                                                          **kwargs)
    Constructor = MEAD_LAYERS_LR_SCHEDULERS.get(sched_type[0])
    return Constructor(**kwargs)
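A hedged example of both forms; the scheduler names and extra keyword arguments are assumptions about what MEAD_LAYERS_LR_SCHEDULERS contains, so substitute whatever your registry actually registers:

# Two entries: a warmup scheduler followed by a decay scheduler, wrapped in a composite
sched = create_lr_scheduler(lr_scheduler_type=['warmup_linear', 'cosine'],
                            lr=4.0e-4, warmup_steps=1000, decay_steps=100000)

# A single entry instantiates that scheduler directly
sched = create_lr_scheduler(lr_scheduler_type='cosine', lr=4.0e-4, decay_steps=100000)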
Example No. 7
def fit(model_params, ts, vs, es, **kwargs):
    """
    Train a classifier using PyTorch
    :param model_params: The model to train
    :param ts: A training data set
    :param vs: A validation data set
    :param es: A test data set, can be None
    :param kwargs: See below

    :Keyword Arguments:
        * *do_early_stopping* (``bool``) -- Stop when the evaluation data is no longer improving. Defaults to True
        * *epochs* (``int``) -- How many epochs to train. Defaults to 20
        * *outfile* -- Model output file, defaults to classifier-model.pyth
        * *patience* --
           How many epochs where evaluation is no longer improving before we give up
        * *reporting* --
           Callbacks which may be used on reporting updates
        * *optim* --
           Optimizer to use, defaults to `sgd`
        * *eta, lr* (``float``) --
           Learning rate, defaults to 0.01
        * *mom* (``float``) --
           Momentum (SGD only), defaults to 0.9 if optim is `sgd`
    :return: The test metrics when `es` is provided, otherwise the last validation metrics
    """
    do_early_stopping = bool(kwargs.get('do_early_stopping', True))
    verbose = kwargs.get(
        'verbose', {
            'console': kwargs.get('verbose_console', False),
            'file': kwargs.get('verbose_file', None)
        })
    epochs = int(kwargs.get('epochs', 20))
    model_file = get_model_file('classify', 'pytorch', kwargs.get('basedir'))
    output = kwargs.get('output')
    txts = kwargs.get('txts')

    num_loader_workers = int(kwargs.get('num_loader_workers', 0))
    pin_memory = bool(kwargs.get('pin_memory', True))

    if not isinstance(ts, DataLoader):
        ts = DataLoader(ts,
                        num_workers=num_loader_workers,
                        batch_size=None,
                        pin_memory=pin_memory)
    if not isinstance(vs, DataLoader):
        vs = DataLoader(vs, batch_size=None, pin_memory=pin_memory)
    if es and not isinstance(es, DataLoader):
        es = DataLoader(es, batch_size=None, pin_memory=pin_memory)

    best_metric = 0
    if do_early_stopping:
        early_stopping_metric = kwargs.get('early_stopping_metric', 'acc')
        early_stopping_cmp, best_metric = get_metric_cmp(
            early_stopping_metric, kwargs.get('early_stopping_cmp'))
        patience = kwargs.get('patience', epochs)
        logger.info('Doing early stopping on [%s] with patience [%d]',
                    early_stopping_metric, patience)

    reporting_fns = listify(kwargs.get('reporting', []))
    logger.info('reporting %s', reporting_fns)
    trainer = create_trainer(model_params, **kwargs)

    last_improved = 0

    for epoch in range(epochs):
        trainer.train(ts, reporting_fns)
        test_metrics = trainer.test(vs, reporting_fns)

        if do_early_stopping is False:
            trainer.save(model_file)

        elif early_stopping_cmp(test_metrics[early_stopping_metric],
                                best_metric):
            last_improved = epoch
            best_metric = test_metrics[early_stopping_metric]
            logger.info('New best %.3f', best_metric)
            trainer.save(model_file)

        elif (epoch - last_improved) > patience:
            logger.info('Stopping due to persistent failures to improve')
            break

    if do_early_stopping is True:
        logger.info('Best performance on %s: %.3f at epoch %d',
                    early_stopping_metric, best_metric, last_improved)

    if es is not None:
        logger.info('Reloading best checkpoint')
        model = torch.load(model_file)
        trainer = create_trainer(model, **kwargs)
        test_metrics = trainer.test(es,
                                    reporting_fns,
                                    phase='Test',
                                    verbose=verbose,
                                    output=output,
                                    txts=txts)
    return test_metrics
Example No. 8
def fit_eager_distributed(model_params, ts, vs, es=None, **kwargs):
    """
    Train a language model using TensorFlow with `tf.dataset`.  This
    is the default behavior for training.

    :param model_params: The model (or parameters to create the model) to train
    :param ts: A training data set
    :param vs: A validation data set
    :param es: A test data set, can be None
    :param kwargs:
        See below

    :Keyword Arguments:
        * *do_early_stopping* (``bool``) --
          Stop after evaluation data is no longer improving.  Defaults to True
        * *verbose* (`dict`) -- A dictionary containing a `console` boolean and a `file` name, if enabled
        * *epochs* (``int``) -- How many epochs to train. Defaults to 5
        * *outfile* -- Model output file, defaults to classifier-model.pyth
        * *patience* --
           How many epochs where evaluation is no longer improving before we give up
        * *reporting* --
           Callbacks which may be used on reporting updates
        * *nsteps* (`int`) -- If we should report every n-steps, this should be passed
        * *ema_decay* (`float`) -- If we are doing an exponential moving average, what decay to use
        * *clip* (`int`) -- If we are doing gradient clipping, what value to use
        * *optim* (`str`) -- The name of the optimizer we are using
        * *lr* (`float`) -- The learning rate we are using
        * *mom* (`float`) -- If we are using SGD, what value to use for momentum
        * *beta1* (`float`) -- Adam-specific hyper-param, defaults to `0.9`
        * *beta2* (`float`) -- Adam-specific hyper-param, defaults to `0.999`
        * *epsilon* (`float`) -- Adam-specific hyper-param, defaults to `1e-8`

    :return: None
    """

    epochs = int(kwargs.get('epochs', 5))
    patience = int(kwargs.get('patience', epochs))

    model_file = get_model_file('lm', 'tf', kwargs.get('basedir'))

    do_early_stopping = bool(kwargs.get('do_early_stopping', True))

    best_metric = 0
    if do_early_stopping:
        early_stopping_metric = kwargs.get('early_stopping_metric', 'avg_loss')
        early_stopping_cmp, best_metric = get_metric_cmp(
            early_stopping_metric, kwargs.get('early_stopping_cmp'))
        patience = kwargs.get('patience', epochs)
        print('Doing early stopping on [%s] with patience [%d]' %
              (early_stopping_metric, patience))

    reporting_fns = listify(kwargs.get('reporting', []))
    print('reporting', reporting_fns)

    batchsz = kwargs['batchsz']
    test_batchsz = kwargs.get('test_batchsz', batchsz)
    tgt_key = model_params.get('tgt_key')

    train_dataset = tf.data.Dataset.from_tensor_slices(to_tensors(ts))
    train_dataset = train_dataset.shuffle(buffer_size=SHUF_BUF_SZ)
    train_dataset = train_dataset.batch(batchsz, drop_remainder=True)
    train_dataset = train_dataset.prefetch(NUM_PREFETCH)

    valid_dataset = tf.data.Dataset.from_tensor_slices(to_tensors(vs))
    valid_dataset = valid_dataset.batch(batchsz, drop_remainder=True)
    valid_dataset = valid_dataset.prefetch(NUM_PREFETCH)

    trainer = LanguageModelTrainerDistributedTf(model_params, **kwargs)
    train_dataset = trainer.distribute(train_dataset)
    valid_dataset = trainer.distribute(valid_dataset)

    last_improved = 0
    SET_TRAIN_FLAG(True)

    for epoch in range(epochs):

        trainer.train(train_dataset, reporting_fns, steps=len(ts))
        test_metrics = trainer.test(valid_dataset,
                                    reporting_fns,
                                    phase='Valid',
                                    steps=len(vs))

        if do_early_stopping is False:
            trainer.checkpoint()
            trainer.model.save(model_file)

        elif early_stopping_cmp(test_metrics[early_stopping_metric],
                                best_metric):
            last_improved = epoch
            best_metric = test_metrics[early_stopping_metric]
            print('New best %.3f' % best_metric)
            trainer.checkpoint()
            trainer.model.save(model_file)

        elif (epoch - last_improved) > patience:
            print('Stopping due to persistent failures to improve')
            break

    if do_early_stopping is True:
        print('Best performance on %s: %.3f at epoch %d' %
              (early_stopping_metric, best_metric, last_improved))

    if es is not None:
        print('Reloading best checkpoint')
        trainer.recover_last_checkpoint()
        trainer.strategy = tf.distribute.OneDeviceStrategy('/device:GPU:0')
        test_dataset = tf.data.Dataset.from_tensor_slices(to_tensors(es))
        test_dataset = test_dataset.batch(test_batchsz, drop_remainder=False)
        test_dataset = test_dataset.prefetch(NUM_PREFETCH)
        test_dataset = trainer.distribute(test_dataset)
        trainer.test(test_dataset, reporting_fns, phase='Test', steps=len(es))
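Note that this trainer reads `kwargs['batchsz']` directly, so a batch size must be supplied. A hypothetical invocation with placeholder values:

fit_eager_distributed(model_params, ts, vs, es,
                      batchsz=32,
                      test_batchsz=16,
                      epochs=10,
                      early_stopping_metric='avg_loss',
                      reporting=[])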
Example No. 9
def fit(model_params, ts, vs, es, **kwargs):
    """
    Train a tagger using TensorFlow with a `feed_dict`.  This
    is the previous default behavior for training.  To use this, you need to pass
    `fit_func: feed_dict` in your MEAD config

    :param model_params: The model to train
    :param ts: A training data set
    :param vs: A validation data set
    :param es: A test data set, can be None
    :param kwargs:
        See below

    :Keyword Arguments:
        * *do_early_stopping* (``bool``) --
          Stop after evaluation data is no longer improving.  Defaults to True
        * *verbose* (`dict`) -- A dictionary containing a `console` boolean and a `file` name, if enabled
        * *epochs* (``int``) -- How many epochs to train. Defaults to 5
        * *outfile* -- Model output file, defaults to classifier-model.pyth
        * *patience* --
           How many epochs where evaluation is no longer improving before we give up
        * *reporting* --
           Callbacks which may be used on reporting updates
        * *nsteps* (`int`) -- If we should report every n-steps, this should be passed
        * *ema_decay* (`float`) -- If we are doing an exponential moving average, what decay to use
        * *clip* (`int`) -- If we are doing gradient clipping, what value to use
        * *optim* (`str`) -- The name of the optimizer we are using
        * *lr* (`float`) -- The learning rate we are using
        * *mom* (`float`) -- If we are using SGD, what value to use for momentum
        * *beta1* (`float`) -- Adam-specific hyper-param, defaults to `0.9`
        * *beta2* (`float`) -- Adam-specific hyper-param, defaults to `0.999`
        * *epsilon* (`float`) -- Adam-specific hyper-param, defaults to `1e-8`

    :return: None
    """
    epochs = int(kwargs.get('epochs', 5))
    patience = int(kwargs.get('patience', epochs))
    conll_output = kwargs.get('conll_output', None)
    span_type = kwargs.get('span_type', 'iob')
    txts = kwargs.get('txts', None)
    model_file = get_model_file('tagger', 'tf', kwargs.get('basedir'))
    TRAIN_FLAG()

    trainer = create_trainer(model_params, **kwargs)

    do_early_stopping = bool(kwargs.get('do_early_stopping', True))
    verbose = bool(kwargs.get('verbose', False))

    best_metric = 0
    if do_early_stopping:
        early_stopping_metric = kwargs.get('early_stopping_metric', 'acc')
        early_stopping_cmp, best_metric = get_metric_cmp(
            early_stopping_metric, kwargs.get('early_stopping_cmp'))
        patience = kwargs.get('patience', epochs)
        print('Doing early stopping on [%s] with patience [%d]' %
              (early_stopping_metric, patience))

    reporting_fns = listify(kwargs.get('reporting', []))
    print('reporting', reporting_fns)

    last_improved = 0
    for epoch in range(epochs):

        trainer.train(ts, reporting_fns)
        test_metrics = trainer.test(vs, reporting_fns, phase='Valid')

        if do_early_stopping is False:
            trainer.checkpoint()
            trainer.model.save(model_file)

        elif early_stopping_cmp(test_metrics[early_stopping_metric],
                                best_metric):
            last_improved = epoch
            best_metric = test_metrics[early_stopping_metric]
            print('New best %.3f' % best_metric)
            trainer.checkpoint()
            trainer.model.save(model_file)

        elif (epoch - last_improved) > patience:
            print('Stopping due to persistent failures to improve')
            break

    if do_early_stopping is True:
        print('Best performance on %s: %.3f at epoch %d' %
              (early_stopping_metric, best_metric, last_improved))
    if es is not None:

        trainer.recover_last_checkpoint()
        # What to do about overloading this??
        evaluator = TaggerEvaluatorTf(trainer.model, span_type, verbose)
        timer = Timer()
        test_metrics = evaluator.test(es, conll_output=conll_output, txts=txts)
        duration = timer.elapsed()
        for reporting in reporting_fns:
            reporting(test_metrics, 0, 'Test')
        trainer.log.debug({'phase': 'Test', 'time': duration})
Example No. 10
def fit(model_params, ts, vs, es=None, **kwargs):
    """
    Train a language model using TensorFlow with a `feed_dict`.

    :param model_params: The model (or parameters to create the model) to train
    :param ts: A training data set
    :param vs: A validation data set
    :param es: A test data set, can be None
    :param kwargs:
        See below

    :Keyword Arguments:
        * *do_early_stopping* (``bool``) --
          Stop after evaluation data is no longer improving.  Defaults to True
        * *verbose* (`dict`) -- A dictionary containing a `console` boolean and a `file` name, if enabled
        * *epochs* (``int``) -- How many epochs to train. Defaults to 5
        * *outfile* -- Model output file
        * *patience* --
           How many epochs where evaluation is no longer improving before we give up
        * *reporting* --
           Callbacks which may be used on reporting updates
        * *nsteps* (`int`) -- If we should report every n-steps, this should be passed
        * *ema_decay* (`float`) -- If we are doing an exponential moving average, what decay to use
        * *clip* (`int`) -- If we are doing gradient clipping, what value to use
        * *optim* (`str`) -- The name of the optimizer we are using
        * *lr* (`float`) -- The learning rate we are using
        * *mom* (`float`) -- If we are using SGD, what value to use for momentum
        * *beta1* (`float`) -- Adam-specific hyper-param, defaults to `0.9`
        * *beta2* (`float`) -- Adam-specific hyper-param, defaults to `0.999`
        * *epsilon* (`float`) -- Adam-specific hyper-param, defaults to `1e-8`
        * *after_train_fn* (`func`) -- A callback to fire after every epoch of training

    :return: None
    """
    epochs = int(kwargs['epochs']) if 'epochs' in kwargs else 5
    patience = int(kwargs['patience']) if 'patience' in kwargs else epochs

    model_file = get_model_file('lm', 'tf', kwargs.get('basedir'))
    after_train_fn = kwargs['after_train_fn'] if 'after_train_fn' in kwargs else None
    trainer = create_trainer(model_params, **kwargs)

    do_early_stopping = bool(kwargs.get('do_early_stopping', True))

    best_metric = 1000
    if do_early_stopping:
        early_stopping_metric = kwargs.get('early_stopping_metric', 'avg_loss')
        early_stopping_cmp, best_metric = get_metric_cmp(early_stopping_metric, kwargs.get('early_stopping_cmp'))
        patience = kwargs.get('patience', epochs)
        print('Doing early stopping on [%s] with patience [%d]' % (early_stopping_metric, patience))

    reporting_fns = listify(kwargs.get('reporting', []))
    print('reporting', reporting_fns)

    last_improved = 0

    for epoch in range(epochs):

        trainer.train(ts, reporting_fns)
        if after_train_fn is not None:
            after_train_fn(trainer.model)

        test_metrics = trainer.test(vs, reporting_fns, phase='Valid')

        if do_early_stopping is False:
            trainer.checkpoint()
            trainer.model.save(model_file)

        elif early_stopping_cmp(test_metrics[early_stopping_metric], best_metric):
            last_improved = epoch
            best_metric = test_metrics[early_stopping_metric]
            print('New best %.3f' % best_metric)
            trainer.checkpoint()
            trainer.model.save(model_file)

        elif (epoch - last_improved) > patience:
            print('Stopping due to persistent failures to improve')
            break

    if do_early_stopping is True:
        print('Best performance on %s: %.3f at epoch %d' % (early_stopping_metric, best_metric, last_improved))
    if es is not None:
        trainer.recover_last_checkpoint()
        trainer.test(es, reporting_fns, phase='Test')
Example No. 11
def fit_datasets(model_params, ts, vs, es=None, **kwargs):
    """
    Train a tagger using TensorFlow with `tf.dataset`.  This
    is the default behavior for training.

    :param model_params: The model (or parameters to create the model) to train
    :param ts: A training data set
    :param vs: A validation data set
    :param es: A test data set, can be None
    :param kwargs:
        See below

    :Keyword Arguments:
        * *do_early_stopping* (``bool``) --
          Stop after evaluation data is no longer improving.  Defaults to True
        * *verbose* (`dict`) -- A dictionary containing a `console` boolean and a `file` name, if enabled
        * *epochs* (``int``) -- How many epochs to train. Defaults to 20
        * *outfile* -- Model output file, defaults to classifier-model.pyth
        * *patience* --
           How many epochs where evaluation is no longer improving before we give up
        * *reporting* --
           Callbacks which may be used on reporting updates
        * *nsteps* (`int`) -- If we should report every n-steps, this should be passed
        * *ema_decay* (`float`) -- If we are doing an exponential moving average, what decay to use
        * *clip* (`int`) -- If we are doing gradient clipping, what value to use
        * *optim* (`str`) -- The name of the optimizer we are using
        * *lr* (`float`) -- The learning rate we are using
        * *mom* (`float`) -- If we are using SGD, what value to use for momentum
        * *beta1* (`float`) -- Adam-specific hyper-param, defaults to `0.9`
        * *beta2* (`float`) -- Adam-specific hyper-param, defaults to `0.999`
        * *epsilon* (`float`) -- Adam-specific hyper-param, defaults to `1e-8`

    :return: None
    """
    conll_output = kwargs.get('conll_output', None)
    span_type = kwargs.get('span_type', 'iob')
    txts = kwargs.get('txts', None)
    model_file = get_model_file('tagger', 'tf', kwargs.get('basedir'))

    do_early_stopping = bool(kwargs.get('do_early_stopping', True))
    verbose = kwargs.get('verbose', {'console': kwargs.get('verbose_console', False), 'file': kwargs.get('verbose_file', None)})
    epochs = int(kwargs.get('epochs', 20))

    batchsz = kwargs['batchsz']
    ## First, make tf.datasets for ts, vs and es
    # https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/distribute/README.md
    # effective_batch_sz = args.batchsz*args.gpus
    test_batchsz = kwargs.get('test_batchsz', batchsz)
    # This is a little awkward:
    lengths_key = model_params.get('lengths_key')

    train_dataset = tf.data.Dataset.from_tensor_slices(to_tensors(ts, lengths_key))
    train_dataset = train_dataset.shuffle(buffer_size=SHUF_BUF_SZ)
    train_dataset = train_dataset.batch(batchsz, drop_remainder=False)
    train_dataset = train_dataset.repeat(epochs + 1)
    train_dataset = train_dataset.prefetch(NUM_PREFETCH)

    valid_dataset = tf.data.Dataset.from_tensor_slices(to_tensors(vs, lengths_key))
    valid_dataset = valid_dataset.batch(batchsz, drop_remainder=False)
    valid_dataset = valid_dataset.repeat(epochs + 1)
    valid_dataset = valid_dataset.prefetch(NUM_PREFETCH)

    iter = tf.compat.v1.data.Iterator.from_structure(tf.compat.v1.data.get_output_types(train_dataset),
                                                     tf.compat.v1.data.get_output_shapes(train_dataset))

    features, y = iter.get_next()
    # Add features to the model params
    model_params.update(features)
    model_params['y'] = y
    # create the initialisation operations
    train_init_op = iter.make_initializer(train_dataset)
    valid_init_op = iter.make_initializer(valid_dataset)

    best_metric = 0
    if do_early_stopping:
        early_stopping_metric = kwargs.get('early_stopping_metric', 'acc')
        early_stopping_cmp, best_metric = get_metric_cmp(early_stopping_metric, kwargs.get('early_stopping_cmp'))
        patience = kwargs.get('patience', epochs)
        print('Doing early stopping on [%s] with patience [%d]' % (early_stopping_metric, patience))

    reporting_fns = listify(kwargs.get('reporting', []))
    print('reporting', reporting_fns)

    TRAIN_FLAG()
    trainer = create_trainer(model_params, **kwargs)

    last_improved = 0

    for epoch in range(epochs):
        trainer.sess.run(train_init_op)
        trainer.train(ts, reporting_fns)
        trainer.sess.run(valid_init_op)
        test_metrics = trainer.test(vs, reporting_fns, phase='Valid')

        if do_early_stopping is False:
            trainer.checkpoint()
            trainer.model.save(model_file)

        elif early_stopping_cmp(test_metrics[early_stopping_metric], best_metric):
            last_improved = epoch
            best_metric = test_metrics[early_stopping_metric]
            print('New best %.3f' % best_metric)
            trainer.checkpoint()
            trainer.model.save(model_file)

        elif (epoch - last_improved) > patience:
            print('Stopping due to persistent failures to improve')
            break

    if do_early_stopping is True:
        print('Best performance on %s: %.3f at epoch %d' % (early_stopping_metric, best_metric, last_improved))

    if es is not None:
        print('Reloading best checkpoint')
        trainer.recover_last_checkpoint()

        test_dataset = tf.data.Dataset.from_tensor_slices(to_tensors(es, lengths_key))
        test_dataset = test_dataset.batch(test_batchsz, drop_remainder=False)
        test_dataset = test_dataset.repeat(epochs + 1)
        test_dataset = test_dataset.prefetch(NUM_PREFETCH)
        test_init_op = iter.make_initializer(test_dataset)
        trainer.sess.run(test_init_op)
        # What to do about overloading this??
        evaluator = TaggerEvaluatorTf(trainer.model, span_type, verbose)
        start = time.time()
        test_metrics = evaluator.test(es, conll_output=conll_output, txts=txts)
        duration = time.time() - start
        for reporting in reporting_fns:
            reporting(test_metrics, 0, 'Test')
        trainer.log.debug({'phase': 'Test', 'time': duration})
Example No. 12
def load_embeddings_overlay(global_embeddings_settings,
                            embeddings_section,
                            vocab,
                            data_download_cache=DEFAULT_DATA_CACHE,
                            name=None):
    """Creates a set of arbitrary sub-graph, DL-framework-specific embeddings by delegating to wired sub-module.

    As part of this process, we take in an index of embeddings by name, a ``dict`` of ``Counter`` objects (keyed by
    feature name), containing the number of times each token has been seen, and a `features` list which is a
    sub-section of the mead config containing the `embeddings` section for each feature.
    This method's job is either to create a sub-graph from a pretrained model, or to create a new randomly
    initialized sub-graph, taking into account the input vocabulary counters.  The embeddings model has control
    over the actual word indices and the sub-graph for the embeddings, both of which are returned from this
    method.  Any feature selection, such as low-count removal, must be performed via the delegated methods.

    :param global_embeddings_settings: The embeddings index passed to the mead driver
    :param embeddings_section: The `embeddings` sub-section for this feature from the mead config
    :param vocab: A known ``Counter`` of tokens and counts for this feature's vocabulary
    :param data_download_cache: Where downloaded embeddings are cached
    :param name: An optional name for the embeddings; defaults to the embedding label
    :return: A ``dict`` containing the created `Embedding` sub-graph and its vocab
    """

    # Get the label out of the embeddings section in the features block of mead config
    embed_label = embeddings_section.get('label',
                                         embeddings_section.get('labels'))
    if name is None:
        name = embed_label
    # Get the type of embedding out of the embeddings section in the features block of mead config
    embed_type = embeddings_section.get('type', 'default')
    is_stacked = is_sequence(embed_label)
    if is_stacked:
        if embed_type != 'default':
            logger.warning(
                "You have requested a stack of pretrained embeddings but didnt request 'default' or representation"
            )
    # Backwards compat, copy from main block if not present locally
    embeddings_section['unif'] = embeddings_section.get('unif', 0.1)

    # Backwards compat, copy from main block if not present locally
    embeddings_section['keep_unused'] = embeddings_section.get(
        'keep_unused', False)

    # Overlay any backend parameters

    # Also, if we are in eager mode, we might have to place the embeddings explicitly on the CPU
    embeddings_section['cpu_placement'] = bool(
        embeddings_section.get('cpu_placement', False))
    if embed_label is not None:
        # Allow local overrides to uniform initializer

        embed_labels = listify(embed_label)

        embed_files = []
        for embed_label in embed_labels:

            embeddings_global_config_i = global_embeddings_settings[
                embed_label]
            if 'type' in embeddings_global_config_i:
                embed_type_i = embeddings_global_config_i['type']
                embed_type = embed_type_i
                if embed_type_i != 'default' and is_stacked:
                    raise Exception(
                        "Stacking embeddings only works for 'default' pretrained word embeddings"
                    )

            embed_file = embeddings_global_config_i.get('file')
            unzip_file = embeddings_global_config_i.get('unzip', True)
            embed_dsz = embeddings_global_config_i['dsz']
            embed_sha1 = embeddings_global_config_i.get('sha1')
            # Should we grab vocab here too?

            embed_model = embeddings_global_config_i.get('model', {})
            if 'dsz' not in embed_model and not is_stacked:
                embed_model['dsz'] = embed_dsz

            embeddings_section = {**embed_model, **embeddings_section}
            try:
                # We aren't necessarily going to get an `embed_file`. For instance, using the HuggingFace
                # models in the Hub addon, the `embed_file` should be downloaded using HuggingFace's library,
                # not by us.  In this case we want it to be None and we don't want to download it
                if embed_file:
                    embed_file = EmbeddingDownloader(
                        embed_file,
                        embed_dsz,
                        embed_sha1,
                        data_download_cache,
                        unzip_file=unzip_file).download()
                    embed_files.append(embed_file)
                else:
                    embed_files.append(None)
            except Exception as e:
                if is_stacked:
                    raise e
                logger.warning(
                    f"We were not able to download {embed_file}, passing to the addon"
                )
                embed_files.append(embed_file)
        # If we have stacked embeddings (which only works with the `default` model), we need to pass the list.
        # If not, grab the first item
        embed_file = embed_files if is_stacked else embed_files[0]
        embedding_bundle = load_embeddings(
            name,
            embed_file=embed_file,
            known_vocab=vocab,
            embed_type=embed_type,
            data_download_cache=data_download_cache,
            **embeddings_section)

    else:  # if there is no label given, assume we need random initialization vectors
        dsz = embeddings_section.pop('dsz')
        embedding_bundle = load_embeddings(
            name,
            dsz=dsz,
            known_vocab=vocab,
            embed_type=embed_type,
            data_download_cache=data_download_cache,
            **embeddings_section)

    return embedding_bundle
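A hedged illustration of the two shapes `label` can take in a feature's `embeddings` section; the labels are hypothetical keys in the global embeddings index:

# Single pretrained embedding for this feature
word_section = {'label': 'glove-840B', 'type': 'default'}

# Stacked pretrained embeddings (only valid for the 'default' type)
stacked_section = {'label': ['glove-840B', 'senna'], 'type': 'default'}

bundle = load_embeddings_overlay(global_embeddings_settings, stacked_section, vocab)
embedding, vocab = bundle['embeddings'], bundle['vocab']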
Example No. 13
def load_embeddings(name, **kwargs):
    """This method negotiates loading an embeddings sub-graph AND a corresponding vocabulary (lookup from word to int)

    Embeddings and their addons may be downloaded from an http `GET` either via raw URL or using hub notation
    (hub:v1:embeddings/hub:v1:addons)

    This function behaves differently depending on its keyword arguments and the `embed_type`.
    If the registered embeddings class contains a load method on it and we are given an `embed_file`,
    we will assume that we need to load that file, and that the embeddings object wants its own load function used
    for that.  This would be typical, e.g., for a user-defined sub-graph LM.

    For cases where no `embed_file` is provided and there is a `create` method on this class, we  assume that the user
    wants us to build a VSM (`baseline.embeddings.PretrainedEmbeddingsModel`) ourselves, and call
    their create function, which will take in this VSM.

    The VSM is then used to provide the vocabulary back, and the `create` function invokes the class constructor
    with the sub-parts of VSM required to build the graph.

    If there is no create method provided, and there is no load function provided, we simply invoke the
    registered embeddings' constructor with the args, and assume there is a `get_vocab()` method on the
    provided implementation

    :param name: A unique string name for these embeddings
    :param kwargs:

    :Keyword Arguments:
    * *embed_type*  The key identifying the embedding type in the registry
    :return: A ``dict`` with an ``embeddings`` sub-graph and its ``vocab``
    """
    embed_type = kwargs.pop("embed_type", "default")
    # Dynamically load a module if it's needed
    for module in listify(kwargs.get('module', kwargs.get('modules', []))):
        import_user_module(module, kwargs.get('data_download_cache'))
    embeddings_cls = MEAD_LAYERS_EMBEDDINGS[embed_type]

    filename = kwargs.get("embed_file")

    # If the embedding model has a load function, defer all the work to that.  Basically just pass the kwargs in
    # and let it do its magic
    if hasattr(embeddings_cls, "load") and filename is not None:
        model = embeddings_cls.load(filename, **kwargs)
        return {"embeddings": model, "vocab": model.get_vocab()}
    # If there isn't a load function, there must be a create() function where the first arg is a type of
    # EmbeddingsModel
    elif hasattr(embeddings_cls, "create"):
        unif = kwargs.pop("unif", 0.1)
        known_vocab = kwargs.pop("known_vocab", None)
        keep_unused = kwargs.pop("keep_unused", False)
        normalize = kwargs.pop("normalized", False)
        preserve_vocab_indices = bool(
            kwargs.get('preserve_vocab_indices', False))

        # if there is no filename, use random-init model
        if filename is None:
            dsz = kwargs.pop("dsz")
            model = RandomInitVecModel(dsz,
                                       known_vocab=known_vocab,
                                       unif_weight=unif,
                                       counts=not preserve_vocab_indices)
        # If there is, use the PretrainedEmbeddingsModel loader
        else:
            if is_sequence(filename) or preserve_vocab_indices:
                model = PretrainedEmbeddingsStack(
                    listify(filename),
                    known_vocab=known_vocab,
                    normalize=normalize,
                    counts=not preserve_vocab_indices,
                    **kwargs)
            else:
                model = PretrainedEmbeddingsModel(
                    filename,
                    known_vocab=known_vocab,
                    unif_weight=unif,
                    keep_unused=keep_unused,
                    normalize=normalize,
                    **kwargs,
                )

        # Then call create(model, name, **kwargs)
        return {
            "embeddings": embeddings_cls.create(model, name, **kwargs),
            "vocab": model.get_vocab()
        }
    # If we don't have a load function and `filename` is None, we should just instantiate the class
    model = embeddings_cls(name, **kwargs)
    return {"embeddings": model, "vocab": model.get_vocab()}
Example No. 14
def fit(model, ts, vs, es, **kwargs):
    epochs = int(kwargs['epochs']) if 'epochs' in kwargs else 5
    patience = int(kwargs['patience']) if 'patience' in kwargs else epochs
    do_early_stopping = bool(kwargs.get('do_early_stopping', True))
    model_file = get_model_file('lm', 'pytorch', kwargs.get('basedir'))

    num_loader_workers = int(kwargs.get('num_loader_workers', 0))
    pin_memory = bool(kwargs.get('pin_memory', True))
    if not isinstance(ts, DataLoader):
        ts = DataLoader(ts,
                        num_workers=num_loader_workers,
                        batch_size=None,
                        pin_memory=pin_memory)
    if not isinstance(vs, DataLoader):
        vs = DataLoader(vs, batch_size=None, pin_memory=pin_memory)
    if es and not isinstance(es, DataLoader):
        es = DataLoader(es, batch_size=None, pin_memory=pin_memory)
    best_metric = 10000
    if do_early_stopping:
        early_stopping_metric = kwargs.get('early_stopping_metric', 'avg_loss')
        early_stopping_cmp, best_metric = get_metric_cmp(
            early_stopping_metric, kwargs.get('early_stopping_cmp'))
        patience = kwargs.get('patience', epochs)
        logger.info('Doing early stopping on [%s] with patience [%d]',
                    early_stopping_metric, patience)

    reporting_fns = listify(kwargs.get('reporting', []))
    logger.info('reporting %s', reporting_fns)

    after_train_fn = kwargs.get('after_train_fn', None)
    trainer = create_trainer(model, **kwargs)
    last_improved = 0
    for epoch in range(epochs):

        trainer.train(ts, reporting_fns)
        if after_train_fn is not None:
            after_train_fn(model)
        test_metrics = trainer.test(vs, reporting_fns, phase='Valid')

        if do_early_stopping is False:
            trainer.save(model_file)

        elif early_stopping_cmp(test_metrics[early_stopping_metric],
                                best_metric):
            last_improved = epoch
            best_metric = test_metrics[early_stopping_metric]
            logger.info('New best %.3f', best_metric)
            trainer.save(model_file)

        elif (epoch - last_improved) > patience:
            logger.info('Stopping due to persistent failures to improve')
            break

    if do_early_stopping is True:
        logger.info('Best performance on %s: %.3f at epoch %d',
                    early_stopping_metric, best_metric, last_improved)

    if es is not None:
        logger.info('Reloading best checkpoint')
        model = torch.load(model_file)
        trainer = create_trainer(model, **kwargs)
        test_metrics = trainer.test(es, reporting_fns, phase='Test')
    return test_metrics