Example #1
def transformer_encoder(x, src_mask, scope, num_heads, pdrop, scale=True, activation_type='relu', d_ff=None):

    with tf.variable_scope(scope):
        d_model = get_shape_as_list(x)[-1]
        if d_ff is None:
            d_ff = 4 * d_model
        x = layer_norm(x, 'ln_1')
        q, k, v = self_attention_qkv(x, d_model)
        a = multi_headed_attention(q, k, v, 'attn', d_model, num_heads, pdrop, scale=scale, mask=src_mask)
        x = x + tf.layers.dropout(a, pdrop, training=TRAIN_FLAG())
        x = layer_norm(x, 'ln_2')
        m = ffn(x, 'ffn', pdrop, d_ff=d_ff, activation_type=activation_type)
        h = x + tf.layers.dropout(m, pdrop, training=TRAIN_FLAG())
        return h
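The block above is a single pre-norm encoder layer; the helpers it calls (`layer_norm`, `self_attention_qkv`, `multi_headed_attention`, `ffn`, `get_shape_as_list`, `TRAIN_FLAG`) come from the same library. A minimal sketch, assuming those helpers are in scope, of how such layers might be stacked into a multi-layer encoder; the layer count, scope names, and closing layer norm are illustrative assumptions, not part of the original code:

```
def transformer_encoder_stack(x, src_mask, num_heads, pdrop, num_layers=6):
    # Hypothetical usage: chain N encoder blocks, each under its own variable scope
    for i in range(num_layers):
        x = transformer_encoder(x, src_mask, 'encoder-{}'.format(i), num_heads, pdrop)
    # A closing layer norm is common with pre-norm blocks; treating it as optional here
    return layer_norm(x, 'ln_out')
```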
Example #2
    def pool(self, word_embeddings, dsz, init, **kwargs):
        """Do parallel convolutional filtering with varied receptive field widths, followed by max-over-time pooling

        :param word_embeddings: The word embeddings, which are inputs here
        :param dsz: The depth of the word embeddings
        :param init: The tensorflow initializer
        :param kwargs: See below

        :Keyword Arguments:
        * *cmotsz* -- (``int``) The number of convolutional feature maps for each filter
            These are MOT-filtered, leaving this # of units per parallel filter
        * *filtsz* -- (``list``) This is a list of filter widths to use

        :return:
        """
        cmotsz = kwargs['cmotsz']
        filtsz = kwargs['filtsz']

        combine, _ = parallel_conv(word_embeddings, filtsz, dsz, cmotsz)
        # Definitely drop out
        with tf.name_scope("dropout"):
            combine = tf.layers.dropout(combine,
                                        self.pdrop_value,
                                        training=TRAIN_FLAG())
        return combine
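`parallel_conv` itself is not shown on this page. Judging from the docstring and the call site (`combine, _ = parallel_conv(word_embeddings, filtsz, dsz, cmotsz)`), it runs one convolution per filter width, max-pools each over time, and concatenates the results. A hedged TF 1.x sketch of that idea; the signature and the second return value are assumptions:

```
import tensorflow as tf

def parallel_conv_sketch(word_embeddings, filtsz, dsz, cmotsz):
    # dsz (the input depth) is implicit in word_embeddings' last dimension
    mots = []
    for fsz in filtsz:
        with tf.variable_scope('cmot-{}'.format(fsz)):
            conv = tf.layers.conv1d(word_embeddings, cmotsz, fsz,
                                    padding='same', activation=tf.nn.relu)
            mots.append(tf.reduce_max(conv, axis=1))  # max-over-time
    combine = tf.concat(mots, axis=-1)                # [B, len(filtsz) * cmotsz]
    return combine, len(filtsz) * cmotsz
```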
Example #3
    def encode(self, x=None):
        """Build a simple Lookup Table and set as input `x` if it exists, or `self.x` otherwise.

        :param x: An optional input sub-graph to bind to this operation or use `self.x` if `None`
        :return: The sub-graph output
        """
        if x is None:
            x = LookupTableEmbeddings.create_placeholder(self.name)
        self.x = x
        with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE):

            W = tf.get_variable("W",
                                initializer=tf.constant_initializer(
                                    self.weights,
                                    dtype=tf.float32,
                                    verify_shape=True),
                                shape=[self.vsz, self.dsz],
                                trainable=self.finetune)
            e0 = tf.scatter_update(W, tf.constant(0, dtype=tf.int32,
                                                  shape=[1]),
                                   tf.zeros(shape=[1, self.dsz]))

            with tf.control_dependencies([e0]):
                embedding_w_dropout = tf.layers.dropout(W,
                                                        self.dropin,
                                                        noise_shape=(self.vsz,
                                                                     1),
                                                        training=TRAIN_FLAG())
                word_embeddings = tf.nn.embedding_lookup(
                    embedding_w_dropout, self.x)

        return word_embeddings
Example #4
    def stacked(self, pooled, init, **kwargs):
        """Stack 1 or more hidden layers, optionally (forming an MLP)

        :param pooled: The fixed representation of the model
        :param init: The tensorflow initializer
        :param kwargs: See below

        :Keyword Arguments:
        * *hsz* -- (``int``) The number of hidden units (defaults to `100`)

        :return: The final layer
        """

        hszs = listify(kwargs.get('hsz', []))
        if len(hszs) == 0:
            return pooled

        in_layer = pooled
        for i, hsz in enumerate(hszs):
            fc = tf.layers.dense(in_layer,
                                 hsz,
                                 activation=tf.nn.relu,
                                 kernel_initializer=init,
                                 name='fc-{}'.format(i))
            in_layer = tf.layers.dropout(fc,
                                         self.pdrop_value,
                                         training=TRAIN_FLAG(),
                                         name='fc-dropout-{}'.format(i))
        return in_layer
Example #5
    def _test(self, loader, dataset=True, **kwargs):
        epoch_probs = []
        epoch_y = []
        epoch_loss = 0
        epoch_norm = 0
        pg = create_progress_bar(len(loader))
        for batch_dict in pg(loader):
            if dataset:
                probs, lossv, y = self.sess.run(
                    [self.probs, self.loss, self.y],
                    feed_dict={TRAIN_FLAG(): 0}
                )
                batchsz = len(y)
            else:
                feed_dict = self.model.make_input(batch_dict, False)
                probs, lossv, y = self.sess.run(
                    [self.probs, self.loss, self.y],
                    feed_dict=feed_dict
                )
                batchsz = self._get_batchsz(batch_dict)
            epoch_loss += lossv * batchsz
            epoch_norm += batchsz
            epoch_probs.append(probs)
            epoch_y.append(y)
        probs = np.concatenate(epoch_probs, axis=0)
        y = np.argmax(np.concatenate(epoch_y, axis=0), axis=1)
        bins = multiclass_calibration_bins(y, probs, bins=int(kwargs.get('bins', 10)))
        metrics = {
            "ECE": expected_calibration_error(bins.accs, bins.confs, bins.counts),
            "MCE": maximum_calibration_error(bins.accs, bins.confs, bins.counts),
            "avg_loss": epoch_loss / float(epoch_norm)
        }
        return metrics
Example #6
def ffn(x, scope, pdrop, d_ff, activation_type='relu'):
    with tf.variable_scope(scope):
        d_model = get_shape_as_list(x)[-1]
        act = tf_activation(activation_type)
        expansion = act(time_distributed_projection(x, name='ffn_ff', filters=d_ff))
        dropped = tf.layers.dropout(expansion, pdrop, training=TRAIN_FLAG())
        squeeze = time_distributed_projection(dropped, name='ffn_model', filters=d_model)
        return squeeze
Example #7
def dot_product_attention(query, key, value, pdrop=0.0, mask=None, scale=True):
    w = tf.matmul(query, key, transpose_b=True)

    if scale:
        w *= tf.rsqrt(tf.to_float(tf.shape(query)[2]))

    if mask is not None:
        w = w * mask + -1e9 * (1 - mask)

    weights = tf.nn.softmax(w, name="attention_weights")
    weights = tf.layers.dropout(weights, pdrop, training=TRAIN_FLAG())
    return tf.matmul(weights, value)
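The masking line `w = w * mask + -1e9 * (1 - mask)` drives masked positions to a very large negative score, so softmax gives them essentially zero weight. A small self-contained NumPy check of that effect (the numbers are arbitrary):

```
import numpy as np

scores = np.array([2.0, 1.0, 0.5])
mask = np.array([1.0, 1.0, 0.0])                  # last position is padding
masked = scores * mask + -1e9 * (1 - mask)
weights = np.exp(masked) / np.exp(masked).sum()
print(weights)                                    # ~[0.73, 0.27, 0.00]
```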
Example #8
def transformer_decoder(tgt, src, src_mask, tgt_mask, scope, num_heads, pdrop, scale=True, activation_type='relu', d_ff=None):
    with tf.variable_scope(scope):
        d_model = get_shape_as_list(tgt)[-1]
        if d_ff is None:
            d_ff = 4 * d_model

        tgt = layer_norm(tgt, 'ln_1')

        q, k, v = self_attention_qkv(tgt, d_model)
        self_attn = multi_headed_attention(q, k, v, 'self_attn', d_model, num_heads, pdrop, scale=scale, mask=tgt_mask)
        tgt = tgt + tf.layers.dropout(self_attn, pdrop, training=TRAIN_FLAG())
        tgt = layer_norm(tgt, 'ln_2')

        q, k, v = low_order_projection_qkv(tgt, src, src, d_model)
        # Mask at zeros???
        src_attn = multi_headed_attention(q, k, v, "dual_attn", d_model, num_heads, pdrop, scale=scale, mask=src_mask)
        tgt = tgt + tf.layers.dropout(src_attn, pdrop, training=TRAIN_FLAG())

        tgt = layer_norm(tgt, 'ln_3')
        m = ffn(tgt, 'ffn', pdrop, d_ff=d_ff, activation_type=activation_type)
        h = tgt + tf.layers.dropout(m, pdrop, training=TRAIN_FLAG())
        return h
Example #9
    def _train(self, loader, dataset=True, **kwargs):
        """Train an epoch of data using either the input loader or using `tf.dataset`

        In non-`tf.dataset` mode, we cycle the loader data feed, pulling a batch and feeding it in via a `feed_dict`.
        When we use `tf.dataset`s under the hood, this function simply uses the loader to know how many steps
        to train.  We still use a `feed_dict` to pass the `TRAIN_FLAG` in either case.

        :param loader: A data feed
        :param kwargs: See below

        :Keyword Arguments:
         * *dataset* (`bool`) Set to `True` if using `tf.dataset`s, defaults to `True`
         * *reporting_fns* (`list`) A list of reporting hooks to use

        :return: Metrics
        """
        if self.ema:
            self.sess.run(self.ema_restore)

        reporting_fns = kwargs.get('reporting_fns', [])
        epoch_loss = 0
        epoch_div = 0
        steps = len(loader)
        pg = create_progress_bar(steps)
        for batch_dict in pg(loader):
            if dataset:
                _, step, lossv = self.sess.run(
                    [self.train_op, self.global_step, self.loss],
                    feed_dict={TRAIN_FLAG(): 1})
            else:
                feed_dict = self.model.make_input(batch_dict, True)
                _, step, lossv = self.sess.run(
                    [self.train_op, self.global_step, self.loss],
                    feed_dict=feed_dict)

            batchsz = self._get_batchsz(batch_dict)
            report_lossv = lossv * batchsz
            epoch_loss += report_lossv
            epoch_div += batchsz
            self.nstep_agg += report_lossv
            self.nstep_div += batchsz

            if (step + 1) % self.nsteps == 0:
                metrics = self.calc_metrics(self.nstep_agg, self.nstep_div)
                self.report(step + 1, metrics, self.nstep_start, 'Train',
                            'STEP', reporting_fns, self.nsteps)
                self.reset_nstep()

        metrics = self.calc_metrics(epoch_loss, epoch_div)
        return metrics
Example #10
def fit(model_params, _, ts, vs=None, **kwargs):

    model_file = get_model_file('classify', 'tf', kwargs.get('basedir'))
    reporting_fns = listify(kwargs.get('reporting', []))
    print('reporting', reporting_fns)

    TRAIN_FLAG()
    trainer = create_trainer(model_params, **kwargs)

    test_metrics = trainer.test(vs, reporting_fns, phase="Test-Before", dataset=False)
    trainer.train(ts, reporting_fns, dataset=False)
    test_metrics = trainer.test(vs, reporting_fns, phase="Test", dataset=False)

    trainer.checkpoint()
    trainer.model.save(model_file)
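`TRAIN_FLAG` is never defined on this page, only called. From the usage (`training=TRAIN_FLAG()` on dropout layers, `feed_dict={TRAIN_FLAG(): 1}` while training, `{TRAIN_FLAG(): 0}` at eval time, plus the bare `TRAIN_FLAG()` above that appears to create it up front), it behaves like a lazily created, process-wide boolean tensor. A hedged sketch of one way such a flag can be built in TF 1.x; this is an assumption about the real implementation, not a copy of it:

```
import tensorflow as tf

_TRAIN_FLAG = None

def TRAIN_FLAG_sketch():
    """Assumed contract: create the flag once, then always return the same tensor."""
    global _TRAIN_FLAG
    if _TRAIN_FLAG is None:
        # placeholder_with_default lets graphs run without feeding the flag,
        # while trainers can still feed 1/0 explicitly as in the examples above
        _TRAIN_FLAG = tf.placeholder_with_default(False, shape=(), name='TRAIN_FLAG')
    return _TRAIN_FLAG
```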
Example #11
    def train(self, ts, reporting_fns, dataset=True):
        """Train by looping over the steps

        For a `tf.dataset`-backed `fit_func`, we are using the previously wired `dataset`s
        in the model (and `dataset` is `True`).  For `feed_dict`, we convert the ts samples
        to `feed_dict`s and hand them in one-by-one

        :param ts: The training set
        :param reporting_fns: A list of reporting hooks
        :param dataset: (`bool`) Are we using `tf.dataset`s
        :return: Metrics
        """
        epoch_loss = 0
        epoch_toks = 0

        start = time.time()
        self.nstep_start = start
        for batch_dict in ts:
            if dataset:
                _, global_step, lossv = self.sess.run(
                    [self.train_op, self.global_step, self.loss],
                    feed_dict={TRAIN_FLAG(): 1})
            else:
                feed_dict = self.model.make_input(batch_dict, True)
                _, global_step, lossv = self.sess.run(
                    [self.train_op, self.global_step, self.loss],
                    feed_dict=feed_dict)

            # ?? How to get this cleaner?
            toks = self._num_toks(batch_dict['tgt_lengths'])
            report_loss = lossv * toks

            epoch_loss += report_loss
            epoch_toks += toks
            self.nstep_agg += report_loss
            self.nstep_div += toks

            if (global_step + 1) % self.nsteps == 0:
                metrics = self.calc_metrics(self.nstep_agg, self.nstep_div)
                self.report(global_step + 1, metrics, self.nstep_start,
                            'Train', 'STEP', reporting_fns, self.nsteps)
                self.reset_nstep()

        metrics = self.calc_metrics(epoch_loss, epoch_toks)
        self.train_epochs += 1
        self.report(self.train_epochs, metrics, start, 'Train', 'EPOCH',
                    reporting_fns)
        return metrics
Example #12
    def encode(self, x=None):
        """Create a really large embedding matrix in tensorflow.

        Tensorflow has a limit on the size that an op can be (2GB). When we have very
        large embedding lookup tables (for example when we don't prune the vocab) we
        hit this limit and can't keep the embeddings in the graph. This is due to a
        limit on the size of a message in a protocol buffer (how tensorflow
        serializes the graph).

        Here we get around it with a placeholder. The placeholder will be in the
        graph and it knows it needs to have a shape of [vsz, dsz], but it doesn't
        hold the actual values, so it stays small enough to be serialized into a
        protocol buffer.

        We then have a variable that is initialized with the value of the
        placeholder. This is filled in with the values during the `sess.run` of
        `tf.global_variables_initializer` with a feed_dict. Values are then saved
        into the checkpoint and can be reloaded from there.

        ```
        sess.run(tf.global_variables_initializer(), {e.W_place: e.weights})
        ```
        """
        if x is None:
            x = LookupTableEmbeddings.create_placeholder(self.name)
        self.x = x

        with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE):

            self.W_place = tf.placeholder(tf.float32,
                                          shape=(self.vsz, self.dsz))
            W = tf.get_variable("W", initializer=self.W_place)

            e0 = tf.scatter_update(W, tf.constant(0, dtype=tf.int32,
                                                  shape=[1]),
                                   tf.zeros(shape=[1, self.dsz]))

            with tf.control_dependencies([e0]):
                embedding_w_dropout = tf.layers.dropout(W,
                                                        self.dropin,
                                                        noise_shape=(self.vsz,
                                                                     1),
                                                        training=TRAIN_FLAG())
                word_embeddings = tf.nn.embedding_lookup(
                    embedding_w_dropout, self.x)

        return word_embeddings
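The trick the docstring describes is that only a small placeholder is baked into the graph, while the real weights arrive through the `feed_dict` when variables are initialized. A self-contained TF 1.x sketch of just that mechanism, with toy sizes (in practice it is the large table that can exceed the 2GB protocol-buffer limit):

```
import numpy as np
import tensorflow as tf

vsz, dsz = 1000, 50                                  # toy sizes
weights = np.random.rand(vsz, dsz).astype(np.float32)

W_place = tf.placeholder(tf.float32, shape=(vsz, dsz))
W = tf.get_variable("W_demo", initializer=W_place)   # variable initialized from the placeholder

with tf.Session() as sess:
    # The values arrive via the feed_dict at init time, so they are never
    # serialized into the GraphDef; they are still written to checkpoints
    sess.run(tf.global_variables_initializer(), {W_place: weights})
```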
Example #13
def fit(model_params, _, ts, vs=None, **kwargs):
    """Calibrate a model with temperature scaling"""

    model_file = get_model_file('classify', 'tf', kwargs.get('basedir'))

    batchsz = kwargs['batchsz']
    lengths_key = model_params.get('lengths_key')

    ## First, make tf.datasets for ts, vs and es
    # https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/distribute/README.md
    # effective_batch_sz = args.batchsz*args.gpus
    train_dataset = tf.data.Dataset.from_tensor_slices(to_tensors(ts, lengths_key))
    train_dataset = train_dataset.batch(batchsz, drop_remainder=False)
    train_dataset = train_dataset.repeat(2)
    train_dataset = train_dataset.prefetch(NUM_PREFETCH)

    valid_dataset = tf.data.Dataset.from_tensor_slices(to_tensors(vs, lengths_key))
    valid_dataset = valid_dataset.batch(batchsz, drop_remainder=False)
    valid_dataset = valid_dataset.repeat(2)
    valid_dataset = valid_dataset.prefetch(NUM_PREFETCH)

    iter = tf.compat.v1.data.Iterator.from_structure(tf.compat.v1.data.get_output_types(train_dataset),
                                                     tf.compat.v1.data.get_output_shapes(train_dataset))

    features, y = iter.get_next()
    # Add features to the model params
    model_params.update(features)
    model_params['y'] = tf.one_hot(tf.reshape(y, [-1]), len(model_params['labels']))
    # create the initialisation operations
    train_init_op = iter.make_initializer(train_dataset)
    valid_init_op = iter.make_initializer(valid_dataset)

    reporting_fns = listify(kwargs.get('reporting', []))
    print('reporting', reporting_fns)

    TRAIN_FLAG()
    trainer = create_trainer(model_params, **kwargs)
    last_improved = 0

    trainer.sess.run(train_init_op)
    trainer.train(ts, reporting_fns)
    trainer.sess.run(valid_init_op)
    test_metrics = trainer.test(vs, reporting_fns, phase='Test')

    trainer.checkpoint()
    trainer.model.save(model_file)
Example #14
    def encode(self, x=None):
        if x is None:
            x = CharConvEmbeddings.create_placeholder(self.name)
        self.x = x

        ech0 = tf.scatter_update(self.Wch,
                                 tf.constant(0, dtype=tf.int32, shape=[1]),
                                 tf.zeros(shape=[1, self.dsz]))

        mxlen = tf.shape(self.x)[1]

        gating_fn = highway_conns if self.gating.startswith(
            'highway') else skip_conns

        with tf.variable_scope("Chars2Word"):
            with tf.control_dependencies([ech0]):
                mxwlen = tf.shape(self.x)[-1]
                char_bt_x_w = tf.reshape(self.x, [-1, mxwlen])
                # The ablation table (4) in https://arxiv.org/pdf/1708.02182.pdf shows this has a massive impact
                embedding_w_dropout = tf.layers.dropout(self.Wch,
                                                        self.dropin,
                                                        noise_shape=(self.vsz,
                                                                     1),
                                                        training=TRAIN_FLAG())
                cembed = tf.nn.embedding_lookup(embedding_w_dropout,
                                                char_bt_x_w,
                                                name="embeddings")
                cmot, num_filts = char_word_conv_embeddings(
                    cembed,
                    self.filtsz,
                    self.dsz,
                    self.nfeats,
                    activation_fn=tf_activation(self.activation),
                    gating=gating_fn,
                    num_gates=self.num_gates)

        if self.projsz:
            cmot = tf.matmul(cmot, self.Wp) + self.bp
        word_char = tf.reshape(cmot, [-1, mxlen, self.outsz])
        return word_char
Example #15
    def _train(self, loader, dataset=True, **kwargs):
        reporting_fns = kwargs.get('reporting_fns', [])
        pg = create_progress_bar(len(loader))
        epoch_loss = 0
        epoch_norm = 0
        epoch_logits = []
        epoch_probs = []
        epoch_y = []
        start = time.time()
        for batch_dict in pg(loader):
            if dataset:
                logits, probs, lossv, y = self.sess.run(
                    [self.logits, self.probs, self.loss, self.y],
                    feed_dict={TRAIN_FLAG(): 0}
                )
                batchsz = len(y)
            else:
                feed_dict = self.model.make_input(batch_dict, False)
                logits, probs, lossv, y = self.sess.run(
                    [self.logits, self.probs, self.loss, self.y],
                    feed_dict=feed_dict
                )
                batchsz = self._get_batchsz(batch_dict)
            epoch_loss += lossv * batchsz
            epoch_norm += batchsz
            epoch_logits.append(logits)
            epoch_probs.append(probs)
            epoch_y.append(y)
        logits = np.concatenate(epoch_logits, axis=0)
        probs = np.concatenate(epoch_probs, axis=0)
        y = np.argmax(np.concatenate(epoch_y, axis=0), axis=1)
        bins = multiclass_calibration_bins(y, probs, bins=int(kwargs.get('bins', 10)))
        metrics = {
            "ECE": expected_calibration_error(bins.accs, bins.confs, bins.counts),
            "MCE": maximum_calibration_error(bins.accs, bins.confs, bins.counts),
            "avg_loss": epoch_loss / float(epoch_norm)
        }

        self.report(
            0, metrics, start, "Train-Before", "EPOCH", reporting_fns, 1
        )

        # Fit
        import tensorflow_probability as tfp
        x = tf.constant(logits)
        y = tf.constant(y)
        def scale(p):
            return tfp.math.value_and_gradient(
                lambda v: tf.reduce_sum(tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=x / v, labels=y
                )),
                p
            )

        results = tfp.optimizer.lbfgs_minimize(
            value_and_gradients_function=scale,
            initial_position=self.model.trainable_variables[0],
            max_iterations=100
        )
        self.sess.run(self.model.trainable_variables[0].assign(results.position))

        metrics = self._test(loader, dataset, phase="Train-After", **kwargs)
        return metrics
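For context, the single trainable variable being fit above is a temperature applied to the logits; dividing logits by `T > 1` softens overconfident probabilities without changing the predicted class. A tiny NumPy illustration (values chosen arbitrarily):

```
import numpy as np

def softmax(z):
    z = z - z.max(-1, keepdims=True)
    e = np.exp(z)
    return e / e.sum(-1, keepdims=True)

logits = np.array([4.0, 1.0, 0.0])
for T in (1.0, 2.5):                    # T > 1 flattens the distribution
    print(T, softmax(logits / T))       # the argmax is unchanged for any T > 0
```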
Example #16
    def pool(self, word_embeddings, dsz, init, **kwargs):
        """LSTM with dropout yielding a final-state as output

        :param word_embeddings: The input word embeddings
        :param dsz: The input word embedding depth
        :param init: The tensorflow initializer to use (currently ignored)
        :param kwargs: See below

        :Keyword Arguments:
        * *rnnsz* -- (``int``) The number of hidden units (defaults to `hsz`)
        * *hsz* -- (``int``) backoff for `rnnsz`, typically a result of stacking params.  This keeps things simple so
          it's easy to do things like residual connections between LSTM and post-LSTM stacking layers

        :return:
        """
        hsz = kwargs.get('rnnsz', kwargs.get('hsz', 100))
        vdrop = bool(kwargs.get('variational_dropout', False))
        if type(hsz) is list:
            hsz = hsz[0]

        rnntype = kwargs.get('rnn_type', kwargs.get('rnntype', 'lstm'))
        nlayers = int(kwargs.get('layers', 1))

        if rnntype == 'blstm':
            rnnfwd = stacked_lstm(hsz // 2,
                                  self.pdrop_value,
                                  nlayers,
                                  variational=vdrop,
                                  training=TRAIN_FLAG())
            rnnbwd = stacked_lstm(hsz // 2,
                                  self.pdrop_value,
                                  nlayers,
                                  variational=vdrop,
                                  training=TRAIN_FLAG())
            ((_, _), (fw_final_state,
                      bw_final_state)) = tf.nn.bidirectional_dynamic_rnn(
                          rnnfwd,
                          rnnbwd,
                          word_embeddings,
                          sequence_length=self.lengths,
                          dtype=tf.float32)
            # The output of the BRNN function needs to be joined on the H axis
            output_state = tf.concat(
                [fw_final_state[-1].h, bw_final_state[-1].h], -1)
            out_hsz = hsz

        else:
            rnnfwd = stacked_lstm(hsz,
                                  self.pdrop_value,
                                  nlayers,
                                  variational=vdrop,
                                  training=TRAIN_FLAG())
            (_,
             (output_state)) = tf.nn.dynamic_rnn(rnnfwd,
                                                 word_embeddings,
                                                 sequence_length=self.lengths,
                                                 dtype=tf.float32)
            output_state = output_state[-1].h
            out_hsz = hsz

        combine = tf.reshape(output_state, [-1, out_hsz])
        return combine
Example #17
    def train(self, ts, reporting_fns, dataset=True):
        """Train by looping over the steps

        For a `tf.dataset`-backed `fit_func`, we are using the previously wired `dataset`s
        in the model (and `dataset` is `True`).  For `feed_dict`, we convert the ts samples
        to `feed_dict`s and hand them in one-by-one

        :param ts: The training set
        :param reporting_fns: A list of reporting hooks
        :param dataset: (`bool`) Are we using `tf.dataset`s
        :return: Metrics
        """
        epoch_loss = 0.0
        epoch_toks = 0

        if self.model.requires_state:
            state = self.model.sess.run(self.model.initial_state,
                                        self.model.make_input(ts[0], True))

        fetches = {
            "loss": self.loss,
            "train_op": self.train_op,
            "global_step": self.global_step
        }

        if self.model.requires_state:
            fetches["final_state"] = self.model.final_state

        start = time.time()
        self.nstep_start = start
        for batch_dict in ts:

            if dataset:
                feed_dict = {TRAIN_FLAG(): 1}
            else:
                feed_dict = self.model.make_input(batch_dict, True)

            # In a Keras LSTM the state order is h first, c second; it's the opposite in TF 1, but I don't think it
            # ends up mattering here
            if self.model.requires_state:
                for i, (s1, s2) in enumerate(self.model.initial_state):
                    feed_dict[s1] = state[i][0]  # .c  # 0
                    feed_dict[s2] = state[i][1]  # .h  # 1

            vals = self.model.sess.run(fetches, feed_dict)
            loss = vals["loss"]

            if self.model.requires_state:
                state = vals["final_state"]
            global_step = vals["global_step"]
            toks = self._num_toks(batch_dict)
            report_loss = loss * toks
            epoch_loss += report_loss
            epoch_toks += toks
            self.nstep_agg += report_loss
            self.nstep_div += toks

            if (global_step + 1) % self.nsteps == 0:
                metrics = self.calc_metrics(self.nstep_agg, self.nstep_div)
                self.report(global_step + 1, metrics, self.nstep_start,
                            'Train', 'STEP', reporting_fns, self.nsteps)
                self.reset_nstep()

        metrics = self.calc_metrics(epoch_loss, epoch_toks)
        self.train_epochs += 1
        self.report(self.train_epochs, metrics, start, 'Train', 'EPOCH',
                    reporting_fns)
        return metrics
Example #18
    def create(cls, embeddings, labels, **kwargs):
        """The main method for creating all :class:`WordBasedModel` types.

        This method instantiates a model with pooling and optional stacking layers.
        Many of the arguments provided are reused by each implementation, but some sub-classes need more
        information in order to properly initialize.  For this reason, the full list of keyword args are passed
        to the :method:`pool` and :method:`stacked` methods.

        :param embeddings: This is a dictionary of embeddings, mapped to their numerical indices in the lookup table
        :param labels: This is a list of the `str` labels
        :param kwargs: There are sub-graph specific Keyword Args allowed for e.g. embeddings. See below for known args:

        :Keyword Arguments:
        * *gpus* -- (``int``) How many GPUs to split training across.  If more than one, this function delegates to
            another class, `ClassifyParallelModel`, which creates a parent graph and splits its inputs across each
            sub-model by calling back into this exact method (without this argument), once per GPU
        * *model_type* -- The string name for the model (defaults to `default`)
        * *sess* -- An optional tensorflow session.  If not passed, a new session is
            created
        * *lengths_key* -- (``str``) Specifies which `batch_dict` property should be used to determine the temporal length;
            if this is not set, it defaults to `word`, or to `x` if `word` is also not a feature
        * *finetune* -- Are we doing fine-tuning of word embeddings (defaults to `True`)
        * *mxlen* -- The maximum signal (`x` tensor temporal) length (defaults to `100`)
        * *dropout* -- This indicates how much dropout should be applied to the model when training.
        * *filtsz* -- This is actually a top-level param due to an unfortunate coupling between the pooling layer
            and the input, which, for convolution, requires input padding.

        :return: A fully-initialized tensorflow classifier
        """
        TRAIN_FLAG()
        gpus = kwargs.get('gpus', 1)
        if gpus == -1:
            gpus = len(
                os.getenv('CUDA_VISIBLE_DEVICES', os.getenv('NV_GPU',
                                                            '0')).split(','))
            kwargs['gpus'] = gpus
        if gpus > 1:
            return ClassifyParallelModel(cls.create, embeddings, labels,
                                         **kwargs)
        sess = kwargs.get('sess', tf.Session())

        model = cls()
        model.embeddings = embeddings
        model._record_state(**kwargs)
        model.lengths_key = kwargs.get('lengths_key')
        if model.lengths_key is not None:
            model.lengths = kwargs.get(
                'lengths', tf.placeholder(tf.int32, [None], name="lengths"))
        else:
            model.lengths = None

        model.labels = labels
        nc = len(labels)
        model.y = kwargs.get('y', tf.placeholder(tf.int32, [None, nc],
                                                 name="y"))
        # This only exists to make exporting easier
        model.pdrop_value = kwargs.get('dropout', 0.5)

        with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):

            seed = np.random.randint(10e8)
            init = tf.random_uniform_initializer(-0.05,
                                                 0.05,
                                                 dtype=tf.float32,
                                                 seed=seed)
            word_embeddings = model.embed(**kwargs)
            input_sz = word_embeddings.shape[-1]
            pooled = model.pool(word_embeddings, input_sz, init, **kwargs)
            stacked = model.stacked(pooled, init, **kwargs)

            # For fully connected layers, use xavier (glorot) transform
            with tf.variable_scope("output"):

                model.logits = tf.identity(tf.layers.dense(
                    stacked,
                    nc,
                    activation=None,
                    kernel_initializer=tf.glorot_uniform_initializer(seed)),
                                           name="logits")
                model.best = tf.argmax(model.logits, 1, name="best")
                model.probs = tf.nn.softmax(model.logits, name="probs")
        model.sess = sess
        # writer = tf.summary.FileWriter('blah', sess.graph)

        return model
Example #19
def fit(model_params, ts, vs, es, **kwargs):
    """
    Train a tagger using TensorFlow with a `feed_dict`.  This
    is the previous default behavior for training.  To use this, you need to pass
    `fit_func: feed_dict` in your MEAD config

    :param model_params: The model to train
    :param ts: A training data set
    :param vs: A validation data set
    :param es: A test data set, can be None
    :param kwargs:
        See below

    :Keyword Arguments:
        * *do_early_stopping* (``bool``) --
          Stop after evaluation data is no longer improving.  Defaults to True
        * *verbose* (`dict`) -- A dictionary containing a `console` boolean and a `file` name, if enabled
        * *epochs* (``int``) -- How many epochs to train for.  Defaults to `5`
        * *outfile* -- Model output file
        * *patience* --
           How many epochs where evaluation is no longer improving before we give up
        * *reporting* --
           Callbacks which may be used on reporting updates
        * *nsteps* (`int`) -- If we should report every n-steps, this should be passed
        * *ema_decay* (`float`) -- If we are doing an exponential moving average, what decay to use
        * *clip* (`int`) -- If we are doing gradient clipping, what value to use
        * *optim* (`str`) -- The name of the optimizer we are using
        * *lr* (`float`) -- The learning rate we are using
        * *mom* (`float`) -- If we are using SGD, what value to use for momentum
        * *beta1* (`float`) -- Adam-specific hyper-param, defaults to `0.9`
        * *beta2* (`float`) -- Adam-specific hyper-param, defaults to `0.999`
        * *epsilon* (`float`) -- Adam-specific hyper-param, defaults to `1e-8`

    :return: None
    """
    epochs = int(kwargs.get('epochs', 5))
    patience = int(kwargs.get('patience', epochs))
    conll_output = kwargs.get('conll_output', None)
    span_type = kwargs.get('span_type', 'iob')
    txts = kwargs.get('txts', None)
    model_file = get_model_file('tagger', 'tf', kwargs.get('basedir'))
    TRAIN_FLAG()

    trainer = create_trainer(model_params, **kwargs)

    do_early_stopping = bool(kwargs.get('do_early_stopping', True))
    verbose = bool(kwargs.get('verbose', False))

    best_metric = 0
    if do_early_stopping:
        early_stopping_metric = kwargs.get('early_stopping_metric', 'acc')
        early_stopping_cmp, best_metric = get_metric_cmp(
            early_stopping_metric, kwargs.get('early_stopping_cmp'))
        patience = kwargs.get('patience', epochs)
        print('Doing early stopping on [%s] with patience [%d]' %
              (early_stopping_metric, patience))

    reporting_fns = listify(kwargs.get('reporting', []))
    print('reporting', reporting_fns)

    last_improved = 0
    for epoch in range(epochs):

        trainer.train(ts, reporting_fns)
        test_metrics = trainer.test(vs, reporting_fns, phase='Valid')

        if do_early_stopping is False:
            trainer.checkpoint()
            trainer.model.save(model_file)

        elif early_stopping_cmp(test_metrics[early_stopping_metric],
                                best_metric):
            last_improved = epoch
            best_metric = test_metrics[early_stopping_metric]
            print('New best %.3f' % best_metric)
            trainer.checkpoint()
            trainer.model.save(model_file)

        elif (epoch - last_improved) > patience:
            print('Stopping due to persistent failures to improve')
            break

    if do_early_stopping is True:
        print('Best performance on %s: %.3f at epoch %d' %
              (early_stopping_metric, best_metric, last_improved))
    if es is not None:

        trainer.recover_last_checkpoint()
        # What to do about overloading this??
        evaluator = TaggerEvaluatorTf(trainer.model, span_type, verbose)
        timer = Timer()
        test_metrics = evaluator.test(es, conll_output=conll_output, txts=txts)
        duration = timer.elapsed()
        for reporting in reporting_fns:
            reporting(test_metrics, 0, 'Test')
        trainer.log.debug({'phase': 'Test', 'time': duration})
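`get_metric_cmp` is not shown on this page. Based on how it is used (it returns a comparison function plus an initial best value, where `acc` should improve upward and `avg_loss` downward), its contract is presumably something like the sketch below; the helper name and metric lists are assumptions:

```
def get_metric_cmp_sketch(metric, user_cmp=None):
    # Assumed contract inferred from the call sites: (comparator, starting best value)
    if user_cmp == 'min' or (user_cmp is None and metric in ('avg_loss', 'loss', 'perplexity')):
        return (lambda new, best: new < best), float('inf')
    return (lambda new, best: new > best), 0.0
```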
Example #20
def fit_datasets(model_params, ts, vs, es=None, **kwargs):
    """
    Train a tagger using TensorFlow with `tf.dataset`.  This
    is the default behavior for training.

    :param model_params: The model (or parameters to create the model) to train
    :param ts: A training data set
    :param vs: A validation data set
    :param es: A test data set, can be None
    :param kwargs:
        See below

    :Keyword Arguments:
        * *do_early_stopping* (``bool``) --
          Stop after evaluation data is no longer improving.  Defaults to True
        * *verbose* (`dict`) -- A dictionary containing a `console` boolean and a `file` name, if enabled
        * *epochs* (``int``) -- How many epochs to train for.  Defaults to `20`
        * *outfile* -- Model output file
        * *patience* --
           How many epochs where evaluation is no longer improving before we give up
        * *reporting* --
           Callbacks which may be used on reporting updates
        * *nsteps* (`int`) -- If we should report every n-steps, this should be passed
        * *ema_decay* (`float`) -- If we are doing an exponential moving average, what decay to use
        * *clip* (`int`) -- If we are doing gradient clipping, what value to use
        * *optim* (`str`) -- The name of the optimizer we are using
        * *lr* (`float`) -- The learning rate we are using
        * *mom* (`float`) -- If we are using SGD, what value to use for momentum
        * *beta1* (`float`) -- Adam-specific hyper-param, defaults to `0.9`
        * *beta2* (`float`) -- Adam-specific hyper-param, defaults to `0.999`
        * *epsilon* (`float`) -- Adam-specific hyper-param, defaults to `1e-8`

    :return: None
    """
    conll_output = kwargs.get('conll_output', None)
    span_type = kwargs.get('span_type', 'iob')
    txts = kwargs.get('txts', None)
    model_file = get_model_file('tagger', 'tf', kwargs.get('basedir'))

    do_early_stopping = bool(kwargs.get('do_early_stopping', True))
    verbose = kwargs.get('verbose', {'console': kwargs.get('verbose_console', False), 'file': kwargs.get('verbose_file', None)})
    epochs = int(kwargs.get('epochs', 20))

    batchsz = kwargs['batchsz']
    ## First, make tf.datasets for ts, vs and es
    # https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/distribute/README.md
    # effective_batch_sz = args.batchsz*args.gpus
    test_batchsz = kwargs.get('test_batchsz', batchsz)
    # This is a little awkward:
    lengths_key = model_params.get('lengths_key')

    train_dataset = tf.data.Dataset.from_tensor_slices(to_tensors(ts, lengths_key))
    train_dataset = train_dataset.shuffle(buffer_size=SHUF_BUF_SZ)
    train_dataset = train_dataset.batch(batchsz, drop_remainder=False)
    train_dataset = train_dataset.repeat(epochs + 1)
    train_dataset = train_dataset.prefetch(NUM_PREFETCH)

    valid_dataset = tf.data.Dataset.from_tensor_slices(to_tensors(vs, lengths_key))
    valid_dataset = valid_dataset.batch(batchsz, drop_remainder=False)
    valid_dataset = valid_dataset.repeat(epochs + 1)
    valid_dataset = valid_dataset.prefetch(NUM_PREFETCH)

    iter = tf.compat.v1.data.Iterator.from_structure(tf.compat.v1.data.get_output_types(train_dataset),
                                                     tf.compat.v1.data.get_output_shapes(train_dataset))

    features, y = iter.get_next()
    # Add features to the model params
    model_params.update(features)
    model_params['y'] = y
    # create the initialisation operations
    train_init_op = iter.make_initializer(train_dataset)
    valid_init_op = iter.make_initializer(valid_dataset)

    best_metric = 0
    if do_early_stopping:
        early_stopping_metric = kwargs.get('early_stopping_metric', 'acc')
        early_stopping_cmp, best_metric = get_metric_cmp(early_stopping_metric, kwargs.get('early_stopping_cmp'))
        patience = kwargs.get('patience', epochs)
        print('Doing early stopping on [%s] with patience [%d]' % (early_stopping_metric, patience))

    reporting_fns = listify(kwargs.get('reporting', []))
    print('reporting', reporting_fns)

    TRAIN_FLAG()
    trainer = create_trainer(model_params, **kwargs)

    last_improved = 0

    for epoch in range(epochs):
        trainer.sess.run(train_init_op)
        trainer.train(ts, reporting_fns)
        trainer.sess.run(valid_init_op)
        test_metrics = trainer.test(vs, reporting_fns, phase='Valid')

        if do_early_stopping is False:
            trainer.checkpoint()
            trainer.model.save(model_file)

        elif early_stopping_cmp(test_metrics[early_stopping_metric], best_metric):
            last_improved = epoch
            best_metric = test_metrics[early_stopping_metric]
            print('New best %.3f' % best_metric)
            trainer.checkpoint()
            trainer.model.save(model_file)

        elif (epoch - last_improved) > patience:
            print('Stopping due to persistent failures to improve')
            break

    if do_early_stopping is True:
        print('Best performance on %s: %.3f at epoch %d' % (early_stopping_metric, best_metric, last_improved))

    if es is not None:
        print('Reloading best checkpoint')
        trainer.recover_last_checkpoint()

        test_dataset = tf.data.Dataset.from_tensor_slices(to_tensors(es, lengths_key))
        test_dataset = test_dataset.batch(test_batchsz, drop_remainder=False)
        test_dataset = test_dataset.repeat(epochs + 1)
        test_dataset = test_dataset.prefetch(NUM_PREFETCH)
        test_init_op = iter.make_initializer(test_dataset)
        trainer.sess.run(test_init_op)
        # What to do about overloading this??
        evaluator = TaggerEvaluatorTf(trainer.model, span_type, verbose)
        start = time.time()
        test_metrics = evaluator.test(es, conll_output=conll_output, txts=txts)
        duration = time.time() - start
        for reporting in reporting_fns:
            reporting(test_metrics, 0, 'Test')
        trainer.log.debug({'phase': 'Test', 'time': duration})
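The train/valid/test switching in the dataset-based `fit` functions hinges on a single reinitializable iterator: the graph always reads from `iter.get_next()`, and running a different `make_initializer` op swaps which dataset feeds it. A self-contained TF 1.x sketch of just that mechanism with toy data (no model involved):

```
import tensorflow as tf

train_ds = tf.data.Dataset.from_tensor_slices(tf.range(10)).batch(5)
valid_ds = tf.data.Dataset.from_tensor_slices(tf.range(100, 104)).batch(2)

it = tf.compat.v1.data.Iterator.from_structure(
    tf.compat.v1.data.get_output_types(train_ds),
    tf.compat.v1.data.get_output_shapes(train_ds))
next_batch = it.get_next()
train_init_op = it.make_initializer(train_ds)
valid_init_op = it.make_initializer(valid_ds)

with tf.Session() as sess:
    sess.run(train_init_op)
    print(sess.run(next_batch))   # a training batch
    sess.run(valid_init_op)
    print(sess.run(next_batch))   # the same tensor now yields validation data
```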
Example #21
def fit_datasets(model_params, ts, vs, es=None, **kwargs):
    """
    Train a language model using TensorFlow with `tf.dataset`.  This
    is the default behavior for training.

    :param model_params: The model (or parameters to create the model) to train
    :param ts: A training data set
    :param vs: A validation data set
    :param es: A test data set, can be None
    :param kwargs:
        See below

    :Keyword Arguments:
        * *do_early_stopping* (``bool``) --
          Stop after evaluation data is no longer improving.  Defaults to True
        * *verbose* (`dict`) -- A dictionary containing a `console` boolean and a `file` name, if enabled
        * *epochs* (``int``) -- How many epochs to train for.  Defaults to `5`
        * *outfile* -- Model output file
        * *patience* --
           How many epochs where evaluation is no longer improving before we give up
        * *reporting* --
           Callbacks which may be used on reporting updates
        * *nsteps* (`int`) -- If we should report every n-steps, this should be passed
        * *ema_decay* (`float`) -- If we are doing an exponential moving average, what decay to use
        * *clip* (`int`) -- If we are doing gradient clipping, what value to use
        * *optim* (`str`) -- The name of the optimizer we are using
        * *lr* (`float`) -- The learning rate we are using
        * *mom* (`float`) -- If we are using SGD, what value to use for momentum
        * *beta1* (`float`) -- Adam-specific hyper-param, defaults to `0.9`
        * *beta2* (`float`) -- Adam-specific hyper-param, defaults to `0.999`
        * *epsilon* (`float`) -- Adam-specific hyper-param, defaults to `1e-8`

    :return: None
    """

    epochs = int(kwargs.get('epochs', 5))
    patience = int(kwargs.get('patience', epochs))

    model_file = get_model_file('lm', 'tf', kwargs.get('basedir'))

    do_early_stopping = bool(kwargs.get('do_early_stopping', True))

    best_metric = 0
    if do_early_stopping:
        early_stopping_metric = kwargs.get('early_stopping_metric', 'avg_loss')
        early_stopping_cmp, best_metric = get_metric_cmp(early_stopping_metric, kwargs.get('early_stopping_cmp'))
        patience = kwargs.get('patience', epochs)
        print('Doing early stopping on [%s] with patience [%d]' % (early_stopping_metric, patience))

    reporting_fns = listify(kwargs.get('reporting', []))
    print('reporting', reporting_fns)

    batchsz = kwargs['batchsz']
    test_batchsz = kwargs.get('test_batchsz', batchsz)
    tgt_key = model_params.get('tgt_key')

    train_dataset = tf.data.Dataset.from_tensor_slices(to_tensors(ts))
    train_dataset = train_dataset.shuffle(buffer_size=SHUF_BUF_SZ)
    train_dataset = train_dataset.batch(batchsz, drop_remainder=False)
    train_dataset = train_dataset.repeat(epochs + 1)
    train_dataset = train_dataset.prefetch(NUM_PREFETCH)

    valid_dataset = tf.data.Dataset.from_tensor_slices(to_tensors(vs))
    valid_dataset = valid_dataset.batch(batchsz, drop_remainder=False)
    valid_dataset = valid_dataset.repeat(epochs + 1)
    valid_dataset = valid_dataset.prefetch(NUM_PREFETCH)

    iter = tf.compat.v1.data.Iterator.from_structure(tf.compat.v1.data.get_output_types(train_dataset),
                                                     tf.compat.v1.data.get_output_shapes(train_dataset))

    features, tgt = iter.get_next()
    # Add features to the model params
    model_params.update(features)
    model_params.update({'y': tgt})

    # create the initialization operations
    train_init_op = iter.make_initializer(train_dataset)
    valid_init_op = iter.make_initializer(valid_dataset)

    TRAIN_FLAG()
    trainer = create_trainer(model_params, **kwargs)

    last_improved = 0

    for epoch in range(epochs):
        trainer.sess.run(train_init_op)
        trainer.train(ts, reporting_fns)
        trainer.sess.run(valid_init_op)
        test_metrics = trainer.test(vs, reporting_fns, phase='Valid')

        if do_early_stopping is False:
            trainer.checkpoint()
            trainer.model.save(model_file)

        elif early_stopping_cmp(test_metrics[early_stopping_metric], best_metric):
            last_improved = epoch
            best_metric = test_metrics[early_stopping_metric]
            print('New best %.3f' % best_metric)
            trainer.checkpoint()
            trainer.model.save(model_file)

        elif (epoch - last_improved) > patience:
            print('Stopping due to persistent failures to improve')
            break

    if do_early_stopping is True:
        print('Best performance on %s: %.3f at epoch %d' % (early_stopping_metric, best_metric, last_improved))

    if es is not None:
        print('Reloading best checkpoint')
        trainer.recover_last_checkpoint()
        test_dataset = tf.data.Dataset.from_tensor_slices(to_tensors(es))
        test_dataset = test_dataset.batch(test_batchsz, drop_remainder=False)
        test_dataset = test_dataset.repeat(epochs + 1)
        test_dataset = test_dataset.prefetch(NUM_PREFETCH)
        test_init_op = iter.make_initializer(test_dataset)
        trainer.sess.run(test_init_op)
        trainer.test(es, reporting_fns, phase='Test')