Example No. 1
def test_windowed_ra():
    num_heads = 4
    d_model = 64
    rpr_k = 1
    batchsize = 2
    nctx = 256
    d_k = d_model // num_heads

    with tf.device("/cpu:0"):
        old = SeqScaledDotProductRelativeAttention(pdrop=0.)
        new = SeqScaledWindowedRelativeAttention(pdrop=0.)

        rpr_key_emb = tf.keras.layers.Embedding(2 * rpr_k + 1, d_k)
        rpr_value_emb = tf.keras.layers.Embedding(2 * rpr_k + 1, d_k)

        Q = tf.random.normal([batchsize, num_heads, nctx, d_k])
        K = tf.random.normal([batchsize, num_heads, nctx, d_k])
        V = tf.random.normal([batchsize, num_heads, nctx, d_k])
        lengths = tf.random.uniform([batchsize], 0, nctx, dtype=tf.int32)
        seq_mask = tf.sequence_mask(lengths, maxlen=nctx, dtype=tf.float32)
        in_mask = tf.expand_dims(tf.expand_dims(seq_mask, 1), 1)
        out_mask = tf.expand_dims(tf.expand_dims(seq_mask, 1), -1)

        # manually create a ra_mask to prevent attention beyond rpr_k
        ones = tf.ones([nctx, nctx])
        ra_mask = tf.linalg.band_part(ones, rpr_k, rpr_k)
        mask = in_mask * tf.expand_dims(tf.expand_dims(ra_mask, 0), 0)
        rpr_key_old, rpr_value_old = make_rpr(rpr_key_emb, rpr_value_emb,
                                              rpr_k, nctx)
        SET_TRAIN_FLAG(False)
        out_old = old((Q, K, V, rpr_key_old, rpr_value_old, mask))
        out_old = masked_fill(out_old, tf.equal(out_mask, 0), 1)
        print(out_old.shape)

        # using the windowed relative attention with the original sequence mask
        rpr_key_new, rpr_value_new = unfold_rpr(rpr_key_emb, rpr_value_emb,
                                                rpr_k)
        out_new = new((Q, K, V, rpr_key_new, rpr_value_new, in_mask))
        out_new = masked_fill(out_new, tf.equal(out_mask, 0), 1)
        print(out_new.shape)
        if get_version(tf) < 2:
            with tf.compat.v1.Session() as sess:
                out_old, out_new = sess.run([out_old, out_new])
        else:
            out_old, out_new = out_old.numpy(), out_new.numpy()

        assert np.allclose(out_old, out_new, atol=1e-6)
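The hand-built ra_mask above uses tf.linalg.band_part to zero out attention outside a ±rpr_k window, which is what makes the old relative-attention layer comparable to the windowed one. A minimal sketch of the band it produces for rpr_k = 1 (illustrative, not part of the original test):

import tensorflow as tf

# band_part(ones, 1, 1) keeps a tridiagonal band: each position may attend
# only to itself and one neighbor on either side.
ones = tf.ones([4, 4])
band = tf.linalg.band_part(ones, 1, 1)
# [[1. 1. 0. 0.]
#  [1. 1. 1. 0.]
#  [0. 1. 1. 1.]
#  [0. 0. 1. 1.]]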
Example No. 2
def test_scaled_attn_value(qkv):
    q, k, v = qkv
    with tf.device("/cpu:0"):
        q = tf.zeros_like(q)
        scaled_dot_product_attention = SeqScaledDotProductAttention(0.0)
        res = scaled_dot_product_attention((q, k, v, None))
        if get_version(tf) < 2:
            with tf.compat.v1.Session() as sess:
                res, gold = sess.run([res, v])
        else:
            res, gold = res.numpy(), v.numpy()
        B, H, T, _ = q.get_shape().as_list()
        for b in range(B):
            for h in range(H):
                for t in range(T):
                    np.testing.assert_allclose(res[b, h, t, :],
                                               np.mean(gold, axis=2)[b, h, :],
                                               atol=1e-5)
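The assertion above works because zeroing the queries makes every attention score identical: the softmax over an all-zero score row is uniform, so the attention output at every time step is just the mean of the values over the time axis. A small numpy sketch of that identity (illustrative, not part of the test):

import numpy as np

T, d = 5, 8
v = np.random.randn(T, d)
weights = np.full((T, T), 1.0 / T)   # softmax of an all-zero score matrix
out = weights @ v                    # every row equals the time-mean of v
np.testing.assert_allclose(out, np.broadcast_to(v.mean(axis=0), (T, d)), atol=1e-12)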
Example No. 3
import pytest
import numpy as np
from eight_mile.utils import get_version
from eight_mile.embeddings import RandomInitVecModel
from collections import namedtuple
import string
tf = pytest.importorskip('tensorflow')
pytestmark = pytest.mark.skipif(get_version(tf) < 2, reason='TF1.X')
from eight_mile.utils import Offsets


def test_rnn_decode_shapes():
    from baseline.tf.embeddings import LookupTableEmbeddingsModel
    from baseline.tf.seq2seq.decoders import RNNDecoder
    # Always pick the right path
    encoder = namedtuple("EncoderOutput", "output src_mask")
    batchsz = 2
    temporal = 7
    temporal_output = 4
    hsz = 20
    dsz = 10
    layers = 1
    # Always pick the right path
    wv = RandomInitVecModel(
        dsz, {k: 1 for k in list(string.ascii_letters)}
    )
    assert len(string.ascii_letters) + len(Offsets.VALUES) == wv.get_vsz()
    encoder.output = tf.cast(np.random.randn(batchsz, temporal, hsz), dtype=tf.float32)
    encoder.hidden = (tf.cast(np.random.randn(layers, batchsz, hsz), dtype=tf.float32),
                      tf.cast(np.random.randn(layers, batchsz, hsz), dtype=tf.float32))
    encoder.src_mask = np.zeros((batchsz, temporal), dtype=np.uint8)
Example No. 4
import os
import pytest
import numpy as np
tf = pytest.importorskip('tensorflow')
from eight_mile.utils import get_version
pytestmark = pytest.mark.skipif(get_version(tf) >= 2, reason='tf2.0')

from baseline.tf.tfy import tie_weight


@pytest.fixture(scope="module")
def set_cpu():
    os.environ['CUDA_VISIBLE_DEVICES'] = ''
    yield
    del os.environ['CUDA_VISIBLE_DEVICES']


def test_sharing():
    # For some reason I can't get this to stop trying to use the GPU, which causes it to fail
    with tf.device('/cpu:0'):
        input_ = tf.compat.v1.placeholder(tf.int32, shape=[None])
        weight = tf.get_variable("weight",
                                 shape=[100, 200],
                                 initializer=tf.random_normal_initializer())
        embed = tf.nn.embedding_lookup(weight, input_)

        tie_shape = [weight.get_shape()[-1], weight.get_shape()[0]]
        with tf.variable_scope("Share",
                               custom_getter=tie_weight(weight, tie_shape)):
            layer = tf.layers.Dense(
                100,
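tie_weight is installed here as a custom_getter so that the Dense layer inside the "Share" scope reuses the embedding matrix (transposed) instead of allocating a fresh kernel. The idea itself is framework-agnostic; a minimal numpy sketch (illustrative, not the baseline implementation):

import numpy as np

# Weight tying: the output projection is the transpose of the embedding table,
# so logits = hidden @ embedding.T shares its parameters with the input lookup.
vocab, dim = 100, 200
embedding = np.random.randn(vocab, dim)
hidden = np.random.randn(4, dim)
logits = hidden @ embedding.T        # shape (4, vocab), no separate output weight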
Example No. 5
import os
import json
import pytest
import numpy as np
from eight_mile.utils import get_version

tf = pytest.importorskip("tensorflow")
pytestmark = pytest.mark.skipif(get_version(tf) >= 2, reason="TF2.0")
from eight_mile.optz import (
    create_lr_scheduler,
    ConstantScheduler,
    WarmupLinearScheduler,
    CyclicLRScheduler,
    PiecewiseDecayScheduler,
    ZarembaDecayScheduler,
    CosineDecayScheduler,
    InverseTimeDecayScheduler,
    ExponentialDecayScheduler,
)


@pytest.fixture(scope="module")
def set_cpu():
    os.environ["CUDA_VISIBLE_DEVICES"] = ""
    yield
    del os.environ["CUDA_VISIBLE_DEVICES"]


INIT_LR = 1.2
NUM_STEPS = 1000
Example No. 6
    def train(self, ts, reporting_fns):
        """Train by looping over the steps

        For a `tf.dataset`-backed `fit_func`, we are using the previously wired `dataset`s
        in the model (and `dataset` is `True`).  For `feed_dict`, we convert the ts samples
        to `feed_dict`s and hand them in one-by-one

        :param ts: The training set
        :param reporting_fns: A list of reporting hooks
        :param dataset: (`bool`) Are we using `tf.dataset`s
        :return: Metrics
        """

        SET_TRAIN_FLAG(True)
        epoch_loss = tf.Variable(0.0)
        epoch_div = tf.Variable(0, dtype=tf.int32)
        nstep_loss = tf.Variable(0.0)
        nstep_div = tf.Variable(0, dtype=tf.int32)
        self.nstep_start = time.perf_counter()
        start = time.perf_counter()

        def _train_step_no_state(inputs):
            """Replicated training step."""

            features, y = inputs
            loss = self.optimizer.update(self.model, features, y)
            toks = self._num_toks(y)
            report_loss = loss * tf.cast(toks, tf.float32)
            return report_loss, toks

        def _train_step_with_state(inputs, hidden):
            """Replicated training step."""

            features, y = inputs
            loss, hidden = self.optimizer.update_with_hidden(
                self.model, hidden, features, y)
            toks = self._num_toks(y)
            report_loss = loss * tf.cast(toks, tf.float32)
            return hidden, report_loss, toks

        if get_version(tf) >= 2:
            _train_step_with_state = tf.function(_train_step_with_state)
            _train_step_no_state = tf.function(_train_step_no_state)

        h = None
        for inputs in ts:
            if self.model.requires_state:
                h, step_report_loss, step_toks = _train_step_with_state(
                    inputs, h)
            else:
                step_report_loss, step_toks = _train_step_no_state(inputs)

            epoch_loss.assign_add(step_report_loss)
            nstep_loss.assign_add(step_report_loss)
            epoch_div.assign_add(step_toks)
            nstep_div.assign_add(step_toks)

            step = self.optimizer.global_step.numpy() + 1
            if step % self.nsteps == 0:
                metrics = self.calc_metrics(nstep_loss.numpy(),
                                            nstep_div.numpy())
                self.report(step, metrics, self.nstep_start, 'Train', 'STEP',
                            reporting_fns, self.nsteps)
                nstep_loss.assign(0.0)
                nstep_div.assign(0)
                self.nstep_start = time.perf_counter()

        epoch_loss = epoch_loss.numpy()
        epoch_div = epoch_div.numpy()
        metrics = self.calc_metrics(epoch_loss, epoch_div)
        self.train_epochs += 1
        self.report(self.train_epochs, metrics, start, 'Train', 'EPOCH',
                    reporting_fns)
        return metrics
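calc_metrics and the optimizer wrapper are not shown in this snippet, but the loop above makes the reduction clear: per-step losses are weighted by token counts, accumulated in epoch_loss / epoch_div, and the reported metric is their ratio. A sketch of that reduction, with perplexity as is conventional for a token-level loss (the helper name and the perplexity metric are assumptions, not code from the snippet):

import math

def calc_metrics_sketch(total_loss, total_toks):
    # Average the token-weighted loss accumulated above and report its
    # exponential as perplexity.
    avg_loss = total_loss / float(total_toks)
    return {'avg_loss': avg_loss, 'perplexity': math.exp(avg_loss)}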
Example No. 7
        res = dot_product_attention((q, k, v, None))
        if get_version(tf) < 2:
            with tf.compat.v1.Session() as sess:
                res, gold = sess.run([res, v])
        else:
            res, gold = res.numpy(), v.numpy()
        B, H, T, _ = q.get_shape().as_list()
        for b in range(B):
            for h in range(H):
                for t in range(T):
                    np.testing.assert_allclose(res[b, h, t, :],
                                               np.mean(gold, axis=2)[b, h, :],
                                               atol=1e-5)


@pytest.mark.skipif(get_version(tf) < 2, reason="needs tf2")
def test_attn_value_seq_mask(qkv):
    q, k, v = qkv
    with tf.device("/cpu:0"):
        B, H, T, _ = q.get_shape().as_list()
        q = tf.zeros_like(q)
        lens = np.random.randint(1, T, size=B).astype(np.int32)
        tf_lens = tf.constant(lens)
        mask = tf.expand_dims(
            tf.expand_dims(tf.sequence_mask(tf_lens, T, dtype=tf.float32), 1),
            1)
        dot_product_attention = SeqDotProductAttention(0.0)
        res = dot_product_attention((q, k, v, mask))
        res, gold = res.numpy(), v.numpy()
        for b in range(B):
            for h in range(H):
Example No. 8
from eight_mile.tf.layers import SET_TRAIN_FLAG, get_shape_as_list, autograph_options, masked_fill
from eight_mile.tf.optz import *
from baseline.tf.tfy import setup_tf2_checkpoints
from baseline.utils import get_model_file, get_metric_cmp
from baseline.train import EpochReportingTrainer, register_trainer, register_training_func
from baseline.model import create_model_for
import numpy as np

# Number of batches to prefetch if using tf.datasets
NUM_PREFETCH = 2
# The shuffle buffer size
SHUF_BUF_SZ = 5000

log = logging.getLogger('baseline.timing')

TF_VERSION = get_version(tf)
if TF_VERSION < 2:
    tf.enable_eager_execution()


def to_tensors(ts, lengths_key):
    """Convert a data feed into a tuple of `features` (`dict`) and `y` values

    This method is required to produce `tf.dataset`s from the input data feed

    :param ts: The data feed to convert
    :param lengths_key: The feature key that holds the sequence lengths
    :return: A `tuple` of `features` and `y` (labels)
    """
    keys = ts[0].keys()
    features = dict((k, []) for k in keys)
    for sample in ts:
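to_tensors is cut off here, but the constants defined above (SHUF_BUF_SZ, NUM_PREFETCH) show how its (features, y) output is meant to be consumed. A minimal sketch of that wiring, assuming features is a dict of equal-length arrays (the function name and batchsz parameter are illustrative, not from the snippet):

import tensorflow as tf

SHUF_BUF_SZ = 5000   # shuffle buffer size, as defined above
NUM_PREFETCH = 2     # number of batches to prefetch, as defined above

def make_dataset_sketch(features, y, batchsz):
    # Shuffle, batch, and prefetch the (features, y) tensors produced by to_tensors.
    ds = tf.data.Dataset.from_tensor_slices((features, y))
    ds = ds.shuffle(SHUF_BUF_SZ)
    ds = ds.batch(batchsz)
    ds = ds.prefetch(NUM_PREFETCH)
    return ds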
Example No. 9
        assert out.size(i) == shape[i]


def test_vec_log_sum_exp_batch_stable():
    h = np.random.randint(22, 41)
    i1 = torch.rand(1, h, h)
    i2 = torch.rand(1, h, h)
    i = torch.cat([i1, i2], dim=0)
    lse1 = vec_log_sum_exp(i1, 2)
    lse2 = vec_log_sum_exp(i2, 2)
    one_x_one = torch.cat([lse1, lse2], dim=0)
    lse = vec_log_sum_exp(i, 2)
    np.testing.assert_allclose(one_x_one.numpy(), lse.numpy())
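vec_log_sum_exp comes from the library; the stability the test checks for is the usual max-shift log-sum-exp trick, in which subtracting the per-row maximum before exponentiating avoids overflow. A plain-PyTorch sketch of that reduction (illustrative, not the library implementation):

import torch

def log_sum_exp_sketch(x, dim):
    # Stable log(sum(exp(x))) along dim: subtract the max before exponentiating.
    m, _ = x.max(dim=dim, keepdim=True)
    return m + (x - m).exp().sum(dim=dim, keepdim=True).log()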


@pytest.mark.skipif(get_version(torch) <= 1.4, reason="Old ONNX export")
def test_ONNX_export():
    ort = pytest.importorskip("onnxruntime")

    v = ViterbiBatchSize1(Offsets.GO, Offsets.EOS)

    B = 1
    T = np.random.randint(10, 100)
    H = np.random.randint(24, 76)

    unary = torch.rand(T, B, H)
    trans = torch.rand(1, H, H)
    length = torch.randint(1, T, size=(B, ))

    p1, s1 = v(unary, trans, length)
Example No. 10
    WarmupLearningRateScheduler,
    WarmupLinearScheduler,
    CyclicLRScheduler,
    PiecewiseDecayScheduler,
    ZarembaDecayScheduler,
    CosineDecayScheduler,
    InverseTimeDecayScheduler,
    ExponentialDecayScheduler,
    CompositeLRScheduler,
)

logger = logging.getLogger("mead.layers")
__all__ = []
export = exporter(__all__)

if get_version(tf) < 2:

    @register_lr_scheduler("default")
    class ConstantSchedulerTensorFlow1:
        def __init__(self, **kwargs):
            pass

        def __call__(self, lr, global_step):
            return tf.identity(lr, name="lr")

        def __str__(self):
            return type(self).__name__ + "()"

    @register_lr_scheduler("warmup_linear")
    class WarmupLinearSchedulerTensorFlow1(WarmupLearningRateScheduler):
        def __init__(self, **kwargs):