def __init__(self):
    self.ps = U.Params(params).init_comps()
    self.pre = None
    self.post = None
    i = tf.constant([0.] * (4 * 10), shape=(4, 10))
    self.src_b = tf.Variable(initial_value=i)
    i = tf.constant([0.] * (4 * 10), shape=(4, 10))
    self.mem_b = tf.Variable(initial_value=i)

def test_with_owner():
    a = L.Attn(Owner())
    a.build([(4, 10, 16), (), (4, 18, 16), ()])
    src = tf.constant([0.] * (4 * 10 * 16), shape=(4, 10, 16))
    bias = tf.constant([0.] * (4 * 10), shape=(4, 10))
    bias = tf.expand_dims(tf.expand_dims(bias, axis=1), axis=3)
    mem = tf.constant([0.] * (4 * 15 * 16), shape=(4, 15, 16))
    ctx = tf.constant([0.] * (4 * 15 * 16), shape=(4, 15, 16))
    a.call([src, bias, mem, ctx])

def test_owner_none():
    a = L.Attn(Owner())
    a.build([(4, 10, 16)])
    src = tf.constant([0.] * (4 * 10 * 16), shape=(4, 10, 16))
    a.call([src])
    bias = tf.constant([0.] * (4 * 10), shape=(4, 10))
    bias = tf.expand_dims(tf.expand_dims(bias, axis=1), axis=3)
    a.call([src, bias])
    ctx = tf.constant([0.] * (4 * 15 * 16), shape=(4, 15, 16))
    a.call([src, bias, None, ctx])

def test_tokembed():
    e = TokEmbed(ps)
    e.build((1, 5))
    src = tf.constant([1, 2, 0, 3, 0], shape=(1, 5))
    e.call(src)
    ps.emb_one_hot = True
    e = TokEmbed(ps)
    e.build((1, 5))
    e.call(src)

def top_logp(self, ctx, bias, i):
    cfg = self.cfg
    # start from the running per-beam log-probs, broadcast over the vocabulary
    y = tf.zeros((cfg.batch_size, cfg.beam_size, cfg.num_toks))
    y += tf.expand_dims(self.logp, axis=2)
    b = tf.range(cfg.batch_size)
    ii = tf.constant([i] * cfg.batch_size)
    for j in range(cfg.beam_size):
        jj = tf.constant([j] * cfg.batch_size)
        sel = tf.stack([b, jj, ii], axis=1)
        yj = self.to_logp(self.tgt[:, j, :], ctx, bias, i)[1]
        y = tf.tensor_scatter_nd_add(y, sel, yj)
    # merge all beams of a batch row before picking the 2*beam_size best candidates
    y = tf.reshape(y, (-1, cfg.beam_size * cfg.num_toks))
    logp, idx = tf.math.top_k(y, k=2 * cfg.beam_size)
    return logp, idx

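# Hedged sketch of the flatten-then-top_k step used in top_logp above: candidates
# from all beams of one batch row are merged into a single axis before selecting
# the 2*beam_size best, and the flat index decomposes back into (beam, token).
# The function name and all sizes here are illustrative assumptions only.
def _sketch_top_k_over_beams():
    import tensorflow as tf
    beam_size, num_toks = 2, 3
    # (batch=1, beam=2, toks=3) log-probabilities
    y = tf.math.log(tf.constant([[[0.5, 0.3, 0.2], [0.6, 0.3, 0.1]]]))
    flat = tf.reshape(y, (-1, beam_size * num_toks))
    logp, idx = tf.math.top_k(flat, k=2 * beam_size)
    beam, tok = idx // num_toks, idx % num_toks  # recover origin beam and token id
    return logp, beam, tok
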
def build(self, input_shape):
    cfg = self.cfg
    tgt = input_shape[0]
    assert tgt[0] == cfg.batch_size
    # only beam 0 is live initially: log-prob 0 for it, -inf for the others
    y = tf.constant([[0.] + [-float('inf')] * (cfg.beam_size - 1)])
    self._logp = tf.tile(y, [cfg.batch_size, 1])
    sh = (cfg.batch_size, cfg.beam_size)
    self._score = tf.ones(shape=sh) * utils.big_neg
    self._flag = tf.zeros(dtype='bool', shape=sh)
    return super().build(input_shape)

def test_w_grad():
    e = TokEmbed(ps)
    e.build((None, 3))
    ins = tf.constant([[0, 1, 0]], dtype='int32')
    with tf.GradientTape() as tape:
        out = e(ins)
    print('===', out, e.weights)
    gs = tape.gradient(out, e.weights)
    opt = adagrad.AdagradOptimizer(0.1)
    opt.apply_gradients(zip(gs, e.weights))
    print('###', len(gs), 1)

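# Hedged sketch of the same tape/apply step using the TF2 Keras Adagrad optimizer
# instead of the legacy adagrad.AdagradOptimizer used above; `layer` and `ins`
# stand in for the TokEmbed instance and its int32 inputs and are assumptions here.
def _sketch_keras_adagrad(layer, ins):
    import tensorflow as tf
    opt = tf.keras.optimizers.Adagrad(learning_rate=0.1)
    with tf.GradientTape() as tape:
        out = layer(ins)
    gs = tape.gradient(out, layer.trainable_weights)
    opt.apply_gradients(zip(gs, layer.trainable_weights))
    return gs
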
def __call__(self, step):
    lr = tf.constant(1.0)
    # the schedule is a '*'-separated product of named factors
    for name in [n.strip() for n in self.schedule.split('*')]:
        if name == 'constant':
            lr *= self.constant
        elif name == 'linear_warmup':
            lr *= tf.minimum(1.0, step / self.warmup_steps)
        else:
            assert name == 'rsqrt_decay'
            lr *= tf.math.rsqrt(tf.maximum(step, self.warmup_steps))
    tf.summary.scalar('learning_rate', lr, step=tf.cast(step, tf.int64))
    return lr

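# Hedged sketch of wiring a __call__(step)-style schedule like the one above into
# a Keras optimizer; the LearningRateSchedule subclass and its field values are
# illustrative assumptions, not the module's own classes.
def _sketch_schedule_with_optimizer():
    import tensorflow as tf

    class WarmupRsqrt(tf.keras.optimizers.schedules.LearningRateSchedule):
        def __init__(self, warmup_steps=1000.0):
            self.warmup_steps = warmup_steps

        def __call__(self, step):
            step = tf.cast(step, tf.float32)
            warm = tf.minimum(1.0, step / self.warmup_steps)
            return warm * tf.math.rsqrt(tf.maximum(step, self.warmup_steps))

    return tf.keras.optimizers.Adam(learning_rate=WarmupRsqrt())
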
def append_tok(self, idx, i, **kw):
    cfg = self.cfg
    k = 2 * cfg.beam_size
    # batch index for each of the k candidates, shaped (batch_size, k)
    b = tf.range(cfg.batch_size * k) // k
    b = tf.reshape(b, (cfg.batch_size, k))
    # recover the source beam of each flat candidate index
    beam = idx // cfg.num_toks
    sel = tf.stack([b, beam], axis=2)
    y = tf.gather_nd(self.tgt, sel)
    ii = tf.constant([i] * cfg.batch_size * k)
    ii = tf.reshape(ii, (cfg.batch_size, k))
    sel = tf.stack([b, beam, ii], axis=2)
    # new token id for each candidate, written at step position i
    u = idx % cfg.num_toks
    tgt = tf.tensor_scatter_nd_update(y, sel, u)
    return tgt

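# Hedged sketch of the batched gather/scatter indexing used in append_tok above:
# gather_nd selects whole (batch, beam) rows, tensor_scatter_nd_update then writes
# a single token at a given position of each selected row. The function name and
# all sizes/values are illustrative assumptions only.
def _sketch_gather_scatter():
    import tensorflow as tf
    tgt = tf.reshape(tf.range(2 * 2 * 4), (2, 2, 4))    # (batch, beam, seq_len)
    sel = tf.constant([[[0, 1]], [[1, 0]]])             # beam 1 of row 0, beam 0 of row 1
    y = tf.gather_nd(tgt, sel)                          # (2, 1, 4)
    pos = tf.constant([[[0, 0, 2]], [[1, 0, 2]]])       # position 2 of each selected row
    u = tf.constant([[99], [77]])                       # new token ids
    return tf.tensor_scatter_nd_update(y, pos, u)
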
def unpack():
    for v in vs:
        yield v.numpy() if isinstance(v, type(tf.constant(0))) else v

def test_shift():
    a = L.Attn(Owner())
    x = tf.constant([1, 2, 3, 4, 5, 6], shape=(1, 1, 2, 3))
    tf.print(x)
    x = a.shift(x)
    tf.print(x)