def attention(query, value, mask, dim, head=8):
    """computes scaled dot-product attention

    query : tensor f32 (b, d_q, t)
    value : tensor f32 (b, d_v, s)
     mask : tensor f32 (b, t, s)
       -> tensor f32 (b, dim, t)

    `dim` must be divisible by `head`
    """
    assert not dim % head
    d, h, c = dim, head, dim // head
    b, _, t = get_shape(query)
    b, _, s = get_shape(value)
    # pretransformations
    v = tf.reshape(layer_aff(value, dim, name='v'), (b, h, c, s)) # bhcs <- bds <- bvs
    k = tf.reshape(layer_aff(value, dim, name='k'), (b, h, c, s)) # bhcs <- bds <- bvs
    q = tf.reshape(layer_aff(query, dim, name='q'), (b, h, c, t)) # bhct <- bdt <- bqt
    # weight
    a = tf.matmul(q, k, transpose_a=True) # bhts <- (bhtc <- bhct) @ bhcs
    a *= c ** -0.5
    if mask is not None:
        a += tf.expand_dims(mask, axis=1)
    a = tf.nn.softmax(a, axis=-1)
    # attend
    y = tf.matmul(v, a, transpose_b=True) # bhct <- bhcs @ (bhst <- bhts)
    # posttransformation
    return layer_aff(tf.reshape(y, (b, d, t)), dim, name='p') # bdt <- bdt <- bhct
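# A minimal numpy sketch (illustration only, separate from the model code) of
# the additive mask convention `attention` expects: `mask` holds 0 where a
# position may be attended and -inf where it is padding, so adding it to the
# scores before the softmax zeroes the padded weights exactly.
import numpy as np

def masked_softmax(scores, mask):
    # scores : (t, s) raw attention logits
    # mask   : (t, s) 0.0 where attendable, -inf where padded
    x = scores + mask
    x = np.exp(x - x.max(axis=-1, keepdims=True))
    return x / x.sum(axis=-1, keepdims=True)

# the log of a 0/1 mask gives the -inf/0 form used throughout this codebase
print(masked_softmax(np.zeros((1, 4)), np.log(np.array([[1., 1., 0., 0.]]))))
# -> [[0.5 0.5 0.  0. ]]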
def infer(self):
    """-> Model with new fields, autoregressive

    len_tgt : i32 ()     steps to unfold aka t
       pred : i32 (b, t) prediction, hard
    """
    dropout = identity
    with scope('infer'):
        with scope('encode'):
            w = self.position(self.max_src) + self.emb_src(self.src)
            w = self.encode(w, self.mask_src, dropout) # bds
        with scope('decode'):
            cap = placeholder(tf.int32, (), self.cap)
            msk = tf.log(tf.expand_dims(causal_mask(cap), axis= 0)) # 1tt
            pos = self.position(cap) # dt
            i, q = tf.constant(0), tf.zeros_like(self.src[:,:1]) + self.bos
            def body(i, q):
                j = i + 1
                x = pos[:,:j] + self.emb_tgt(q) # bdj <- bj
                x = self.decode(x, msk[:,:j,:j], w, self.mask_src, dropout) # bdj
                p = tf.expand_dims(          # b1
                    tf.argmax(               # b
                        self.emb_tgt(        # bn
                            tf.squeeze(      # bd
                                x[:,:,-1:]   # bd1 <- bdj
                                , axis= -1))
                        , axis= -1, output_type= tf.int32)
                    , axis= -1)
                return j, tf.concat((q, p), axis= -1) # bk <- bj, b1
            cond = lambda i, q: ((i < cap) & ~ tf.reduce_all(tf.equal(q[:,-1], self.eos)))
            # `q` grows along its last axis, so its shape invariant must be relaxed
            _, p = tf.while_loop(
                cond, body, (i, q)
                , shape_invariants= (i.get_shape(), tf.TensorShape((None, None)))
                , back_prop= False, swap_memory= True)
        pred = p[:,1:]
    return Model(self, len_tgt= cap, pred= pred)
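# Hedged usage sketch for `infer` (the session `sess`, the built model `m`,
# and `src_batch` are hypothetical; assumes `m` was built with `data(...)`
# and then `infer()` as above). Decoding unfolds the while_loop until every
# row has emitted `eos` or `cap` steps have passed:
#
#     hard = sess.run(m.pred, {m.src_: src_batch})  # i32 (b, <=cap)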
def data(self, sid, tid, src= None, tgt= None):
    """-> Model with new fields

    position : Sinusoid
        src_ : i32 (b, ?)    source feed, in range `[0, dim_src)`
        tgt_ : i32 (b, ?)    target feed, in range `[0, dim_tgt)`
         src : i32 (b, s)    source with `eos` trimmed among the batch
         tgt : i32 (b, t)    target with `eos` trimmed among the batch, padded with `bos`
        mask : b8  (b, t)    target sequence mask
        true : i32 (?,)      target references
     max_tgt : i32 ()        maximum target length
     max_src : i32 ()        maximum source length
    mask_tgt : f32 (1, t, t) target attention mask
    mask_src : f32 (b, 1, s) source attention mask
    """
    src_ = placeholder(tf.int32, (None, None), src, 'src_')
    tgt_ = placeholder(tf.int32, (None, None), tgt, 'tgt_')
    with scope('src'):
        src, msk, max_src = trim(src_, self.eos)
        mask_src = tf.log(tf.expand_dims(tf.to_float(msk), axis= 1))
    with scope('tgt'):
        tgt, msk, max_tgt = trim(tgt_, self.eos)
        mask = tf.pad(msk, ((0,0),(1,0)), constant_values= True)
        btru = tf.pad(tgt, ((0,0),(1,0)), constant_values= self.bos)
        true = tf.pad(tgt, ((0,0),(0,1)), constant_values= self.eos)
        true, tgt = tf.boolean_mask(true, mask), btru
        max_tgt += 1
        mask_tgt = tf.log(tf.expand_dims(causal_mask(max_tgt), axis= 0))
    return Model(
        position= Sinusoid(self.dim_emb, self.cap)
        , src_= src_, mask_src= mask_src, max_src= max_src, src= src
        , tgt_= tgt_, mask_tgt= mask_tgt, max_tgt= max_tgt, tgt= tgt
        , true= true, mask= mask
        , emb_src= self.embeds[sid]
        , emb_tgt= self.embeds[tid]
        , **self)
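# Tiny numpy illustration (hypothetical values) of the teacher-forcing pads
# built in `data` above: the decoder input prepends `bos` while the reference
# appends `eos`, so step i of `tgt` is trained to predict step i of `true`.
import numpy as np
bos, eos = 2, 1
tgt = np.array([[5, 6, 7]])
btru = np.pad(tgt, ((0, 0), (1, 0)), constant_values=bos) # [[2 5 6 7]] decoder input
true = np.pad(tgt, ((0, 0), (0, 1)), constant_values=eos) # [[5 6 7 1]] references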
def data(self, src=None, tgt=None, len_cap=None):
    """-> Transformer with new fields

        src_ : i32 (b, ?) source feed, in range `[0, dim_src)`
        tgt_ : i32 (b, ?) target feed, in range `[0, dim_tgt)`
         src : i32 (b, s) source with `end` trimmed among the batch
         tgt : i32 (b, t) target with `end` trimmed among the batch
        mask : f32 (b, s) source mask
        gold : i32 (b, t) target one step ahead
    position : Sinusoid

    setting `len_cap` makes it more efficient for training. you won't be able
    to feed it longer sequences, but it doesn't affect any model parameters.
    """
    end, dim = self.end, self.dim
    count_not_all = lambda x: tf.reduce_sum(tf.to_int32(~tf.reduce_all(x, 0)))
    with tf.variable_scope('src'):
        src = src_ = placeholder(tf.int32, (None, None), src)
        len_src = count_not_all(tf.equal(src, end))
        src = src[:, :len_src]
    with tf.variable_scope('tgt'):
        tgt = tgt_ = placeholder(tf.int32, (None, None), tgt)
        len_tgt = count_not_all(tf.equal(tgt, end))
        tgt, gold = tgt[:, :len_tgt], tgt[:, 1:1 + len_tgt]
    return Transformer(
        position=Sinusoid(dim, len_cap),
        src_=src_, src=src,
        mask=tf.to_float(tf.expand_dims(tf.not_equal(src, end), 1)),
        tgt_=tgt_, tgt=tgt, gold=gold,
        **self)
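# Numpy sketch (hypothetical batch) of the `count_not_all` trimming trick
# above: a column survives as long as at least one row holds a non-`end`
# token there, so the batch is cut to its longest unpadded sequence.
import numpy as np
end = 1
batch = np.array([[3, 4, 1, 1],
                  [5, 1, 1, 1]])
len_src = np.sum(~np.all(batch == end, axis=0)) # 2
print(batch[:, :len_src]) # [[3 4]
                          #  [5 1]]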
def vAe(mode,
        src=None,
        tgt=None,
        # model spec
        dim_tgt=8192,
        dim_emb=512,
        dim_rep=1024,
        rnn_layers=3,
        bidirectional=True,
        bidir_stacked=True,
        attentive=False,
        logit_use_embed=True,
        # training spec
        accelerate=1e-4,
        learn_rate=1e-3,
        bos=2,
        eos=1):
    # dim_tgt : vocab size
    # dim_emb : model dimension
    # dim_rep : representation dimension
    #
    # unk=0 for word dropout
    assert mode in ('train', 'valid', 'infer')
    self = Record(bos=bos, eos=eos)
    with scope('step'):
        step = self.step = tf.train.get_or_create_global_step()
        rate = accelerate * tf.to_float(step)
        rate_keepwd = self.rate_keepwd = tf.sigmoid(rate)
        rate_anneal = self.rate_anneal = tf.tanh(rate)
        rate_update = self.rate_update = learn_rate / (tf.sqrt(rate) + 1.0)
    with scope('src'):
        src = self.src = placeholder(tf.int32, (None, None), src, 'src')
        src = tf.transpose(src) # time major order
        src, msk_src, len_src = trim(src, eos)
    with scope('tgt'):
        tgt = self.tgt = placeholder(tf.int32, (None, None), tgt, 'tgt')
        tgt = tf.transpose(tgt) # time major order
        tgt, msk_tgt, len_tgt = trim(tgt, eos)
        msk_tgt = tf.pad(msk_tgt, ((1, 0), (0, 0)), constant_values=True)
        # pads for decoder : lead=[bos]+tgt -> gold=tgt+[eos]
        lead, gold = tgt, tf.pad(tgt, paddings=((0, 1), (0, 0)), constant_values=eos)
        if 'train' == mode:
            lead *= tf.to_int32(tf.random_uniform(tf.shape(lead)) < rate_keepwd)
        lead = self.lead = tf.pad(lead, paddings=((1, 0), (0, 0)), constant_values=bos)
    # s : src length
    # t : tgt length plus one padding, either eos or bos
    # b : batch size
    #
    # len_src : b aka s
    # msk_src : sb without padding
    # msk_tgt : tb with eos
    #
    # lead : tb with bos
    # gold : tb with eos
    with scope('embed'):
        b = (6 / (dim_tgt / dim_emb + 1)) ** 0.5
        embedding = tf.get_variable(
            'embedding', (dim_tgt, dim_emb),
            initializer=tf.random_uniform_initializer(-b, b))
        emb_tgt = tf.gather(embedding, lead, name='emb_tgt') # (t, b) -> (t, b, dim_emb)
        emb_src = tf.gather(embedding, src, name='emb_src')  # (s, b) -> (s, b, dim_emb)
    with scope('encode'): # (s, b, dim_emb) -> (b, dim_emb)
        reverse = partial(tf.reverse_sequence, seq_lengths=len_src, seq_axis=0, batch_axis=1)
        if bidirectional and bidir_stacked:
            for i in range(rnn_layers):
                with scope("rnn{}".format(i + 1)):
                    emb_fwd, _ = layer_rnn(1, dim_emb, name='fwd')(emb_src)
                    emb_bwd, _ = layer_rnn(1, dim_emb, name='bwd')(reverse(emb_src))
                    hs = emb_src = tf.concat((emb_fwd, reverse(emb_bwd)), axis=-1)
        elif bidirectional:
            with scope("rnn"):
                emb_fwd, _ = layer_rnn(rnn_layers, dim_emb, name='fwd')(emb_src)
                emb_bwd, _ = layer_rnn(rnn_layers, dim_emb, name='bwd')(reverse(emb_src))
                hs = tf.concat((emb_fwd, reverse(emb_bwd)), axis=-1)
        else:
            hs, _ = layer_rnn(rnn_layers, dim_emb, name='rnn')(emb_src)
        with scope('cata'):
            # extract the final states from the outputs: bd <- sbd, b2
            h = tf.gather_nd(
                hs,
                tf.stack((len_src - 1, tf.range(tf.size(len_src), dtype=tf.int32)), axis=1))
            if attentive: # todo fixme
                # the values are the outputs from all non-padding steps;
                # the queries are the final states;
                h = layer_nrm(h + tf.squeeze( # bd <- bd1
                    attention( # bd1 <- bd1, bds, b1s
                        tf.expand_dims(h, axis=2), # query: bd1 <- bd
                        tf.transpose(hs, (1, 2, 0)), # value: bds <- sbd
                        tf.log(tf.to_float( # -inf,0 mask: b1s <- sb <- bs
                            tf.expand_dims(tf.transpose(msk_src), axis=1))),
                        int(h.shape[-1])), 2))
    with scope('latent'): # (b, dim_emb) -> (b, dim_rep) -> (b, dim_emb)
        # h = layer_aff(h, dim_emb, name='in')
        mu = self.mu = layer_aff(h, dim_rep, name='mu')
        lv = self.lv = layer_aff(h, dim_rep, name='lv')
        with scope('z'):
            h = mu
            if 'train' == mode:
                h += tf.exp(0.5 * lv) * tf.random_normal(shape=tf.shape(lv))
            self.z = h
        h = layer_aff(h, dim_emb, name='ex')
    with scope('decode'): # (b, dim_emb) -> (t, b, dim_emb) -> (?, dim_emb)
        h = self.state_in = tf.stack((h,) * rnn_layers)
        # chained unpack: h gets the outputs, state_ex the final state
        h, _ = _, (self.state_ex,) = layer_rnn(rnn_layers, dim_emb, name='rnn')(
            emb_tgt, initial_state=(h,))
        if 'infer' != mode:
            h = tf.boolean_mask(h, msk_tgt)
        h = layer_aff(h, dim_emb, name='out')
    with scope('logits'): # (?, dim_emb) -> (?, dim_tgt)
        if logit_use_embed:
            logits = self.logits = tf.tensordot(h, (dim_emb ** -0.5) * tf.transpose(embedding), 1)
        else:
            logits = self.logits = layer_aff(h, dim_tgt)
    with scope('prob'):
        prob = self.prob = tf.nn.softmax(logits)
    with scope('pred'):
        pred = self.pred = tf.argmax(logits, -1, output_type=tf.int32)
    if 'infer' != mode:
        labels = tf.boolean_mask(gold, msk_tgt, name='labels')
        with scope('errt'):
            errt_samp = self.errt_samp = tf.to_float(tf.not_equal(labels, pred))
            errt = self.errt = tf.reduce_mean(errt_samp)
        with scope('loss'):
            with scope('loss_gen'):
                loss_gen_samp = self.loss_gen_samp = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=labels, logits=logits)
                loss_gen = self.loss_gen = tf.reduce_mean(loss_gen_samp)
            with scope('loss_kld'):
                loss_kld_samp = self.loss_kld_samp = 0.5 * (tf.square(mu) + tf.exp(lv) - lv - 1.0)
                loss_kld = self.loss_kld = tf.reduce_mean(loss_kld_samp)
            loss = self.loss = rate_anneal * loss_kld + loss_gen
    if 'train' == mode:
        with scope('train'):
            train_step = self.train_step = tf.train.AdamOptimizer(rate_update).minimize(loss, step)
    return self
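# Numpy check (hypothetical values) of the two latent-space formulas used in
# `vAe`: the reparameterization z = mu + exp(lv/2) * eps with eps ~ N(0, 1),
# and the per-dimension closed form of KL(N(mu, exp(lv)) || N(0, 1)) that
# `loss_kld_samp` computes.
import numpy as np
mu, lv = 0.3, -0.2
eps = np.random.randn()
z = mu + np.exp(0.5 * lv) * eps                      # one latent sample
kld = 0.5 * (np.square(mu) + np.exp(lv) - lv - 1.0)  # ~0.054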
def model(mode,
          src_dwh,
          tgt_dwh,
          src_idx=None, len_src=None,
          tgt_img=None, tgt_idx=None, len_tgt=None,
          num_layers=3,
          num_units=512,
          learn_rate=1e-3,
          decay_rate=1e-2,
          dropout=0.1):
    assert mode in ('train', 'valid', 'infer')
    self = Record()
    src_d, src_w, src_h = src_dwh
    tgt_d, tgt_w, tgt_h = tgt_dwh
    with scope('source'):
        # input nodes
        src_idx = self.src_idx = placeholder(tf.int32, (None, None), src_idx, 'src_idx') # n s
        len_src = self.len_src = placeholder(tf.int32, (None,), len_src, 'len_src') # n
        # time major order
        src_idx = tf.transpose(src_idx, (1, 0)) # s n
        emb_src = tf.one_hot(src_idx, src_d) # s n v
        for i in range(num_layers):
            with scope("rnn{}".format(i + 1)):
                emb_fwd, _ = tf.contrib.cudnn_rnn.CudnnGRU(
                    1, num_units, dropout=dropout, name='fwd')(
                        emb_src, training='train' == mode)
                emb_bwd, _ = tf.contrib.cudnn_rnn.CudnnGRU(
                    1, num_units, dropout=dropout, name='bwd')(
                        tf.reverse_sequence(emb_src, len_src, seq_axis=0, batch_axis=1),
                        training='train' == mode)
                emb_src = tf.concat(
                    (emb_fwd, tf.reverse_sequence(emb_bwd, len_src, seq_axis=0, batch_axis=1)),
                    axis=-1)
        # emb_src = tf.layers.dense(emb_src, num_units, name='reduce_concat') # s n d
        emb_src = self.emb_src = tf.transpose(emb_src, (1, 2, 0)) # n d s
    with scope('target'):
        # input nodes
        tgt_img = self.tgt_img = placeholder(tf.uint8, (None, None, tgt_h, tgt_w), tgt_img, 'tgt_img') # n t h w
        tgt_idx = self.tgt_idx = placeholder(tf.int32, (None, None), tgt_idx, 'tgt_idx') # n t
        len_tgt = self.len_tgt = placeholder(tf.int32, (None,), len_tgt, 'len_tgt') # n
        # time major order
        tgt_idx = tf.transpose(tgt_idx) # t n
        tgt_img = tf.transpose(tgt_img, (1, 0, 2, 3)) # t n h w
        tgt_img = flatten(tgt_img, 2, 3) # t n hw
        # normalize pixels to binary
        tgt_img = tf.to_float(tgt_img) / 255.0
        # tgt_img = tf.round(tgt_img)
        # todo consider adding noise
        # causal padding
        fire = self.fire = tf.pad(tgt_img, ((1, 0), (0, 0), (0, 0)), constant_values=0.0)
        true = self.true = tf.pad(tgt_img, ((0, 1), (0, 0), (0, 0)), constant_values=1.0)
        tidx = self.tidx = tf.pad(tgt_idx, ((0, 1), (0, 0)), constant_values=1)
        mask_tgt = tf.transpose(tf.sequence_mask(len_tgt + 1)) # t n
    with scope('decode'):
        # todo: condition the decoder on a latent representation
        decoder = self.decoder = tf.contrib.cudnn_rnn.CudnnGRU(num_layers, num_units, dropout=dropout)
        state_in = self.state_in = tf.zeros((num_layers, tf.shape(fire)[1], num_units))
        # chained unpack: x gets the outputs, state_ex the final state
        x, _ = _, (self.state_ex,) = decoder(fire, initial_state=(state_in,), training='train' == mode)
        # transform mask to -inf and 0 so it can simply be added to the attention logits
        mask = tf.log(tf.sequence_mask(len_src, dtype=tf.float32)) # n s
        mask = tf.expand_dims(mask, 1) # n 1 s
        # multi-head scaled dot-product attention
        x = tf.transpose(x, (1, 2, 0)) # t n d ---> n d t
        attn = Attention(num_units, num_units, 2 * num_units)(x, emb_src, mask)
        if 'train' == mode:
            attn = tf.nn.dropout(attn, 1 - dropout)
        x = Normalize(num_units)(x + attn)
        x = tf.transpose(x, (2, 0, 1)) # n d t ---> t n d
    if 'infer' != mode:
        x = tf.boolean_mask(x, mask_tgt)
        true = tf.boolean_mask(true, mask_tgt)
        tidx = tf.boolean_mask(tidx, mask_tgt)
    with scope('output'):
        y = tf.layers.dense(x, tgt_h * tgt_w, name='dense_img')
        z = tf.layers.dense(x, tgt_d, name='logit_idx')
        pred = self.pred = tf.clip_by_value(y, 0.0, 1.0)
        prob = self.prob = tf.nn.softmax(z)
        pidx = self.pidx = tf.argmax(z, axis=-1, output_type=tf.int32)
    with scope('losses'):
        diff = true - pred
        mae = self.mae = tf.reduce_mean(tf.abs(diff), axis=-1)
        mse = self.mse = tf.reduce_mean(tf.square(diff), axis=-1)
        xid = self.xid = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=z, labels=tidx)
        err = self.err = tf.not_equal(tidx, pidx)
        loss = tf.reduce_mean(xid)
    with scope('update'):
        step = self.step = tf.train.get_or_create_global_step()
        lr = self.lr = learn_rate / (1.0 + decay_rate * tf.sqrt(tf.to_float(step)))
        if 'train' == mode:
            down = self.down = tf.train.AdamOptimizer(lr).minimize(loss, step)
    return self
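# Hedged training-loop sketch for `model` (the iterator `batches` and the
# per-batch arrays `src`, `lens`, `imgs`, `idxs`, `lent` are hypothetical;
# assumes the placeholders and the `down` train op built above):
#
#     m = model('train', src_dwh, tgt_dwh)
#     with tf.Session() as sess:
#         sess.run(tf.global_variables_initializer())
#         for src, lens, imgs, idxs, lent in batches:
#             sess.run(m.down, {m.src_idx: src, m.len_src: lens,
#                               m.tgt_img: imgs, m.tgt_idx: idxs, m.len_tgt: lent})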