def __call__(self, x, mask, dropout, name=None):
    """Apply the block to `x` and return the transformed value.

    Two residual sublayers run in sequence, each followed by its own
    normalization: `self.att` called on `(x, x, mask)`, then `self.fwd`
    called on the result.  `dropout` is applied to each sublayer output
    before it is added back to the sublayer input.

    `name` overrides `self.name` as the enclosing variable scope.
    """
    scope_name = name or self.name
    with tf.variable_scope(scope_name):
        with tf.variable_scope('att'):
            attended = dropout(self.att(x, x, mask))
            x = self.norm_att(x + attended)
        with tf.variable_scope('fwd'):
            forwarded = dropout(self.fwd(x))
            x = self.norm_fwd(x + forwarded)
    return x
def autoreg(i, x, vs, y, p):
    """Run one autoregressive decoding step; returns the next loop state.

    i  : ()              time step from 0 to t=len_tgt
    x  : (b, 1)          x_i
    vs : one `(b, t, dim)` tensor of attention values per decoder layer
    y  : (b, t, dim_tgt) logit over x one step ahead
    p  : (b, t)          predictions

    Relies on `pos`, `dropout`, `emb_tgt`, `decode`, `w`, `mask`, `logit`
    and `random` from the enclosing scope.
    """
    with tf.variable_scope('emb_tgt'):
        x = pos[i] + dropout(emb_tgt.embed(x))
    cached = []
    for layer, values in zip(decode, vs):
        # extend this layer's cache with the current input before decoding
        with tf.variable_scope('cache_v'):
            values = tf.concat((values, x), 1)
        cached.append(values)
        x = layer(x, values, w, mask, dropout)
    x = logit(x)
    with tf.variable_scope('cache_y'):
        y = tf.concat((y, x), 1)
    if not random:
        # greedy choice
        x = tf.argmax(x, -1, output_type=tf.int32, name='argmax')
    else:
        # draw the next symbol from the predicted logits
        with tf.variable_scope('sample'):
            x = tf.multinomial(tf.squeeze(x, 1), 1, output_dtype=tf.int32)
    with tf.variable_scope('cache_p'):
        p = tf.concat((p, x), 1)
    return i + 1, x, tuple(cached), y, p
def forcing(self, trainable=True):
    """-> Transformer with new fields, teacher forcing

    output : f32 (b, t, dim_tgt) prediction on logit scale
    prob   : f32 (b, t, dim_tgt) prediction, soft
    pred   : i32 (b, t)          prediction, hard
    loss   : f32 ()              prediction loss
    acc    : f32 ()              accuracy

    `self.dropout` is used only when `trainable`; otherwise `identity`
    takes its place.

    must be called after `data`.

    """
    logit = self.logit
    dropout = self.dropout if trainable else identity
    mask, position = self.mask, self.position
    src, emb_src, encode = self.src, self.emb_src, self.encode
    tgt, emb_tgt, decode = self.tgt, self.emb_tgt, self.decode
    with tf.variable_scope('emb_src_forcing'):
        src_pos = position(tf.shape(src)[1])
        w = src_pos + dropout(emb_src.embed(src))
    with tf.variable_scope('emb_tgt_forcing'):
        tgt_pos = position(tf.shape(tgt)[1])
        x = tgt_pos + dropout(emb_tgt.embed(tgt))
    with tf.variable_scope('encode_forcing'):
        for layer in encode:
            w = layer(w, mask, dropout)
    with tf.variable_scope('decode_forcing'):
        with tf.variable_scope('mask'):
            # dense lower-triangular matrix of ones, one row per target step
            steps = tf.shape(x)[1]
            ones = tf.ones((steps, steps))
            causal_mask = tf.linalg.LinearOperatorLowerTriangular(ones).to_dense()
        for layer in decode:
            x = layer(x, x, w, mask, dropout, causal_mask)
    output = logit(x)
    pred = tf.argmax(output, -1, output_type=tf.int32, name='pred')
    return Transformer(output=output, pred=pred, **self)._eval()
def __init__(self, dim, dim_mid, act, name):
    """Create the sublayers of the block inside the variable scope `name`.

    Sets `self.name`, an `Attention` layer `self.att` with its `Normalize`
    layer `self.norm_att` (scope 'att'), and a `Forward` layer `self.fwd`
    with its `Normalize` layer `self.norm_fwd` (scope 'fwd').
    """
    with tf.variable_scope(name):
        self.name = name
        with tf.variable_scope('att'):
            attention = Attention(dim, layer=Forward, mid=dim_mid, act=act)
            self.att = attention
            self.norm_att = Normalize(dim)
        with tf.variable_scope('fwd'):
            forward = Forward(dim, dim, dim_mid, act)
            self.fwd = forward
            self.norm_fwd = Normalize(dim)
def __call__(self, x, v, w, m, dropout, mask=None, name=None):
    """Apply the block to `x` and return the transformed value.

    Three residual sublayers run in sequence, each followed by its own
    normalization:

    - `self.csl` called on `(x, v, mask)`
    - `self.att` called on `(x, w, m)`
    - `self.fwd` called on `x`

    `dropout` is applied to each sublayer output before it is added back
    to the sublayer input.  `name` overrides `self.name` as the enclosing
    variable scope.
    """
    scope_name = name or self.name
    with tf.variable_scope(scope_name):
        with tf.variable_scope('csl'):
            residual = dropout(self.csl(x, v, mask))
            x = self.norm_csl(x + residual)
        with tf.variable_scope('att'):
            residual = dropout(self.att(x, w, m))
            x = self.norm_att(x + residual)
        with tf.variable_scope('fwd'):
            residual = dropout(self.fwd(x))
            x = self.norm_fwd(x + residual)
    return x
def encoder(x, dim_btlnk, dim_x):
    """Encode `x` and draw a latent sample; returns `(z, mu, lv)`.

    `x` goes through a `Linear` layer, `elu` and a `Normalize` layer.  Two
    further `Linear` layers produce `mu` and `lv`, and the sample is
    `z = mu + exp(0.5 * lv) * noise` with `noise` drawn from
    `tf.random_normal` in the shape of `lv`.
    """
    # layers are constructed in the same order as they were in the nested call
    norm = Normalize(dim_btlnk, "nrm")
    lin = Linear(dim_btlnk, dim_x, name='lin')
    hidden = norm(tf.nn.elu(lin(x)))
    with tf.variable_scope('latent'):
        mu = Linear(dim_btlnk, dim_btlnk, name='mu')(hidden)
        lv = Linear(dim_btlnk, dim_btlnk, name='lv')(hidden)
    with tf.variable_scope('z'):
        spread = tf.exp(0.5 * lv)
        noise = tf.random_normal(shape=tf.shape(lv))
        z = mu + spread * noise
    return z, mu, lv
def _eval(self):
    """-> Transformer with new fields `prob`, `loss` and `acc`

    acc  : mean of the element-wise equality between `gold` and `pred`
    loss : mean softmax cross entropy between `output` (as logits) and
           `smooth(gold)` (as labels)
    prob : softmax of `output`
    """
    gold, pred = self.gold, self.pred
    output, smooth = self.output, self.smooth
    with tf.variable_scope('acc'):
        hits = tf.to_float(tf.equal(gold, pred))
        acc = tf.reduce_mean(hits)
    with tf.variable_scope('loss'):
        labels = smooth(gold)
        xent = tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=output, labels=labels)
        loss = tf.reduce_mean(xent)
    with tf.variable_scope('prob'):
        prob = tf.nn.softmax(output, name='prob')
    return Transformer(prob=prob, loss=loss, acc=acc, **self)
def new(end=1,
        dim_src=256,
        dim=256,
        dim_tgt=256,
        dim_mid=512,
        num_layer=2,
        logit_share_embedding=False,
        act=tf.nn.relu,
        smooth=0.1,
        dropout=0.1):
    """-> Transformer with fields

        end : i32 ()
    emb_src : Linear
    emb_tgt : Linear
     encode : tuple EncodeBlock
     decode : tuple DecodeBlock
      logit : Affine, or `emb_tgt.transpose('logit')` when
              `logit_share_embedding` is set
     smooth : Smooth
    dropout : Dropout

    plus the plain values `dim` and `dim_tgt`.  `dim` must be even.

    `end` is treated as the padding for both source and target.

    """
    assert not dim % 2
    emb_src = Linear(dim, dim_src, 'emb_src')
    emb_tgt = Linear(dim, dim_tgt, 'emb_tgt')
    # layers are named "layer1" .. "layer<num_layer>"
    layer_names = ["layer{}".format(n) for n in range(1, num_layer + 1)]
    with tf.variable_scope('encode'):
        encode = tuple([EncodeBlock(dim, dim_mid, act, nm) for nm in layer_names])
    with tf.variable_scope('decode'):
        decode = tuple([DecodeBlock(dim, dim_mid, act, nm) for nm in layer_names])
    end_const = tf.constant(end, tf.int32, (), 'end')
    if logit_share_embedding:
        logit = emb_tgt.transpose('logit')
    else:
        logit = Affine(dim_tgt, dim, 'logit')
    return Transformer(
        dim=dim,
        dim_tgt=dim_tgt,
        end=end_const,
        emb_src=emb_src,
        encode=encode,
        emb_tgt=emb_tgt,
        decode=decode,
        logit=logit,
        smooth=Smooth(smooth, dim_tgt),
        dropout=Dropout(dropout, (None, None, dim)))
def data(self, src=None, tgt=None, len_cap=None):
    """-> Transformer with new fields

        src_ : i32 (b, ?) source feed, in range `[0, dim_src)`
        tgt_ : i32 (b, ?) target feed, in range `[0, dim_tgt)`
         src : i32 (b, s) source with `end` trimmed among the batch
         tgt : i32 (b, t) target with `end` trimmed among the batch
        mask : f32 (b, s) source mask
        gold : i32 (b, t) target one step ahead
    position : Sinusoid

    setting `len_cap` makes it more efficient for training.  you won't
    be able to feed it longer sequences, but it doesn't affect any
    model parameters.

    """
    end, dim = self.end, self.dim

    def count_not_all(flags):
        # number of columns in which not every row is flagged
        return tf.reduce_sum(tf.to_int32(~tf.reduce_all(flags, 0)))

    with tf.variable_scope('src'):
        src_ = placeholder(tf.int32, (None, None), src)
        len_src = count_not_all(tf.equal(src_, end))
        src = src_[:, :len_src]
    with tf.variable_scope('tgt'):
        tgt_ = placeholder(tf.int32, (None, None), tgt)
        len_tgt = count_not_all(tf.equal(tgt_, end))
        tgt = tgt_[:, :len_tgt]
        # same window shifted by one step
        gold = tgt_[:, 1:1 + len_tgt]
    position = Sinusoid(dim, len_cap)
    mask = tf.to_float(tf.expand_dims(tf.not_equal(src, end), 1))
    return Transformer(
        position=position,
        src_=src_,
        src=src,
        mask=mask,
        tgt_=tgt_,
        tgt=tgt,
        gold=gold,
        **self)
def train(self, warmup=4e3, beta1=0.9, beta2=0.98, epsilon=1e-9):
    """-> Transformer with new fields

    step : i64 () global update step
      lr : f32 () learning rate for the current step
      up :        update operation

    With `t = step + 1`, the learning rate is
    `dim**-0.5 * min(t**-0.5, t * warmup**-1.5)`.  `up` minimizes
    `self.loss` with Adam and increments the global step.

    """
    dim, loss = self.dim, self.loss
    with tf.variable_scope('lr'):
        step = tf.train.get_or_create_global_step()
        t = tf.to_float(step + 1)
        decay = t**-0.5
        ramp = t * (warmup**-1.5)
        lr = (dim**-0.5) * tf.minimum(decay, ramp)
    optimizer = tf.train.AdamOptimizer(lr, beta1, beta2, epsilon)
    up = optimizer.minimize(loss, step)
    return Transformer(step=step, lr=lr, up=up, **self)
def ae(data, btlnk_dim, data_dim, dense_dim, y_dim, loss_type):
    """Build an autoencoder graph; returns a dict of its tensors and ops.

    `data[0]` and `data[1]` are handed to `placeholder` for `x` and `y`.
    The encoder is a bias-free dense layer with relu followed by
    `normalize`; the decoder is a bias-free dense layer with a sigmoid.

    `loss_type == "xtrpy"` selects a cross-entropy reconstruction loss
    summed over axis 1; any other value selects the mean absolute error.
    The AUC metric scores each example by its mean squared reconstruction
    error against `y`.

    `dense_dim` and `y_dim` are accepted but not used.

    Returned keys: step, x, y, logits, auc, train_step, loss.  Note that
    `logits` holds the sigmoid output of the decoder.
    """
    with tf.variable_scope("x"):
        x = placeholder(tf.float32, [None, data_dim], data[0], "x")
    with tf.variable_scope("y"):
        y = placeholder(tf.float32, [None], data[1], "y")
    with tf.variable_scope("encoder"):
        enc_dense = tf.keras.layers.Dense(btlnk_dim, use_bias=False)
        z = normalize(tf.nn.relu(enc_dense(x)), "layer_norm_1")
    with tf.variable_scope("decoder"):
        dec_dense = tf.keras.layers.Dense(data_dim, use_bias=False)
        logits = tf.sigmoid(dec_dense(z))
    with tf.variable_scope("loss"):
        if loss_type != "xtrpy":
            loss = tf.reduce_mean(tf.abs(x - logits))
        else:
            # epsilon keeps the logarithms finite at 0 and 1
            epsilon = 1e-10
            on_term = x * tf.log(epsilon + logits)
            off_term = (1 - x) * tf.log(epsilon + 1 - logits)
            loss = tf.reduce_mean(-tf.reduce_sum(on_term + off_term, axis=1))
    step = tf.train.get_or_create_global_step()
    with tf.variable_scope("AUC"):
        anomaly_score = tf.reduce_mean((x - logits)**2, axis=1)
        _, auc = tf.metrics.auc(y, anomaly_score)
    with tf.variable_scope("train_step"):
        train_step = tf.train.AdamOptimizer().minimize(loss, step)
    return dict(
        step=step,
        x=x,
        y=y,
        logits=logits,
        auc=auc,
        train_step=train_step,
        loss=loss)
def sinusoid(time, dim, freq=1e-4, name='sinusoid', scale=True, array=False):
    """returns a rank-2 tensor of shape `time, dim`, where each row
    corresponds to a time step and each column a sinusoid.

    The `dim // 2` frequencies are `freq ** (2 * k / dim)` for
    `k = 0 .. dim // 2 - 1`.  `dim` must be even.

    With `scale` the result is multiplied by `dim ** -0.5`.  With `array`
    the result is a numpy array computed eagerly; otherwise it is built
    from tensorflow ops under the variable scope `name`.

    """
    assert not dim % 2
    if not array:
        with tf.variable_scope(name):
            rates = tf.reshape(
                freq**((2 / dim) * tf.range(dim // 2, dtype=tf.float32)),
                (-1, 1))
            steps = tf.reshape(
                tf.range(tf.cast(time, tf.float32), dtype=tf.float32),
                (1, -1))
            angles = rates @ steps
            waves = tf.reshape(
                tf.concat((tf.sin(angles), tf.cos(angles)), -1), (dim, time))
            if scale:
                waves *= dim**-0.5
            return tf.transpose(waves)
    rates = freq**((2 / dim) * np.arange(dim // 2))
    # (dim // 2, 1) @ (1, time) -> one row of angles per frequency
    angles = rates.reshape(-1, 1) @ np.arange(time).reshape(1, -1)
    waves = np.concatenate((np.sin(angles), np.cos(angles)), -1)
    waves = waves.reshape(dim, time)
    if scale:
        waves *= dim**-0.5
    return waves.T
def __call__(self, time, name=None):
    """Return the values for the first `time` steps.

    When `self.pos` is None they are computed by `sinusoid(time, self.dim)`;
    otherwise `self.pos` is sliced to its first `time` entries.  Either way
    the work happens under the variable scope `name or self.name`.
    """
    with tf.variable_scope(name or self.name):
        if self.pos is None:
            return sinusoid(time, self.dim)
        return self.pos[:time]
def autoreg(self, trainable=False, random=False, minimal=True):
    """-> Transformer with new fields, autoregressive

    len_tgt : i32 ()              steps to unfold aka t
     output : f32 (b, t, dim_tgt) prediction on logit scale
       prob : f32 (b, t, dim_tgt) prediction, soft
       pred : i32 (b, t)          prediction, hard
       loss : f32 ()              prediction loss
        acc : f32 ()              accuracy

    trainable : use `self.dropout` (otherwise `identity`) and enable
                back-propagation through the loop; cannot be combined
                with `random` or `minimal`
       random : sample the next symbol with `tf.multinomial` instead of
                taking the argmax
      minimal : additionally stop the loop as soon as every entry of the
                current step equals `end`

    must be called after `data`.

    """
    # trainable mode is only allowed with greedy decoding over all len_tgt steps
    assert not trainable or not random
    assert not trainable or not minimal
    end, dim_tgt, logit = self.end, self.dim_tgt, self.logit
    dropout = self.dropout if trainable else identity
    mask, position = self.mask, self.position
    src, emb_src, encode = self.src, self.emb_src, self.encode
    tgt, emb_tgt, decode = self.tgt, self.emb_tgt, self.decode
    with tf.variable_scope('emb_src_autoreg'):
        w = position(tf.shape(src)[1]) + dropout(emb_src.embed(src))
    with tf.variable_scope('encode_autoreg'):
        for enc in encode:
            w = enc(w, mask, dropout)
    with tf.variable_scope('decode_autoreg'):
        with tf.variable_scope('init'):
            len_tgt = tf.shape(tgt)[1]
            pos = position(len_tgt)
            i = tf.constant(0)
            # first target column is the initial input
            x = tgt[:, :1]
            # empty slices along axis 1: the caches start with zero steps
            v = w[:, :0]
            y = tf.reshape(v, (tf.shape(v)[0], 0, dim_tgt))
            p = x[:, 1:]

        def autoreg(i, x, vs, y, p):
            # i : ()              time step from 0 to t=len_tgt
            # x : (b, 1)          x_i
            # v : (b, t, dim)     attention values
            # y : (b, t, dim_tgt) logit over x one step ahead
            # p : (b, t)          predictions
            with tf.variable_scope('emb_tgt'):
                x = pos[i] + dropout(emb_tgt.embed(x))
            # updated per-layer caches, returned as the next `vs`
            us = []
            for dec, v in zip(decode, vs):
                with tf.variable_scope('cache_v'):
                    v = tf.concat((v, x), 1)
                    us.append(v)
                x = dec(x, v, w, mask, dropout)
            x = logit(x)
            with tf.variable_scope('cache_y'):
                y = tf.concat((y, x), 1)
            if random:
                with tf.variable_scope('sample'):
                    x = tf.multinomial(tf.squeeze(x, 1), 1, output_dtype=tf.int32)
            else:
                x = tf.argmax(x, -1, output_type=tf.int32, name='argmax')
            with tf.variable_scope('cache_p'):
                p = tf.concat((p, x), 1)
            return i + 1, x, tuple(us), y, p

        # arguments in order: loop condition, loop body, initial state,
        # shape invariants matching the state (every decoder layer starts
        # from the same empty cache `v`), then loop options
        _, _, _, y, p = tf.while_loop(
            lambda i, x, *_: (
                (i < len_tgt) & ~tf.reduce_all(tf.equal(x, end))) if minimal
            else (i < len_tgt),
            autoreg,
            (i, x, (v, ) * len(decode), y, p),
            (i.shape, x.shape, (v.shape, ) * len(decode),
             tf.TensorShape((None, None, dim_tgt)), p.shape),
            back_prop=trainable,
            swap_memory=True,
            name='autoreg')
    return Transformer(len_tgt=len_tgt, output=y, pred=p, **self)._eval()
def VAE(data, btlnk_dim, data_dim, dense_dim, y_dim, loss_type, accelerate):
    """Build a variational autoencoder graph; returns a dict of its tensors.

    `data[0]` and `data[1]` are handed to `placeholder` for `x` and `y`.
    The encoder yields a latent sample `z` together with `mu` and `lv`;
    the decoder is a `Linear` layer followed by a sigmoid.

    The loss is `loss_rec + kl_loss * tanh(accelerate * step)`, where
    `loss_rec` is a cross-entropy summed over axis 1 when
    `loss_type == "xtrpy"` and the mean absolute error otherwise.  The AUC
    metric scores each example by its mean squared reconstruction error
    against `y`.

    `dense_dim` and `y_dim` are accepted but not used.

    Returned keys: step, x, y, z, mu, logits, auc, train_step, loss,
    kl_loss, loss_rec.  Note that `logits` holds the sigmoid output of
    the decoder.
    """

    def encode(inputs, dim_btlnk, dim_x):
        # layers are constructed in the same order as in a nested call
        norm = Normalize(dim_btlnk, "nrm")
        lin = Linear(dim_btlnk, dim_x, name='lin')
        hidden = norm(tf.nn.elu(lin(inputs)))
        with tf.variable_scope('latent'):
            mean = Linear(dim_btlnk, dim_btlnk, name='mu')(hidden)
            log_var = Linear(dim_btlnk, dim_btlnk, name='lv')(hidden)
        with tf.variable_scope('z'):
            spread = tf.exp(0.5 * log_var)
            noise = tf.random_normal(shape=tf.shape(log_var))
            sample = mean + spread * noise
        return sample, mean, log_var

    def decode(latent, data_dim, btlnk_dim):
        return tf.nn.sigmoid(Linear(data_dim, btlnk_dim)(latent))

    with tf.variable_scope("x"):
        x = placeholder(tf.float32, [None, data_dim], data[0], "x")
    with tf.variable_scope("y"):
        y = placeholder(tf.float32, [None], data[1], "y")
    with tf.variable_scope("encoder"):
        z, mu, lv = encode(x, btlnk_dim, data_dim)
    with tf.variable_scope("decoder"):
        logits = decode(z, data_dim, btlnk_dim)
    with tf.variable_scope("step"):
        step = tf.train.get_or_create_global_step()
        rate = accelerate * tf.to_float(step)
        # weight of the KL term as a function of the global step
        rate_anneal = tf.tanh(rate)
    with tf.variable_scope("loss"):
        kl_loss = tf.reduce_mean(
            0.5 * (tf.square(mu) + tf.exp(lv) - lv - 1.0))
        if loss_type != "xtrpy":
            loss_rec = tf.reduce_mean(tf.abs(x - logits))
        else:
            # epsilon keeps the logarithms finite at 0 and 1
            epsilon = 1e-10
            on_term = x * tf.log(epsilon + logits)
            off_term = (1 - x) * tf.log(epsilon + 1 - logits)
            loss_rec = tf.reduce_mean(
                -tf.reduce_sum(on_term + off_term, axis=1))
        loss = loss_rec + kl_loss * rate_anneal
    with tf.variable_scope("AUC"):
        anomaly_score = tf.reduce_mean((x - logits)**2, axis=1)
        _, auc = tf.metrics.auc(y, anomaly_score)
    with tf.variable_scope("train_step"):
        train_step = tf.train.AdamOptimizer().minimize(loss, step)
    return dict(
        step=step,
        x=x,
        y=y,
        z=z,
        mu=mu,
        logits=logits,
        auc=auc,
        train_step=train_step,
        loss=loss,
        kl_loss=kl_loss,
        loss_rec=loss_rec)
def build(self, x, y, lr_max, mult):
    """Build the training graph and return `DAE(self, ...)` with its tensors.

    `x` and `y` are handed to `placeholder`.  `self.gen` is applied to `x`,
    and `self.dis` to both `x` and the generated `gx`.  Three mean absolute
    errors drive everything else:

    - x vs dis(x)
    - gen(x) vs dis(gen(x))
    - x vs gen(x)

    Both learning rates are derived from `lr_max` through `sigmoid(...,
    mult=mult)` of the balance between these errors, and the same errors
    weight the discriminator and generator losses.

    Fields passed to `DAE`: step, x, y, gx, dgx, dx, auc_dgx, auc_gx,
    auc_dx, g_loss, d_loss, d_step, g_step.
    """
    with tf.variable_scope("x"):
        x = placeholder(tf.float32, [None, self.dim_x], x, "x")
    with tf.variable_scope("y"):
        y = placeholder(tf.float32, [None], y, "y")
    gx = self.gen(x)
    dx = self.dis(x)
    dgx = self.dis(gx)
    with tf.variable_scope("loss"):
        err_real = tf.reduce_mean(tf.abs(x - dx))
        err_fake = tf.reduce_mean(tf.abs(gx - dgx))
        err_gen = tf.reduce_mean(tf.abs(x - gx))
        # balance used to split lr_max between the two optimizers
        d_vs_g = err_real - (err_fake + err_gen) / 2
        share_d = sigmoid(d_vs_g, mult=mult)
        lr_g = (tf.constant(1.0) - share_d) * lr_max
        lr_d = share_d * lr_max
        # trade-off between autoencoding real data and discriminating fakes
        sigma = 0.5
        # weight of the fake term in the discriminator loss, clipped to [0, 0.9]
        w_fake = tf.clip_by_value(
            sigmoid(err_fake * sigma - err_real, shift=0., mult=mult),
            0., 0.9)
        d_loss = err_real - err_fake * w_fake
        # generator weights: fake term vs. reconstruction term, summing to 1
        wg_fake = tf.clip_by_value(
            sigmoid(err_fake - err_gen, shift=0., mult=mult), 0., 1.0)
        wg_reconstruct = 1 - wg_fake
        g_loss = err_fake * wg_fake + err_gen * wg_reconstruct
    with tf.variable_scope("AUC"):
        _, auc_dgx = tf.metrics.auc(y, tf.reduce_mean((x - dgx)**2, axis=1))
        _, auc_dx = tf.metrics.auc(y, tf.reduce_mean((x - dx)**2, axis=1))
        _, auc_gx = tf.metrics.auc(y, tf.reduce_mean((x - gx)**2, axis=1))
    with scope('down'):
        g_vars = tf.get_collection(
            tf.GraphKeys.GLOBAL_VARIABLES, scope="generator")
        d_vars = tf.get_collection(
            tf.GraphKeys.GLOBAL_VARIABLES, scope="discriminator")
        step = tf.train.get_or_create_global_step()
        # NOTE(review): both update ops are given the same global step, so
        # running both in one iteration advances it twice -- confirm intended
        d_step = tf.train.AdamOptimizer(lr_d).minimize(
            d_loss, step, var_list=d_vars)
        g_step = tf.train.AdamOptimizer(lr_g).minimize(
            g_loss, step, var_list=g_vars)
    return DAE(
        self,
        step=step,
        x=x,
        y=y,
        gx=gx,
        dgx=dgx,
        dx=dx,
        auc_dgx=auc_dgx,
        auc_gx=auc_gx,
        auc_dx=auc_dx,
        g_loss=g_loss,
        d_loss=d_loss,
        d_step=d_step,
        g_step=g_step)
def build(self, x, y, z, loss_type):
    """Build the training graph and return `VAEGAN(self, ...)` with its tensors.

    `x`, `y` and `z` are handed to `placeholder`.  `self.enc` encodes `x`
    into a latent sample `zx` (plus `mu` and `lv`), `self.gen` reconstructs
    `gzx` from it, and `self.dis` scores both `x` and `gzx`.

    Losses:

    - d_loss: sigmoid cross entropy pushing `dis(x)` towards 1 and
      `dis(gzx)` towards 0
    - g_loss: weighted sum of the adversarial loss `gzx_loss`, the
      reconstruction loss `ftr_loss` and `kl_loss * tanh(accelerate * step)`;
      `loss_type == "xtrpy"` selects a cross-entropy reconstruction loss
      (weights 1/10 and 1/5), anything else the mean absolute error
      (weights 1/2 and 10)

    The generator update trains the global variables under the scopes
    "generator" and "encoder"; the discriminator update trains those under
    "discriminator".

    Fields passed to `VAEGAN`: step, x, y, z, zx, mu, lv, m, l, gzx,
    auc_gzx, auc_dx, auc_dgzx, g_step, d_step, g_loss, d_loss, gzx_loss,
    ftr_loss, kl_loss, dx_loss, dgzx_loss.
    """
    # label offsets for the discriminator and generator targets; zero
    # leaves the labels at exactly 1
    d_scale_factor = tf.constant(0.)
    g_scale_factor = tf.constant(0.)
    with scope("x"):
        x = placeholder(tf.float32, [None, self.dim_x], x, "x")
    with scope("y"):
        y = placeholder(tf.float32, [None], y, "y")
    with scope("z"):
        z = placeholder(tf.float32, [None, self.dim_noise], z, "z")
    # the last value returned by `enc` and the second value returned by
    # `dis` are not needed here
    zx, mu, lv, _ = self.enc(x)
    gzx = self.gen(zx)
    dx, _ = self.dis(x)
    dgzx, _ = self.dis(gzx)
    with tf.variable_scope("step"):
        step = tf.train.get_or_create_global_step()
        rate = self.accelerate * tf.to_float(step)
        # weight of the KL term as a function of the global step
        rate_anneal = tf.tanh(rate)
    with scope("loss"):
        dx_loss = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(
                labels=tf.ones_like(dx) - d_scale_factor, logits=dx))
        dgzx_loss = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(
                labels=tf.zeros_like(dgzx), logits=dgzx))
        d_loss = dx_loss + dgzx_loss
        kl_loss = tf.reduce_mean(
            0.5 * (tf.square(mu) + tf.exp(lv) - lv - 1.0))
        gzx_loss = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(
                labels=tf.ones_like(dgzx) - g_scale_factor, logits=dgzx))
        if loss_type == "xtrpy":
            # epsilon keeps the logarithms finite at 0 and 1
            epsilon = 1e-10
            ftr_loss = tf.reduce_mean(
                -tf.reduce_sum(x * tf.log(epsilon + gzx) +
                               (1 - x) * tf.log(epsilon + 1 - gzx), axis=1))
            g_loss = gzx_loss / 10 + ftr_loss / 5 + kl_loss * rate_anneal
        else:
            ftr_loss = tf.reduce_mean(tf.abs(x - gzx))
            g_loss = gzx_loss / 2 + ftr_loss * 10 + kl_loss * rate_anneal
    with scope("AUC"):
        _, auc_gzx = tf.metrics.auc(y, tf.reduce_mean((x - gzx)**2, axis=1))
        _, auc_dx = tf.metrics.auc(y, tf.nn.sigmoid(dx))
        _, auc_dgzx = tf.metrics.auc(y, tf.nn.sigmoid(dgzx))
    # Concatenate the two collections into one flat list of variables.
    # `list.append` would insert the whole encoder list as a single nested
    # element instead.
    g_vars = (
        tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="generator") +
        tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="encoder"))
    print(g_vars)
    d_vars = tf.get_collection(
        tf.GraphKeys.GLOBAL_VARIABLES, scope="discriminator")
    print(d_vars)
    with scope('train_step'):
        optimizer = tf.train.AdamOptimizer()
        # NOTE(review): both update ops are given the same global step, so
        # running both in one iteration advances it (and `rate_anneal`)
        # twice -- confirm intended
        d_step = optimizer.minimize(d_loss, step, var_list=d_vars)
        g_step = optimizer.minimize(g_loss, step, var_list=g_vars)
    return VAEGAN(
        self,
        step=step,
        x=x,
        y=y,
        z=z,
        zx=zx,
        mu=mu,
        lv=lv,
        m=tf.reduce_mean(mu),
        l=tf.reduce_mean(lv),
        gzx=gzx,
        auc_gzx=auc_gzx,
        auc_dx=auc_dx,
        auc_dgzx=auc_dgzx,
        g_step=g_step,
        d_step=d_step,
        g_loss=g_loss,
        d_loss=d_loss,
        gzx_loss=gzx_loss,
        ftr_loss=ftr_loss,
        kl_loss=kl_loss,
        dx_loss=dx_loss,
        dgzx_loss=dgzx_loss)