def __init__(self, bert_base, bow_vocab_size, latent_distrib='vmf', n_latent=256, max_sent_len=32,
             kappa=100.0, batch_size=16, kld=0.1, wd_freqs=None, ctx=mx.cpu(), prefix=None, params=None):
    super(BertBowVED, self).__init__(prefix=prefix, params=params)
    self.kld_wt = kld
    self.n_latent = n_latent
    self.model_ctx = ctx
    self.max_sent_len = max_sent_len
    self.batch_size = batch_size
    self.bow_vocab_size = bow_vocab_size
    self.latent_distrib = latent_distrib
    self.kappa = kappa
    with self.name_scope():
        self.encoder = bert_base
        if latent_distrib == 'logistic_gaussian':
            self.latent_dist = LogisticGaussianLatentDistribution(n_latent, ctx, dr=0.0)
        elif latent_distrib == 'vmf':
            self.latent_dist = HyperSphericalLatentDistribution(n_latent, kappa=kappa,
                                                                ctx=self.model_ctx, dr=0.0)
        elif latent_distrib == 'gaussian':
            self.latent_dist = GaussianLatentDistribution(n_latent, ctx, dr=0.0)
        elif latent_distrib == 'gaussian_unitvar':
            self.latent_dist = GaussianUnitVarLatentDistribution(n_latent, ctx, dr=0.0, var=0.05)
        else:
            raise Exception("Invalid distribution ==> {}".format(latent_distrib))
        self.decoder = gluon.nn.Dense(in_units=n_latent, units=self.bow_vocab_size, activation=None)
    self.latent_dist.initialize(mx.init.Xavier(), ctx=self.model_ctx)
    self.decoder.initialize(mx.init.Xavier(), ctx=self.model_ctx)
    if wd_freqs is not None:
        freq_nd = wd_freqs + 1
        total = freq_nd.sum()
        log_freq = freq_nd.log() - freq_nd.sum().log()
        bias_param = self.decoder.collect_params().get('bias')
        bias_param.set_data(log_freq)
        bias_param.grad_req = 'null'
        self.out_bias = bias_param.data()
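
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original model code).  The constructor
# above fixes the decoder bias to smoothed log corpus frequencies so that the
# output softmax starts near the empirical unigram distribution and the topic
# weights only have to model deviations from it.  The toy `wd_freqs` vector
# and layer sizes below are made up for demonstration.
def _example_log_frequency_bias():
    import mxnet as mx
    from mxnet import gluon

    wd_freqs = mx.nd.array([120.0, 43.0, 7.0, 1.0, 0.0])    # toy corpus counts
    decoder = gluon.nn.Dense(in_units=2, units=wd_freqs.size, activation=None)
    decoder.initialize(mx.init.Xavier())

    freq_nd = wd_freqs + 1                                   # add-one smoothing
    log_freq = freq_nd.log() - freq_nd.sum().log()           # log relative frequency
    bias_param = decoder.collect_params().get('bias')
    bias_param.set_data(log_freq)
    bias_param.grad_req = 'null'                             # freeze the bias during training
    return mx.nd.softmax(log_freq)                           # ~ empirical unigram distribution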
class BowNTM(HybridBlock):
    """
    Parameters
    ----------
    vocabulary : Vocab
        vocabulary object (with optional attached word embedding); its length gives the vocabulary size
    enc_dim : int
        number of dimensions of the encoder (first fully-connected layer)
    n_latent : int
        number of dimensions of the latent space (i.e. number of topics)
    batch_size : int (default None)
        provided only at training time (or when model is Hybridized) - otherwise will be inferred
    ctx : context device (default is mx.cpu())
    """

    def __init__(self, vocabulary, enc_dim, n_latent, embedding_size,
                 fixed_embedding=False, latent_distrib='logistic_gaussian',
                 init_l1=0.0, coherence_reg_penalty=0.0, redundancy_reg_penalty=0.0,
                 kappa=100.0, alpha=1.0, target_sparsity=0.0, batch_size=None,
                 n_encoding_layers=1, enc_dr=0.1, wd_freqs=None, seed_mat=None,
                 n_covars=0, ctx=mx.cpu()):
        super(BowNTM, self).__init__()
        self.batch_size = batch_size
        self._orig_batch_size = batch_size
        self.n_latent = n_latent
        self.model_ctx = ctx
        self.vocab_size = len(vocabulary)
        self.coherence_reg_penalty = coherence_reg_penalty
        self.redundancy_reg_penalty = redundancy_reg_penalty
        self.embedding_size = embedding_size
        self.target_sparsity = target_sparsity
        self.vocabulary = vocabulary
        self.num_enc_layers = n_encoding_layers
        if vocabulary.embedding:
            assert vocabulary.embedding.idx_to_vec[0].size == embedding_size
        self.encoding_dims = [self.embedding_size + n_covars] + [enc_dim for _ in range(n_encoding_layers)]
        with self.name_scope():
            self.l1_pen_const = self.params.get('l1_pen_const', shape=(1,),
                                                init=mx.init.Constant([init_l1]),
                                                differentiable=False)
            ## Add in topic seed constraints
            self.seed_matrix = seed_mat
            ## should be tanh here to avoid losing embedding information
            self.embedding = gluon.nn.Dense(in_units=self.vocab_size,
                                            units=self.embedding_size, activation='tanh')
            self.encoder = self._get_encoder(self.encoding_dims, dr=enc_dr)
            #self.encoder = gluon.nn.Dense(in_units=(self.embedding_size + n_covars),
            #                              units=enc_dim, activation='softrelu')  ## just single FC layer 'encoder'
            if latent_distrib == 'logistic_gaussian':
                self.latent_dist = LogisticGaussianLatentDistribution(n_latent, ctx, alpha=alpha)
            elif latent_distrib == 'vmf':
                self.latent_dist = HyperSphericalLatentDistribution(n_latent, kappa=kappa, ctx=self.model_ctx)
            elif latent_distrib == 'gaussian':
                self.latent_dist = GaussianLatentDistribution(n_latent, ctx)
            elif latent_distrib == 'gaussian_unitvar':
                self.latent_dist = GaussianUnitVarLatentDistribution(n_latent, ctx)
            else:
                raise Exception("Invalid distribution ==> {}".format(latent_distrib))
            self.decoder = gluon.nn.Dense(in_units=n_latent, units=self.vocab_size, activation=None)
            self.coherence_regularization = CoherenceRegularizer(self.coherence_reg_penalty,
                                                                 self.redundancy_reg_penalty)
        self.initialize(mx.init.Xavier(), ctx=self.model_ctx)
        if vocabulary.embedding:
            emb = vocabulary.embedding.idx_to_vec.transpose()
            emb_norm_val = mx.nd.norm(emb, keepdims=True, axis=0) + 1e-10
            emb_norm = emb / emb_norm_val
            self.embedding.weight.set_data(emb_norm)
            if fixed_embedding:
                self.embedding.collect_params().setattr('grad_req', 'null')
        ## Initialize and FIX decoder bias terms to corpus frequencies
        if wd_freqs is not None:
            freq_nd = wd_freqs + 1
            total = freq_nd.sum()
            log_freq = freq_nd.log() - freq_nd.sum().log()
            bias_param = self.decoder.collect_params().get('bias')
            bias_param.set_data(log_freq)
            bias_param.grad_req = 'null'
            self.out_bias = bias_param.data()

    def _get_encoder(self, dims, dr=0.1):
        encoder = gluon.nn.HybridSequential()
        for i in range(len(dims) - 1):
            encoder.add(gluon.nn.Dense(in_units=dims[i], units=dims[i + 1], activation='softrelu'))
            if dr > 0.0:
                encoder.add(gluon.nn.Dropout(dr))
        return encoder

    def get_top_k_terms(self, k):
        """
        Returns the top K terms for each topic based on sensitivity analysis. Terms whose
        probability increases the most for a unit increase in a given topic score/probability
        are those most associated with the topic.
        """
        z = mx.nd.ones(shape=(1, self.n_latent), ctx=self.model_ctx)
        jacobian = mx.nd.zeros(shape=(self.vocab_size, self.n_latent), ctx=self.model_ctx)
        z.attach_grad()
        for i in range(self.vocab_size):
            with mx.autograd.record():
                y = self.decoder(z)
                yi = y[0][i]
            yi.backward()
            jacobian[i] = z.grad
        sorted_j = jacobian.argsort(axis=0, is_ascend=False)
        return sorted_j

    def encode_data(self, data):
        """
        Encode data to the mean of the latent distribution defined by the input `data`
        """
        return self.latent_dist.mu_encoder(self.encoder(self.embedding(data)))

    def get_l1_penalty_term(self, F, l1_pen_const, batch_size):
        if F is mx.ndarray:
            dec_weights = self.decoder.params.get('weight').data()
        else:
            dec_weights = self.decoder.params.get('weight').var()
        return l1_pen_const * F.sum(F.abs(dec_weights))

    def add_coherence_reg_penalty(self, F, cur_loss):
        if self.coherence_reg_penalty > 0.0:
            if F is mx.ndarray:
                w = self.decoder.params.get('weight').data()
                emb = self.embedding.params.get('weight').data()
            else:
                w = self.decoder.params.get('weight').var()
                emb = self.embedding.params.get('weight').var()
            c, d = self.coherence_regularization(w, emb)
            return (cur_loss + c + d), c, d
        else:
            return (cur_loss, F.zeros_like(cur_loss), F.zeros_like(cur_loss))

    def add_seed_constraint_loss(self, F, cur_loss):
        # G - number of seeded topics
        # S - number of seeds per topic
        # K - number of topics
        if self.seed_matrix is not None:
            if F is mx.ndarray:
                w = self.decoder.params.get('weight').data()
            else:
                w = self.decoder.params.get('weight').var()
            ts = F.take(w, self.seed_matrix)       ## should have shape (G, S, K)
            ts_sums = F.sum(ts, axis=1)            # now (G, K)
            ts_probs = F.softmax(ts_sums, axis=1)
            entropies = -F.sum(ts_probs * F.log(ts_probs))  ## want to minimize the entropy here
            ## Ensure seed terms have higher weights
            seed_means = F.mean(ts, axis=1)        # (G, K)
            total_means = F.mean(w, axis=0)        # (K,)
            pref_loss = F.relu(total_means - seed_means)  # penalty if mean weight for topic is greater than seed means
            # minimize weighted entropy over the seed means
            seed_pr = F.softmax(seed_means)
            per_topic_entropy = -F.sum(seed_pr * F.log(seed_pr), axis=0)
            seed_means_pr = F.sum(seed_pr, axis=0)
            per_topic_entropy = F.sum(seed_means_pr * per_topic_entropy)
            entropies = F.add(entropies, F.sum(pref_loss))
            entropies = F.add(entropies, per_topic_entropy)
            return (F.broadcast_add(cur_loss, entropies), entropies)
        else:
            return (cur_loss, F.zeros_like(cur_loss))

    def run_encode(self, F, in_data, batch_size):
        enc_out = self.encoder(in_data)
        return self.latent_dist(enc_out, batch_size)

    def get_loss_terms(self, F, data, y, KL, l1_pen_const, batch_size):
        l1_pen = self.get_l1_penalty_term(F, l1_pen_const, batch_size)
        rr = data * F.log(y + 1e-12)
        recon_loss = -F.sparse.sum(rr, axis=1)
        i_loss = F.broadcast_plus(recon_loss, F.broadcast_plus(l1_pen, KL))
        ii_loss, coherence_loss, redundancy_loss = self.add_coherence_reg_penalty(F, i_loss)
        iii_loss, entropies = self.add_seed_constraint_loss(F, ii_loss)
        return iii_loss, recon_loss, l1_pen, entropies, coherence_loss, redundancy_loss

    def hybrid_forward(self, F, data, l1_pen_const=None):
        batch_size = data.shape[0] if F is mx.ndarray else self.batch_size
        emb_out = self.embedding(data)
        z, KL = self.run_encode(F, emb_out, batch_size)
        dec_out = self.decoder(z)
        y = F.softmax(dec_out, axis=1)
        iii_loss, recon_loss, l1_pen, entropies, coherence_loss, redundancy_loss = \
            self.get_loss_terms(F, data, y, KL, l1_pen_const, batch_size)
        return iii_loss, KL, recon_loss, l1_pen, entropies, coherence_loss, redundancy_loss, y
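
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original model code).  `get_top_k_terms`
# above ranks vocabulary terms for each topic by sensitivity: it builds the
# Jacobian d(decoder output_i)/d(z) one output unit at a time with autograd and
# then argsorts each column.  The toy decoder and sizes below are made up; for
# a purely linear decoder the Jacobian is just its weight matrix, which is an
# easy way to sanity-check the loop.
def _example_topic_term_jacobian(vocab_size=6, n_latent=3, k=2):
    import mxnet as mx
    from mxnet import gluon

    decoder = gluon.nn.Dense(in_units=n_latent, units=vocab_size, activation=None)
    decoder.initialize(mx.init.Xavier())

    z = mx.nd.ones(shape=(1, n_latent))
    jacobian = mx.nd.zeros(shape=(vocab_size, n_latent))
    z.attach_grad()
    for i in range(vocab_size):
        with mx.autograd.record():
            y = decoder(z)
            yi = y[0][i]
        yi.backward()
        jacobian[i] = z.grad                        # gradient of term i w.r.t. each topic
    sorted_terms = jacobian.argsort(axis=0, is_ascend=False)
    return sorted_terms[:k]                         # top-k term indices per topic, shape (k, n_latent)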
class PureTransformerVAE(Block):

    def __init__(self, vocabulary, emb_dim, latent_distrib='vmf', num_units=512, hidden_size=512,
                 num_heads=4, n_latent=256, max_sent_len=64, transformer_layers=6,
                 label_smoothing_epsilon=0.0, kappa=100.0, batch_size=16, kld=0.1, wd_temp=0.01,
                 ctx=mx.cpu(), prefix=None, params=None):
        super(PureTransformerVAE, self).__init__(prefix=prefix, params=params)
        self.kld_wt = kld
        self.n_latent = n_latent
        self.model_ctx = ctx
        self.max_sent_len = max_sent_len
        self.vocabulary = vocabulary
        self.batch_size = batch_size
        self.wd_embed_dim = emb_dim
        self.vocab_size = len(vocabulary.idx_to_token)
        self.latent_distrib = latent_distrib
        self.num_units = num_units
        self.hidden_size = hidden_size
        self.num_heads = num_heads
        self.transformer_layers = transformer_layers
        self.label_smoothing_epsilon = label_smoothing_epsilon
        self.kappa = kappa
        with self.name_scope():
            if latent_distrib == 'logistic_gaussian':
                self.latent_dist = LogisticGaussianLatentDistribution(n_latent, ctx, dr=0.0)
            elif latent_distrib == 'vmf':
                self.latent_dist = HyperSphericalLatentDistribution(n_latent, kappa=kappa,
                                                                    ctx=self.model_ctx, dr=0.0)
            elif latent_distrib == 'gaussian':
                self.latent_dist = GaussianLatentDistribution(n_latent, ctx, dr=0.0)
            elif latent_distrib == 'gaussian_unitvar':
                self.latent_dist = GaussianUnitVarLatentDistribution(n_latent, ctx, dr=0.0, var=0.05)
            else:
                raise Exception("Invalid distribution ==> {}".format(latent_distrib))
            self.embedding = nn.Embedding(self.vocab_size, self.wd_embed_dim)
            self.encoder = TransformerEncoder(self.wd_embed_dim, self.num_units, hidden_size=hidden_size,
                                              num_heads=num_heads, n_layers=transformer_layers,
                                              n_latent=n_latent, sent_size=max_sent_len,
                                              batch_size=batch_size, ctx=ctx)
            self.decoder = TransformerDecoder(wd_embed_dim=self.wd_embed_dim, num_units=self.num_units,
                                              hidden_size=hidden_size, num_heads=num_heads,
                                              n_layers=transformer_layers, n_latent=n_latent,
                                              sent_size=max_sent_len, batch_size=batch_size, ctx=ctx)
            #self.out_embedding = gluon.nn.Embedding(input_dim=self.vocab_size, output_dim=self.wd_embed_dim)
            self.inv_embed = InverseEmbed(batch_size, max_sent_len, self.wd_embed_dim, temp=wd_temp,
                                          ctx=self.model_ctx, params=self.embedding.params)
            self.ce_loss_fn = mx.gluon.loss.SoftmaxCrossEntropyLoss(axis=-1, from_logits=True,
                                                                    sparse_label=True)
            #self.label_smoothing = LabelSmoothing(epsilon=label_smoothing_epsilon, units=self.vocab_size)
        self.embedding.initialize(mx.init.Xavier(magnitude=2.34), ctx=ctx)
        #self.out_embedding.initialize(mx.init.Uniform(0.1), ctx=ctx)
        #self.inv_embed.initialize(mx.init.Xavier(magnitude=2.34), ctx=ctx)
        if self.vocabulary.embedding:
            #self.out_embedding.weight.set_data(self.vocabulary.embedding.idx_to_vec)
            self.embedding.weight.set_data(self.vocabulary.embedding.idx_to_vec)

    def __call__(self, wp_toks):
        return super(PureTransformerVAE, self).__call__(wp_toks)

    def set_kl_weight(self, epoch, max_epochs):
        burn_in = int(max_epochs / 10)
        eps = 1e-6
        if epoch > burn_in:
            self.kld_wt = ((epoch - burn_in) / (max_epochs - burn_in)) + eps
        else:
            self.kld_wt = eps
        return self.kld_wt

    def encode(self, toks):
        embedded = self.embedding(toks)
        enc = self.encoder(embedded)
        return self.latent_dist.mu_encoder(enc)

    def forward(self, toks):
        embedded = self.embedding(toks)
        enc = self.encoder(embedded)
        z, KL = self.latent_dist(enc, self.batch_size)
        y = self.decoder(z)
        prob_logits = self.inv_embed(y)
        log_prob = mx.nd.log_softmax(prob_logits)
        recon_loss = self.ce_loss_fn(log_prob, toks)
        kl_loss = (KL * self.kld_wt)
        loss = recon_loss + kl_loss
        return loss, recon_loss, kl_loss, log_prob
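
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original model code).  `set_kl_weight`
# above implements a simple KL annealing schedule: the KL weight stays at a
# tiny epsilon for the first 10% of epochs (the burn-in) and then grows
# linearly toward 1.0 by the final epoch.  The driver loop below is
# hypothetical and only shows the shape of the schedule.
def _example_kl_annealing_schedule(max_epochs=20):
    burn_in = int(max_epochs / 10)
    eps = 1e-6
    schedule = []
    for epoch in range(max_epochs):
        if epoch > burn_in:
            kld_wt = ((epoch - burn_in) / (max_epochs - burn_in)) + eps
        else:
            kld_wt = eps
        schedule.append(kld_wt)
    return schedule    # e.g. [1e-6, 1e-6, 1e-6, ~0.056, ~0.111, ..., ~0.944] for max_epochs=20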
def __init__(self, bert_base, latent_distrib='vmf', wd_embed_dim=300, num_units=512, n_latent=256,
             max_sent_len=64, transformer_layers=6, kappa=100.0, batch_size=16, kld=0.1, wd_temp=0.01,
             ctx=mx.cpu(), increasing=True, decreasing=False, prefix=None, params=None):
    super(BertTransVAE, self).__init__(prefix=prefix, params=params)
    self.kld_wt = kld
    self.bert = bert_base
    self.n_latent = n_latent
    self.model_ctx = ctx
    self.max_sent_len = max_sent_len
    self.batch_size = batch_size
    self.wd_embed_dim = wd_embed_dim
    self.latent_distrib = latent_distrib
    with self.name_scope():
        if latent_distrib == 'logistic_gaussian':
            self.latent_dist = LogisticGaussianLatentDistribution(n_latent, ctx, dr=0.0)
        elif latent_distrib == 'vmf':
            self.latent_dist = HyperSphericalLatentDistribution(n_latent, kappa=kappa, dr=0.0,
                                                                ctx=self.model_ctx)
        elif latent_distrib == 'gaussian':
            self.latent_dist = GaussianLatentDistribution(n_latent, ctx, dr=0.0)
        elif latent_distrib == 'gaussian_unitvar':
            self.latent_dist = GaussianUnitVarLatentDistribution(n_latent, ctx, dr=0.0)
        else:
            raise Exception("Invalid distribution ==> {}".format(latent_distrib))
        self.decoder = TransformerDecoder(wd_embed_dim=wd_embed_dim, num_units=num_units,
                                          n_layers=transformer_layers, n_latent=n_latent,
                                          sent_size=max_sent_len, batch_size=batch_size, ctx=ctx)
        self.vocab_size = self.bert.word_embed[0].params.get('weight').shape[0]
        self.out_embedding = gluon.nn.Embedding(input_dim=self.vocab_size, output_dim=wd_embed_dim,
                                                weight_initializer=mx.init.Uniform(0.1))
        self.inv_embed = InverseEmbed(batch_size, max_sent_len, self.wd_embed_dim, temp=wd_temp,
                                      ctx=self.model_ctx, params=self.out_embedding.params)
        self.ce_loss_fn = mx.gluon.loss.SoftmaxCrossEntropyLoss(axis=-1, from_logits=True)
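
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original model code).  `InverseEmbed`
# above is constructed with params=self.out_embedding.params, i.e. it shares
# (ties) the output embedding weights, but its internals are not shown in this
# file.  A common way to realize such an "inverse embedding" is to score each
# decoded position against every word vector with a temperature-scaled dot
# product, as sketched below.  The function name, arguments, and temperature
# handling here are assumptions for illustration only.
def _example_tied_inverse_embedding(decoded, embedding_weight, temp=0.01):
    """
    decoded          : NDArray of shape (batch, sent_len, emb_dim) - decoder output
    embedding_weight : NDArray of shape (vocab_size, emb_dim)      - tied embedding matrix
    returns logits of shape (batch, sent_len, vocab_size)
    """
    import mxnet as mx
    # dot each position's vector with every word embedding; a small `temp`
    # sharpens the softmax taken over the resulting logits
    logits = mx.nd.dot(decoded, embedding_weight, transpose_b=True) / temp
    return logits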
class BowNTM(HybridBlock):
    """
    Parameters
    ----------
    vocabulary : Vocab
        vocabulary object (with optional attached word embedding); its length gives the vocabulary size
    enc_dim : int
        number of dimensions of the encoder (first fully-connected layer)
    n_latent : int
        number of dimensions of the latent space (i.e. number of topics)
    batch_size : int (default None)
        provided only at training time (or when model is Hybridized) - otherwise will be inferred
    ctx : context device (default is mx.cpu())
    """

    def __init__(self, vocabulary, enc_dim, n_latent, embedding_size, fixed_embedding=False,
                 latent_distrib='logistic_gaussian', init_l1=0.0, coherence_reg_penalty=0.0,
                 kappa=100.0, target_sparsity=0.0, batch_size=None, wd_freqs=None, seed_mat=None,
                 n_covars=0, ctx=mx.cpu()):
        super(BowNTM, self).__init__()
        self.batch_size = batch_size
        self._orig_batch_size = batch_size
        self.n_latent = n_latent
        self.model_ctx = ctx
        self.vocab_size = len(vocabulary)
        self.coherence_reg_penalty = coherence_reg_penalty
        self.embedding_size = embedding_size
        self.target_sparsity = target_sparsity
        self.vocabulary = vocabulary
        if vocabulary.embedding:
            assert vocabulary.embedding.idx_to_vec[0].size == embedding_size
        with self.name_scope():
            self.l1_pen_const = self.params.get('l1_pen_const', shape=(1,),
                                                init=mx.init.Constant([init_l1]),
                                                differentiable=False)
            ## Add in topic seed constraints
            self.seed_matrix = seed_mat
            self.embedding = gluon.nn.Dense(in_units=self.vocab_size,
                                            units=self.embedding_size, activation='tanh')
            self.encoder = gluon.nn.Dense(in_units=(self.embedding_size + n_covars),
                                          units=enc_dim, activation='softrelu')  ## just single FC layer 'encoder'
            if latent_distrib == 'logistic_gaussian':
                self.latent_dist = LogisticGaussianLatentDistribution(n_latent, ctx)
            elif latent_distrib == 'vmf':
                self.latent_dist = HyperSphericalLatentDistribution(n_latent, kappa=kappa, ctx=self.model_ctx)
            elif latent_distrib == 'gaussian':
                self.latent_dist = GaussianLatentDistribution(n_latent, ctx)
            elif latent_distrib == 'gaussian_unitvar':
                self.latent_dist = GaussianUnitVarLatentDistribution(n_latent, ctx)
            else:
                raise Exception("Invalid distribution ==> {}".format(latent_distrib))
            self.decoder = gluon.nn.Dense(in_units=n_latent, units=self.vocab_size, activation=None)
            self.coherence_regularization = CoherenceRegularizer(coherence_reg_penalty)
        self.initialize(mx.init.Xavier(), ctx=self.model_ctx)
        if vocabulary.embedding:
            emb = vocabulary.embedding.idx_to_vec.transpose()
            emb_norm_val = mx.nd.norm(emb, keepdims=True, axis=0) + 1e-10
            emb_norm = emb / emb_norm_val
            self.embedding.weight.set_data(emb_norm)
            if fixed_embedding:
                self.embedding.collect_params().setattr('grad_req', 'null')
        ## Initialize and FIX decoder bias terms to corpus frequencies
        if wd_freqs is not None:
            freq_nd = wd_freqs + 1
            total = freq_nd.sum()
            log_freq = freq_nd.log() - freq_nd.sum().log()
            bias_param = self.decoder.collect_params().get('bias')
            bias_param.set_data(log_freq)
            bias_param.grad_req = 'null'
            self.out_bias = bias_param.data()

    def encode_data(self, data):
        """
        Encode data to the mean of the latent distribution defined by the input `data`
        """
        return self.latent_dist.mu_encoder(self.encoder(self.embedding(data)))

    def get_l1_penalty_term(self, F, l1_pen_const, batch_size):
        if F is mx.ndarray:
            dec_weights = self.decoder.params.get('weight').data()
        else:
            dec_weights = self.decoder.params.get('weight').var()
        return l1_pen_const * F.sum(F.abs(dec_weights))

    def add_coherence_reg_penalty(self, F, cur_loss):
        if self.coherence_reg_penalty > 0.0:
            if F is mx.ndarray:
                w = self.decoder.params.get('weight').data()
                emb = self.embedding.params.get('weight').data()
            else:
                w = self.decoder.params.get('weight').var()
                emb = self.embedding.params.get('weight').var()
            c = (self.coherence_regularization(w, emb) * self.coherence_reg_penalty)
            return (cur_loss + c), c
        else:
            return (cur_loss, F.zeros_like(cur_loss))

    def add_seed_constraint_loss(self, F, cur_loss):
        # G - number of seeded topics
        # S - number of seeds per topic
        # K - number of topics
        if self.seed_matrix is not None:
            if F is mx.ndarray:
                w = self.decoder.params.get('weight').data()
            else:
                w = self.decoder.params.get('weight').var()
            ts = F.take(w, self.seed_matrix)       ## should have shape (G, S, K)
            ts_sums = F.sum(ts, axis=1)            # now (G, K)
            ts_probs = F.softmax(ts_sums, axis=1)
            entropies = -F.sum(ts_probs * F.log(ts_probs))  ## want to minimize the entropy here
            ## Ensure seed terms have higher weights
            seed_means = F.mean(ts, axis=1)        # (G, K)
            total_means = F.mean(w, axis=0)        # (K,)
            pref_loss = F.relu(total_means - seed_means)  # penalty if mean weight for topic is greater than seed means
            # minimize weighted entropy over the seed means
            seed_pr = F.softmax(seed_means)
            per_topic_entropy = -F.sum(seed_pr * F.log(seed_pr), axis=0)
            seed_means_pr = F.sum(seed_pr, axis=0)
            per_topic_entropy = F.sum(seed_means_pr * per_topic_entropy)
            entropies = F.add(entropies, F.sum(pref_loss))
            entropies = F.add(entropies, per_topic_entropy)
            return (F.broadcast_add(cur_loss, entropies), entropies)
        else:
            return (cur_loss, F.zeros_like(cur_loss))

    def general_entropy_min_loss(self, F, cur_loss):
        if F is mx.ndarray:
            w = self.decoder.params.get('weight').data()
        else:
            w = self.decoder.params.get('weight').var()
        w_term_probs = F.softmax(w, axis=1) ** 4.0
        entropies = -F.sum(w_term_probs * F.log(w_term_probs))
        return (F.broadcast_add(cur_loss, entropies), entropies)

    def run_encode(self, F, in_data, batch_size):
        enc_out = self.encoder(in_data)
        return self.latent_dist(enc_out, batch_size)

    def get_loss_terms(self, F, data, y, KL, l1_pen_const, batch_size):
        l1_pen = self.get_l1_penalty_term(F, l1_pen_const, batch_size)
        recon_loss = -F.sparse.sum(data * F.log(y + 1e-12), axis=0, exclude=True)
        i_loss = F.broadcast_plus(recon_loss, F.broadcast_plus(l1_pen, KL))
        ii_loss, coherence_loss = self.add_coherence_reg_penalty(F, i_loss)
        iii_loss, entropies = self.add_seed_constraint_loss(F, ii_loss)
        #iv_loss, entropies = self.general_entropy_min_loss(F, iii_loss)
        return iii_loss, recon_loss, l1_pen, entropies, coherence_loss

    def hybrid_forward(self, F, data, l1_pen_const=None):
        batch_size = data.shape[0] if F is mx.ndarray else self.batch_size
        emb_out = self.embedding(data)
        z, KL = self.run_encode(F, emb_out, batch_size)
        dec_out = self.decoder(z)
        y = F.softmax(dec_out, axis=1)
        iii_loss, recon_loss, l1_pen, entropies, coherence_loss = self.get_loss_terms(
            F, data, y, KL, l1_pen_const, batch_size)
        return iii_loss, KL, recon_loss, l1_pen, entropies, coherence_loss, y
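
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original model code).  The seed
# constraint above gathers the decoder weight rows for each group of seed
# terms (seed_matrix holds term indices with shape (G, S)), sums the seed
# weights per group, and minimizes the entropy of the resulting per-group
# topic distribution so that each seeded group concentrates on one topic.
# The toy shapes and values below are made up.
def _example_seed_entropy_term():
    import mxnet as mx

    vocab_size, n_topics = 8, 4
    w = mx.nd.random.uniform(shape=(vocab_size, n_topics))       # decoder weight (terms x topics)
    seed_matrix = mx.nd.array([[0, 1, 2],                        # group 0: seed term ids
                               [5, 6, 7]])                       # group 1: seed term ids
    ts = mx.nd.take(w, seed_matrix)                              # (G, S, K): seed-term topic weights
    ts_sums = mx.nd.sum(ts, axis=1)                              # (G, K): aggregate weight per topic
    ts_probs = mx.nd.softmax(ts_sums, axis=1)                    # per-group topic distribution
    entropy = -mx.nd.sum(ts_probs * mx.nd.log(ts_probs))         # term to be minimized during training
    return entropy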
def __init__(self, bow_vocab_size, vocabulary, emb_dim, latent_distrib='vmf', num_units=512,
             hidden_size=512, num_heads=4, n_latent=256, max_sent_len=32, transformer_layers=2,
             kappa=100.0, batch_size=16, kld=0.1, wd_freqs=None, ctx=mx.cpu(), prefix=None, params=None):
    super(TransformerBowVEDTest, self).__init__(prefix=prefix, params=params)
    self.kld_wt = kld
    self.n_latent = n_latent
    self.model_ctx = ctx
    self.max_sent_len = max_sent_len
    self.vocabulary = vocabulary
    self.batch_size = batch_size
    self.wd_embed_dim = emb_dim
    self.vocab_size = len(vocabulary.idx_to_token)
    self.bow_vocab_size = bow_vocab_size
    self.latent_distrib = latent_distrib
    self.num_units = num_units
    self.hidden_size = hidden_size
    self.num_heads = num_heads
    self.transformer_layers = transformer_layers
    self.kappa = kappa
    with self.name_scope():
        if latent_distrib == 'logistic_gaussian':
            self.latent_dist = LogisticGaussianLatentDistribution(n_latent, ctx, dr=0.0)
        elif latent_distrib == 'vmf':
            self.latent_dist = HyperSphericalLatentDistribution(n_latent, kappa=kappa,
                                                                ctx=self.model_ctx, dr=0.0)
        elif latent_distrib == 'gaussian':
            self.latent_dist = GaussianLatentDistribution(n_latent, ctx, dr=0.0)
        elif latent_distrib == 'gaussian_unitvar':
            self.latent_dist = GaussianUnitVarLatentDistribution(n_latent, ctx, dr=0.0, var=0.05)
        else:
            raise Exception("Invalid distribution ==> {}".format(latent_distrib))
        self.embedding = nn.Dense(in_units=self.bow_vocab_size, units=self.wd_embed_dim, activation='tanh')
        self.encoder = nn.Dense(in_units=self.wd_embed_dim, units=200, activation='softrelu')
        #self.encoder = TransformerEncoder(self.wd_embed_dim, self.num_units, hidden_size=hidden_size,
        #                                  num_heads=num_heads, n_layers=transformer_layers, n_latent=n_latent,
        #                                  sent_size=max_sent_len, batch_size=batch_size, ctx=ctx)
        self.decoder = gluon.nn.Dense(in_units=n_latent, units=self.bow_vocab_size, activation=None)
    self.initialize(mx.init.Xavier(), ctx=self.model_ctx)
    if self.vocabulary.embedding is not None:
        # the Dense weight has shape (units, in_units), so the (vocab_size, emb_dim)
        # embedding table must be transposed before it is copied in
        emb = vocabulary.embedding.idx_to_vec.transpose()
        emb_norm_val = mx.nd.norm(emb, keepdims=True, axis=0) + 1e-10
        emb_norm = emb / emb_norm_val
        self.embedding.weight.set_data(emb_norm)
    if wd_freqs is not None:
        freq_nd = wd_freqs + 1
        total = freq_nd.sum()
        log_freq = freq_nd.log() - freq_nd.sum().log()
        bias_param = self.decoder.collect_params().get('bias')
        bias_param.set_data(log_freq)
        bias_param.grad_req = 'null'
        self.out_bias = bias_param.data()
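
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original model code).  The constructor
# above copies pretrained word vectors into a Dense "embedding" layer after
# scaling each word vector to unit length.  Because a Dense weight has shape
# (units, in_units) = (emb_dim, vocab_size) while idx_to_vec has shape
# (vocab_size, emb_dim), the matrix is transposed first.  The sizes and the
# random stand-in vectors below are made up.
def _example_load_normalized_embeddings(vocab_size=5, emb_dim=3):
    import mxnet as mx
    from mxnet import gluon

    idx_to_vec = mx.nd.random.normal(shape=(vocab_size, emb_dim))       # stand-in for pretrained vectors
    embedding = gluon.nn.Dense(in_units=vocab_size, units=emb_dim, activation='tanh')
    embedding.initialize(mx.init.Xavier())

    emb = idx_to_vec.transpose()                                        # (emb_dim, vocab_size)
    emb_norm = emb / (mx.nd.norm(emb, keepdims=True, axis=0) + 1e-10)   # unit-length column per word
    embedding.weight.set_data(emb_norm)
    return mx.nd.norm(emb_norm, axis=0)                                 # all values ~ 1.0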
class TransformerBowVED(Block):

    def __init__(self, bow_vocab_size, vocabulary, emb_dim, latent_distrib='vmf', num_units=512,
                 hidden_size=512, num_heads=4, n_latent=256, max_sent_len=32, transformer_layers=2,
                 kappa=100.0, batch_size=16, kld=0.1, wd_freqs=None, ctx=mx.cpu(), prefix=None, params=None):
        super(TransformerBowVED, self).__init__(prefix=prefix, params=params)
        self.kld_wt = kld
        self.n_latent = n_latent
        self.model_ctx = ctx
        self.max_sent_len = max_sent_len
        self.vocabulary = vocabulary
        self.batch_size = batch_size
        self.wd_embed_dim = emb_dim
        self.vocab_size = len(vocabulary.idx_to_token)
        self.bow_vocab_size = bow_vocab_size
        self.latent_distrib = latent_distrib
        self.num_units = num_units
        self.hidden_size = hidden_size
        self.num_heads = num_heads
        self.transformer_layers = transformer_layers
        self.kappa = kappa
        with self.name_scope():
            if latent_distrib == 'logistic_gaussian':
                self.latent_dist = LogisticGaussianLatentDistribution(n_latent, ctx, dr=0.0)
            elif latent_distrib == 'vmf':
                self.latent_dist = HyperSphericalLatentDistribution(n_latent, kappa=kappa,
                                                                    ctx=self.model_ctx, dr=0.0)
            elif latent_distrib == 'gaussian':
                self.latent_dist = GaussianLatentDistribution(n_latent, ctx, dr=0.0)
            elif latent_distrib == 'gaussian_unitvar':
                self.latent_dist = GaussianUnitVarLatentDistribution(n_latent, ctx, dr=0.0, var=0.05)
            else:
                raise Exception("Invalid distribution ==> {}".format(latent_distrib))
            self.embedding = nn.Embedding(self.vocab_size, self.wd_embed_dim)
            self.encoder = TransformerEncoder(self.wd_embed_dim, self.num_units, hidden_size=hidden_size,
                                              num_heads=num_heads, n_layers=transformer_layers,
                                              n_latent=n_latent, sent_size=max_sent_len,
                                              batch_size=batch_size, ctx=ctx)
            self.decoder = gluon.nn.Dense(in_units=n_latent, units=self.bow_vocab_size, activation=None)
        self.initialize(mx.init.Xavier(), ctx=self.model_ctx)
        if self.vocabulary.embedding is not None:
            emb = vocabulary.embedding.idx_to_vec
            emb_norm_val = mx.nd.norm(emb, keepdims=True, axis=1) + 1e-10
            emb_norm = emb / emb_norm_val
            self.embedding.weight.set_data(emb_norm)
        if wd_freqs is not None:
            freq_nd = wd_freqs + 1
            total = freq_nd.sum()
            log_freq = freq_nd.log() - freq_nd.sum().log()
            bias_param = self.decoder.collect_params().get('bias')
            bias_param.set_data(log_freq)
            bias_param.grad_req = 'null'
            self.out_bias = bias_param.data()

    def get_top_k_terms(self, k):
        """
        Returns the top K terms for each topic based on sensitivity analysis. Terms whose
        probability increases the most for a unit increase in a given topic score/probability
        are those most associated with the topic. This is just the topic-term weight matrix for
        a linear decoder, but the code here will work with an arbitrary decoder.
        """
        z = mx.nd.ones(shape=(1, self.n_latent), ctx=self.model_ctx)
        jacobian = mx.nd.zeros(shape=(self.bow_vocab_size, self.n_latent), ctx=self.model_ctx)
        z.attach_grad()
        for i in range(self.bow_vocab_size):
            with mx.autograd.record():
                y = self.decoder(z)
                yi = y[0][i]
            yi.backward()
            jacobian[i] = z.grad
        sorted_j = jacobian.argsort(axis=0, is_ascend=False)
        return sorted_j

    def __call__(self, wp_toks, bow):
        return super(TransformerBowVED, self).__call__(wp_toks, bow)

    def set_kl_weight(self, epoch, max_epochs):
        burn_in = int(max_epochs / 10)
        eps = 1e-6
        if epoch > burn_in:
            self.kld_wt = ((epoch - burn_in) / (max_epochs - burn_in)) + eps
        else:
            self.kld_wt = eps
        return self.kld_wt

    def encode(self, toks):
        embedded = self.embedding(toks)
        enc = self.encoder(embedded)
        return self.latent_dist.mu_encoder(enc)

    def forward(self, toks, bow):
        embedded = self.embedding(toks)
        enc = self.encoder(embedded)
        z, KL = self.latent_dist(enc, self.batch_size)
        y = self.decoder(z)
        y = mx.nd.softmax(y, axis=1)
        rr = bow * mx.nd.log(y + 1e-12)
        recon_loss = -mx.nd.sparse.sum(rr, axis=1)
        KL_loss = (KL * self.kld_wt)
        loss = recon_loss + KL_loss
        return loss, recon_loss, KL_loss, y