Example #1
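
Note: the snippets below are excerpted from class definitions and assume the usual MXNet/Gluon imports together with helper classes (latent distributions, transformer encoder/decoder, regularizers) defined elsewhere in the same repository. A plausible import header for them would be:

import mxnet as mx
from mxnet import gluon
from mxnet.gluon import nn, Block, HybridBlock
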
 def __init__(self,
              bert_base,
              bow_vocab_size,
              latent_distrib='vmf',
              n_latent=256,
              max_sent_len=32,
              kappa=100.0,
              batch_size=16,
              kld=0.1,
              wd_freqs=None,
              ctx=mx.cpu(),
              prefix=None,
              params=None):
     super(BertBowVED, self).__init__(prefix=prefix, params=params)
     self.kld_wt = kld
     self.n_latent = n_latent
     self.model_ctx = ctx
     self.max_sent_len = max_sent_len
     self.batch_size = batch_size
     self.bow_vocab_size = bow_vocab_size
     self.latent_distrib = latent_distrib
     self.kappa = kappa
     with self.name_scope():
         self.encoder = bert_base
         if latent_distrib == 'logistic_gaussian':
             self.latent_dist = LogisticGaussianLatentDistribution(n_latent,
                                                                   ctx,
                                                                   dr=0.0)
         elif latent_distrib == 'vmf':
             self.latent_dist = HyperSphericalLatentDistribution(
                 n_latent, kappa=kappa, ctx=self.model_ctx, dr=0.0)
         elif latent_distrib == 'gaussian':
             self.latent_dist = GaussianLatentDistribution(n_latent,
                                                           ctx,
                                                           dr=0.0)
         elif latent_distrib == 'gaussian_unitvar':
             self.latent_dist = GaussianUnitVarLatentDistribution(n_latent,
                                                                  ctx,
                                                                  dr=0.0,
                                                                  var=0.05)
         else:
             raise Exception(
                 "Invalid distribution ==> {}".format(latent_distrib))
         self.decoder = gluon.nn.Dense(in_units=n_latent,
                                       units=self.bow_vocab_size,
                                       activation=None)
     self.latent_dist.initialize(mx.init.Xavier(), ctx=self.model_ctx)
     self.decoder.initialize(mx.init.Xavier(), ctx=self.model_ctx)
     if wd_freqs is not None:
         freq_nd = wd_freqs + 1
         total = freq_nd.sum()
         log_freq = freq_nd.log() - freq_nd.sum().log()
         bias_param = self.decoder.collect_params().get('bias')
         bias_param.set_data(log_freq)
         bias_param.grad_req = 'null'
         self.out_bias = bias_param.data()
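
The decoder bias initialization above fixes the output bias to the log relative corpus frequencies, giving the bag-of-words decoder a sensible starting point before any topics are learned. A minimal, self-contained sketch of that computation (the toy `bow_matrix` and its values are illustrative, not from the original code):

import mxnet as mx

bow_matrix = mx.nd.array([[2, 0, 1], [0, 3, 1]])  # toy corpus: 2 documents, 3 vocabulary terms
wd_freqs = bow_matrix.sum(axis=0)                 # per-term corpus counts, shape (vocab_size,)
freq_nd = wd_freqs + 1                            # add-one smoothing, as in the constructor above
log_freq = freq_nd.log() - freq_nd.sum().log()    # log relative frequency of each term
print(log_freq)                                   # these values become the frozen decoder bias
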
Example #2
 def __init__(self,
              vocabulary,
              enc_dim,
              n_latent,
              embedding_size,
              fixed_embedding=False,
              latent_distrib='logistic_gaussian',
              init_l1=0.0,
              coherence_reg_penalty=0.0,
              redundancy_reg_penalty=0.0,
              kappa=100.0,
              alpha=1.0,
              target_sparsity=0.0,
              batch_size=None,
              n_encoding_layers=1,
              enc_dr=0.1,
              wd_freqs=None,
              seed_mat=None,
              n_covars=0,
              ctx=mx.cpu()):
     super(BowNTM, self).__init__()
     self.batch_size = batch_size
     self._orig_batch_size = batch_size
     self.n_latent = n_latent
     self.model_ctx = ctx
     self.vocab_size = len(vocabulary)
     self.coherence_reg_penalty = coherence_reg_penalty
     self.redundancy_reg_penalty = redundancy_reg_penalty
     self.embedding_size = embedding_size
     self.target_sparsity = target_sparsity
     self.vocabulary = vocabulary
     self.num_enc_layers = n_encoding_layers
     if vocabulary.embedding:
         assert vocabulary.embedding.idx_to_vec[0].size == embedding_size
     self.encoding_dims = [self.embedding_size + n_covars
                           ] + [enc_dim for _ in range(n_encoding_layers)]
     with self.name_scope():
         self.l1_pen_const = self.params.get('l1_pen_const',
                                             shape=(1, ),
                                             init=mx.init.Constant(
                                                 [init_l1]),
                                             differentiable=False)
         ## Add in topic seed constraints
         self.seed_matrix = seed_mat
         ## should be tanh here to avoid losing embedding information
         self.embedding = gluon.nn.Dense(in_units=self.vocab_size,
                                         units=self.embedding_size,
                                         activation='tanh')
         self.encoder = self._get_encoder(self.encoding_dims, dr=enc_dr)
         #self.encoder = gluon.nn.Dense(in_units=(self.embedding_size + n_covars),
         #                              units = enc_dim, activation='softrelu') ## just single FC layer 'encoder'
         if latent_distrib == 'logistic_gaussian':
             self.latent_dist = LogisticGaussianLatentDistribution(
                 n_latent, ctx, alpha=alpha)
         elif latent_distrib == 'vmf':
             self.latent_dist = HyperSphericalLatentDistribution(
                 n_latent, kappa=kappa, ctx=self.model_ctx)
         elif latent_distrib == 'gaussian':
             self.latent_dist = GaussianLatentDistribution(n_latent, ctx)
         elif latent_distrib == 'gaussian_unitvar':
             self.latent_dist = GaussianUnitVarLatentDistribution(
                 n_latent, ctx)
         else:
             raise Exception(
                 "Invalid distribution ==> {}".format(latent_distrib))
         self.decoder = gluon.nn.Dense(in_units=n_latent,
                                       units=self.vocab_size,
                                       activation=None)
         self.coherence_regularization = CoherenceRegularizer(
             self.coherence_reg_penalty, self.redundancy_reg_penalty)
     self.initialize(mx.init.Xavier(), ctx=self.model_ctx)
     if vocabulary.embedding:
         emb = vocabulary.embedding.idx_to_vec.transpose()
         emb_norm_val = mx.nd.norm(emb, keepdims=True, axis=0) + 1e-10
         emb_norm = emb / emb_norm_val
         self.embedding.weight.set_data(emb_norm)
         if fixed_embedding:
             self.embedding.collect_params().setattr('grad_req', 'null')
     ## Initialize and FIX decoder bias terms to corpus frequencies
     if wd_freqs is not None:
         freq_nd = wd_freqs + 1
         total = freq_nd.sum()
         log_freq = freq_nd.log() - freq_nd.sum().log()
         bias_param = self.decoder.collect_params().get('bias')
         bias_param.set_data(log_freq)
         bias_param.grad_req = 'null'
         self.out_bias = bias_param.data()
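
When a pre-trained embedding is attached to the vocabulary, the constructor above copies it into the Dense "embedding" layer after transposing and length-normalizing it. Gluon's Dense stores its weight as (units, in_units), hence the transpose. A self-contained sketch with toy values:

import mxnet as mx
from mxnet import gluon

vocab_size, emb_dim = 10, 4
idx_to_vec = mx.nd.random.normal(shape=(vocab_size, emb_dim))       # stands in for vocabulary.embedding.idx_to_vec
embedding = gluon.nn.Dense(in_units=vocab_size, units=emb_dim, activation='tanh')
embedding.initialize(mx.init.Xavier())
emb = idx_to_vec.transpose()                                        # (emb_dim, vocab_size) to match the Dense weight layout
emb_norm = emb / (mx.nd.norm(emb, keepdims=True, axis=0) + 1e-10)   # unit-norm column per vocabulary term
embedding.weight.set_data(emb_norm)
print(mx.nd.norm(embedding.weight.data(), axis=0)[:3])              # ~1.0 for each term column
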
Example #3
class BowNTM(HybridBlock):
    """
    Parameters
    ----------
    vocabulary : int size of the vocabulary
    enc_dim : int number of dimension of input encoder (first FC layer)
    n_latent : int number of dimensions of the latent dimension (i.e. number of topics)
    gen_layers : int (default = 3) number of generator layers (after sample); size is the same as n_latent
    batch_size : int (default None) provided only at training time (or when model is Hybridized) - otherwise will be inferred
    ctx : context device (default is mx.cpu())
    """
    def __init__(self,
                 vocabulary,
                 enc_dim,
                 n_latent,
                 embedding_size,
                 fixed_embedding=False,
                 latent_distrib='logistic_gaussian',
                 init_l1=0.0,
                 coherence_reg_penalty=0.0,
                 redundancy_reg_penalty=0.0,
                 kappa=100.0,
                 alpha=1.0,
                 target_sparsity=0.0,
                 batch_size=None,
                 n_encoding_layers=1,
                 enc_dr=0.1,
                 wd_freqs=None,
                 seed_mat=None,
                 n_covars=0,
                 ctx=mx.cpu()):
        super(BowNTM, self).__init__()
        self.batch_size = batch_size
        self._orig_batch_size = batch_size
        self.n_latent = n_latent
        self.model_ctx = ctx
        self.vocab_size = len(vocabulary)
        self.coherence_reg_penalty = coherence_reg_penalty
        self.redundancy_reg_penalty = redundancy_reg_penalty
        self.embedding_size = embedding_size
        self.target_sparsity = target_sparsity
        self.vocabulary = vocabulary
        self.num_enc_layers = n_encoding_layers
        if vocabulary.embedding:
            assert vocabulary.embedding.idx_to_vec[0].size == embedding_size
        self.encoding_dims = [self.embedding_size + n_covars
                              ] + [enc_dim for _ in range(n_encoding_layers)]
        with self.name_scope():
            self.l1_pen_const = self.params.get('l1_pen_const',
                                                shape=(1, ),
                                                init=mx.init.Constant(
                                                    [init_l1]),
                                                differentiable=False)
            ## Add in topic seed constraints
            self.seed_matrix = seed_mat
            ## should be tanh here to avoid losing embedding information
            self.embedding = gluon.nn.Dense(in_units=self.vocab_size,
                                            units=self.embedding_size,
                                            activation='tanh')
            self.encoder = self._get_encoder(self.encoding_dims, dr=enc_dr)
            #self.encoder = gluon.nn.Dense(in_units=(self.embedding_size + n_covars),
            #                              units = enc_dim, activation='softrelu') ## just single FC layer 'encoder'
            if latent_distrib == 'logistic_gaussian':
                self.latent_dist = LogisticGaussianLatentDistribution(
                    n_latent, ctx, alpha=alpha)
            elif latent_distrib == 'vmf':
                self.latent_dist = HyperSphericalLatentDistribution(
                    n_latent, kappa=kappa, ctx=self.model_ctx)
            elif latent_distrib == 'gaussian':
                self.latent_dist = GaussianLatentDistribution(n_latent, ctx)
            elif latent_distrib == 'gaussian_unitvar':
                self.latent_dist = GaussianUnitVarLatentDistribution(
                    n_latent, ctx)
            else:
                raise Exception(
                    "Invalid distribution ==> {}".format(latent_distrib))
            self.decoder = gluon.nn.Dense(in_units=n_latent,
                                          units=self.vocab_size,
                                          activation=None)
            self.coherence_regularization = CoherenceRegularizer(
                self.coherence_reg_penalty, self.redundancy_reg_penalty)
        self.initialize(mx.init.Xavier(), ctx=self.model_ctx)
        if vocabulary.embedding:
            emb = vocabulary.embedding.idx_to_vec.transpose()
            emb_norm_val = mx.nd.norm(emb, keepdims=True, axis=0) + 1e-10
            emb_norm = emb / emb_norm_val
            self.embedding.weight.set_data(emb_norm)
            if fixed_embedding:
                self.embedding.collect_params().setattr('grad_req', 'null')
        ## Initialize and FIX decoder bias terms to corpus frequencies
        if wd_freqs is not None:
            freq_nd = wd_freqs + 1
            total = freq_nd.sum()
            log_freq = freq_nd.log() - freq_nd.sum().log()
            bias_param = self.decoder.collect_params().get('bias')
            bias_param.set_data(log_freq)
            bias_param.grad_req = 'null'
            self.out_bias = bias_param.data()

    def _get_encoder(self, dims, dr=0.1):
        encoder = gluon.nn.HybridSequential()
        for i in range(len(dims) - 1):
            encoder.add(
                gluon.nn.Dense(in_units=dims[i],
                               units=dims[i + 1],
                               activation='softrelu'))
            if dr > 0.0:
                encoder.add(gluon.nn.Dropout(dr))
        return encoder

    def get_top_k_terms(self, k):
        """
        Returns the top K terms for each topic based on sensitivity analysis. Terms whose 
        probability increases the most for a unit increase in a given topic score/probability
        are those most associated with the topic.
        """
        z = mx.nd.ones(shape=(1, self.n_latent), ctx=self.model_ctx)
        jacobian = mx.nd.zeros(shape=(self.vocab_size, self.n_latent),
                               ctx=self.model_ctx)
        z.attach_grad()
        for i in range(self.vocab_size):
            with mx.autograd.record():
                y = self.decoder(z)
                yi = y[0][i]
            yi.backward()
            jacobian[i] = z.grad
        sorted_j = jacobian.argsort(axis=0, is_ascend=False)
        return sorted_j

    def encode_data(self, data):
        """
        Encode data to the mean of the latent distribution defined by the input `data`
        """
        return self.latent_dist.mu_encoder(self.encoder(self.embedding(data)))

    def get_l1_penalty_term(self, F, l1_pen_const, batch_size):
        if F is mx.ndarray:
            dec_weights = self.decoder.params.get('weight').data()
        else:
            dec_weights = self.decoder.params.get('weight').var()
        return l1_pen_const * F.sum(F.abs(dec_weights))

    def add_coherence_reg_penalty(self, F, cur_loss):
        if self.coherence_reg_penalty > 0.0:
            if F is mx.ndarray:
                w = self.decoder.params.get('weight').data()
                emb = self.embedding.params.get('weight').data()
            else:
                w = self.decoder.params.get('weight').var()
                emb = self.embedding.params.get('weight').var()
            c, d = self.coherence_regularization(w, emb)
            return (cur_loss + c + d), c, d
        else:
            return (cur_loss, F.zeros_like(cur_loss), F.zeros_like(cur_loss))

    def add_seed_constraint_loss(self, F, cur_loss):
        # G - number of seeded topics
        # S - number of seeds per topic
        # K - number of topics
        if self.seed_matrix is not None:
            if F is mx.ndarray:
                w = self.decoder.params.get('weight').data()
            else:
                w = self.decoder.params.get('weight').var()
            ts = F.take(w, self.seed_matrix)  ## should have shape (G, S, K)
            ts_sums = F.sum(ts, axis=1)  # now (G, K)
            ts_probs = F.softmax(ts_sums, axis=1)
            entropies = -F.sum(
                ts_probs *
                F.log(ts_probs))  ## want to minimize the entropy here
            ## Ensure seed terms have higher weights
            seed_means = F.mean(ts, axis=1)  # (G,K)
            total_means = F.mean(w, axis=0)  # (K,)
            pref_loss = F.relu(
                total_means - seed_means
            )  # penalty if mean weight for topic is greater than seed means
            # minimize weighted entropy over the seed means
            seed_pr = F.softmax(seed_means)
            per_topic_entropy = -F.sum(seed_pr * F.log(seed_pr), axis=0)
            seed_means_pr = F.sum(seed_pr, axis=0)
            per_topic_entropy = F.sum(seed_means_pr * per_topic_entropy)
            entropies = F.add(entropies, F.sum(pref_loss))
            entropies = F.add(entropies, per_topic_entropy)
            return (F.broadcast_add(cur_loss, entropies), entropies)
        else:
            return (cur_loss, F.zeros_like(cur_loss))

    def run_encode(self, F, in_data, batch_size):
        enc_out = self.encoder(in_data)
        return self.latent_dist(enc_out, batch_size)

    def get_loss_terms(self, F, data, y, KL, l1_pen_const, batch_size):
        l1_pen = self.get_l1_penalty_term(F, l1_pen_const, batch_size)
        rr = data * F.log(y + 1e-12)
        recon_loss = -F.sparse.sum(rr, axis=1)
        i_loss = F.broadcast_plus(recon_loss, F.broadcast_plus(l1_pen, KL))
        ii_loss, coherence_loss, redundancy_loss = self.add_coherence_reg_penalty(
            F, i_loss)
        iii_loss, entropies = self.add_seed_constraint_loss(F, ii_loss)
        return iii_loss, recon_loss, l1_pen, entropies, coherence_loss, redundancy_loss

    def hybrid_forward(self, F, data, l1_pen_const=None):
        batch_size = data.shape[0] if F is mx.ndarray else self.batch_size
        emb_out = self.embedding(data)
        z, KL = self.run_encode(F, emb_out, batch_size)
        dec_out = self.decoder(z)
        y = F.softmax(dec_out, axis=1)
        iii_loss, recon_loss, l1_pen, entropies, coherence_loss, redundancy_loss = \
            self.get_loss_terms(F, data, y, KL, l1_pen_const, batch_size)
        return iii_loss, KL, recon_loss, l1_pen, entropies, coherence_loss, redundancy_loss, y
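
get_top_k_terms estimates topic-term associations by back-propagating each output term's score to the topic vector. A self-contained sketch of the same loop with a stand-alone Dense decoder and toy sizes (for a linear decoder the Jacobian recovered this way is just the decoder weight matrix):

import mxnet as mx
from mxnet import gluon

n_latent, vocab_size = 4, 10
decoder = gluon.nn.Dense(in_units=n_latent, units=vocab_size)
decoder.initialize(mx.init.Xavier())

z = mx.nd.ones(shape=(1, n_latent))
z.attach_grad()
jacobian = mx.nd.zeros(shape=(vocab_size, n_latent))
for i in range(vocab_size):
    with mx.autograd.record():
        yi = decoder(z)[0][i]              # score of term i
    yi.backward()
    jacobian[i] = z.grad                   # sensitivity of term i to each topic
top_terms = jacobian.argsort(axis=0, is_ascend=False)
print(top_terms[:3])                       # top-3 term indices for each topic column
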
Example #4
 def __init__(self,
              vocabulary,
              emb_dim,
              latent_distrib='vmf',
              num_units=512,
              hidden_size=512,
              num_heads=4,
              n_latent=256,
              max_sent_len=64,
              transformer_layers=6,
              label_smoothing_epsilon=0.0,
              kappa=100.0,
              batch_size=16,
              kld=0.1,
              wd_temp=0.01,
              ctx=mx.cpu(),
              prefix=None,
              params=None):
     super(PureTransformerVAE, self).__init__(prefix=prefix, params=params)
     self.kld_wt = kld
     self.n_latent = n_latent
     self.model_ctx = ctx
     self.max_sent_len = max_sent_len
     self.vocabulary = vocabulary
     self.batch_size = batch_size
     self.wd_embed_dim = emb_dim
     self.vocab_size = len(vocabulary.idx_to_token)
     self.latent_distrib = latent_distrib
     self.num_units = num_units
     self.hidden_size = hidden_size
     self.num_heads = num_heads
     self.transformer_layers = transformer_layers
     self.label_smoothing_epsilon = label_smoothing_epsilon
     self.kappa = kappa
     with self.name_scope():
         if latent_distrib == 'logistic_gaussian':
             self.latent_dist = LogisticGaussianLatentDistribution(n_latent,
                                                                   ctx,
                                                                   dr=0.0)
         elif latent_distrib == 'vmf':
             self.latent_dist = HyperSphericalLatentDistribution(
                 n_latent, kappa=kappa, ctx=self.model_ctx, dr=0.0)
         elif latent_distrib == 'gaussian':
             self.latent_dist = GaussianLatentDistribution(n_latent,
                                                           ctx,
                                                           dr=0.0)
         elif latent_distrib == 'gaussian_unitvar':
             self.latent_dist = GaussianUnitVarLatentDistribution(n_latent,
                                                                  ctx,
                                                                  dr=0.0,
                                                                  var=0.05)
         else:
             raise Exception(
                 "Invalid distribution ==> {}".format(latent_distrib))
         self.embedding = nn.Embedding(self.vocab_size, self.wd_embed_dim)
         self.encoder = TransformerEncoder(self.wd_embed_dim,
                                           self.num_units,
                                           hidden_size=hidden_size,
                                           num_heads=num_heads,
                                           n_layers=transformer_layers,
                                           n_latent=n_latent,
                                           sent_size=max_sent_len,
                                           batch_size=batch_size,
                                           ctx=ctx)
         self.decoder = TransformerDecoder(wd_embed_dim=self.wd_embed_dim,
                                           num_units=self.num_units,
                                           hidden_size=hidden_size,
                                           num_heads=num_heads,
                                           n_layers=transformer_layers,
                                           n_latent=n_latent,
                                           sent_size=max_sent_len,
                                           batch_size=batch_size,
                                           ctx=ctx)
         #self.out_embedding = gluon.nn.Embedding(input_dim=self.vocab_size, output_dim=self.wd_embed_dim)
         self.inv_embed = InverseEmbed(batch_size,
                                       max_sent_len,
                                       self.wd_embed_dim,
                                       temp=wd_temp,
                                       ctx=self.model_ctx,
                                       params=self.embedding.params)
         self.ce_loss_fn = mx.gluon.loss.SoftmaxCrossEntropyLoss(
             axis=-1, from_logits=True, sparse_label=True)
         #self.label_smoothing = LabelSmoothing(epsilon=label_smoothing_epsilon, units=self.vocab_size)
     self.embedding.initialize(mx.init.Xavier(magnitude=2.34), ctx=ctx)
     #self.out_embedding.initialize(mx.init.Uniform(0.1), ctx=ctx)
     #self.inv_embed.initialize(mx.init.Xavier(magnitude=2.34), ctx=ctx)
     if self.vocabulary.embedding:
         #self.out_embedding.weight.set_data(self.vocabulary.embedding.idx_to_vec)
         self.embedding.weight.set_data(
             self.vocabulary.embedding.idx_to_vec)
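
The InverseEmbed block above shares its parameters with the input embedding. Its exact implementation lives in the source repository, but the weight-tying idea it suggests (score the decoder states against the shared embedding matrix with a temperature, then take a log-softmax for the from_logits loss) can be sketched as follows; the shapes and the dot-product scoring here are assumptions for illustration only:

import mxnet as mx

vocab_size, emb_dim, sent_len = 20, 8, 5
emb_weight = mx.nd.random.normal(shape=(vocab_size, emb_dim))            # stands in for the shared nn.Embedding weight
dec_out = mx.nd.random.normal(shape=(1, sent_len, emb_dim))              # toy decoder output states
wd_temp = 0.01
logits = mx.nd.dot(dec_out, mx.nd.transpose(emb_weight)) / wd_temp       # (1, sent_len, vocab_size); assumed scoring scheme
log_prob = mx.nd.log_softmax(logits)                                     # matches from_logits=True in the loss above
print(log_prob.shape)
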
Example #5
class PureTransformerVAE(Block):
    def __init__(self,
                 vocabulary,
                 emb_dim,
                 latent_distrib='vmf',
                 num_units=512,
                 hidden_size=512,
                 num_heads=4,
                 n_latent=256,
                 max_sent_len=64,
                 transformer_layers=6,
                 label_smoothing_epsilon=0.0,
                 kappa=100.0,
                 batch_size=16,
                 kld=0.1,
                 wd_temp=0.01,
                 ctx=mx.cpu(),
                 prefix=None,
                 params=None):
        super(PureTransformerVAE, self).__init__(prefix=prefix, params=params)
        self.kld_wt = kld
        self.n_latent = n_latent
        self.model_ctx = ctx
        self.max_sent_len = max_sent_len
        self.vocabulary = vocabulary
        self.batch_size = batch_size
        self.wd_embed_dim = emb_dim
        self.vocab_size = len(vocabulary.idx_to_token)
        self.latent_distrib = latent_distrib
        self.num_units = num_units
        self.hidden_size = hidden_size
        self.num_heads = num_heads
        self.transformer_layers = transformer_layers
        self.label_smoothing_epsilon = label_smoothing_epsilon
        self.kappa = kappa
        with self.name_scope():
            if latent_distrib == 'logistic_gaussian':
                self.latent_dist = LogisticGaussianLatentDistribution(n_latent,
                                                                      ctx,
                                                                      dr=0.0)
            elif latent_distrib == 'vmf':
                self.latent_dist = HyperSphericalLatentDistribution(
                    n_latent, kappa=kappa, ctx=self.model_ctx, dr=0.0)
            elif latent_distrib == 'gaussian':
                self.latent_dist = GaussianLatentDistribution(n_latent,
                                                              ctx,
                                                              dr=0.0)
            elif latent_distrib == 'gaussian_unitvar':
                self.latent_dist = GaussianUnitVarLatentDistribution(n_latent,
                                                                     ctx,
                                                                     dr=0.0,
                                                                     var=0.05)
            else:
                raise Exception(
                    "Invalid distribution ==> {}".format(latent_distrib))
            self.embedding = nn.Embedding(self.vocab_size, self.wd_embed_dim)
            self.encoder = TransformerEncoder(self.wd_embed_dim,
                                              self.num_units,
                                              hidden_size=hidden_size,
                                              num_heads=num_heads,
                                              n_layers=transformer_layers,
                                              n_latent=n_latent,
                                              sent_size=max_sent_len,
                                              batch_size=batch_size,
                                              ctx=ctx)
            self.decoder = TransformerDecoder(wd_embed_dim=self.wd_embed_dim,
                                              num_units=self.num_units,
                                              hidden_size=hidden_size,
                                              num_heads=num_heads,
                                              n_layers=transformer_layers,
                                              n_latent=n_latent,
                                              sent_size=max_sent_len,
                                              batch_size=batch_size,
                                              ctx=ctx)
            #self.out_embedding = gluon.nn.Embedding(input_dim=self.vocab_size, output_dim=self.wd_embed_dim)
            self.inv_embed = InverseEmbed(batch_size,
                                          max_sent_len,
                                          self.wd_embed_dim,
                                          temp=wd_temp,
                                          ctx=self.model_ctx,
                                          params=self.embedding.params)
            self.ce_loss_fn = mx.gluon.loss.SoftmaxCrossEntropyLoss(
                axis=-1, from_logits=True, sparse_label=True)
            #self.label_smoothing = LabelSmoothing(epsilon=label_smoothing_epsilon, units=self.vocab_size)
        self.embedding.initialize(mx.init.Xavier(magnitude=2.34), ctx=ctx)
        #self.out_embedding.initialize(mx.init.Uniform(0.1), ctx=ctx)
        #self.inv_embed.initialize(mx.init.Xavier(magnitude=2.34), ctx=ctx)
        if self.vocabulary.embedding:
            #self.out_embedding.weight.set_data(self.vocabulary.embedding.idx_to_vec)
            self.embedding.weight.set_data(
                self.vocabulary.embedding.idx_to_vec)

    def __call__(self, wp_toks):
        return super(PureTransformerVAE, self).__call__(wp_toks)

    def set_kl_weight(self, epoch, max_epochs):
        burn_in = int(max_epochs / 10)
        eps = 1e-6
        if epoch > burn_in:
            self.kld_wt = ((epoch - burn_in) / (max_epochs - burn_in)) + eps
        else:
            self.kld_wt = eps
        return self.kld_wt

    def encode(self, toks):
        embedded = self.embedding(toks)
        enc = self.encoder(embedded)
        return self.latent_dist.mu_encoder(enc)

    def forward(self, toks):
        embedded = self.embedding(toks)
        enc = self.encoder(embedded)
        z, KL = self.latent_dist(enc, self.batch_size)
        y = self.decoder(z)
        prob_logits = self.inv_embed(y)
        log_prob = mx.nd.log_softmax(prob_logits)
        recon_loss = self.ce_loss_fn(log_prob, toks)
        kl_loss = (KL * self.kld_wt)
        loss = recon_loss + kl_loss
        return loss, recon_loss, kl_loss, log_prob
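
set_kl_weight implements a simple KL-annealing schedule: the KL term is essentially switched off for the first 10% of epochs (the burn-in) and then ramped linearly toward 1.0. A self-contained trace of that schedule:

max_epochs = 50
burn_in = int(max_epochs / 10)
eps = 1e-6
for epoch in range(max_epochs):
    kld_wt = ((epoch - burn_in) / (max_epochs - burn_in)) + eps if epoch > burn_in else eps
    if epoch % 10 == 0:
        print(epoch, round(kld_wt, 3))     # prints ~0.0, 0.111, 0.333, 0.556, 0.778; ~0.98 by the last epoch
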
Example #6
 def __init__(self,
              bert_base,
              latent_distrib='vmf',
              wd_embed_dim=300,
              num_units=512,
              n_latent=256,
              max_sent_len=64,
              transformer_layers=6,
              kappa=100.0,
              batch_size=16,
              kld=0.1,
              wd_temp=0.01,
              ctx=mx.cpu(),
              increasing=True,
              decreasing=False,
              prefix=None,
              params=None):
     super(BertTransVAE, self).__init__(prefix=prefix, params=params)
     self.kld_wt = kld
     self.bert = bert_base
     self.n_latent = n_latent
     self.model_ctx = ctx
     self.max_sent_len = max_sent_len
     self.batch_size = batch_size
     self.wd_embed_dim = wd_embed_dim
     self.latent_distrib = latent_distrib
     with self.name_scope():
         if latent_distrib == 'logistic_gaussian':
             self.latent_dist = LogisticGaussianLatentDistribution(n_latent,
                                                                   ctx,
                                                                   dr=0.0)
         elif latent_distrib == 'vmf':
             self.latent_dist = HyperSphericalLatentDistribution(
                 n_latent, kappa=kappa, dr=0.0, ctx=self.model_ctx)
         elif latent_distrib == 'gaussian':
             self.latent_dist = GaussianLatentDistribution(n_latent,
                                                           ctx,
                                                           dr=0.0)
         elif latent_distrib == 'gaussian_unitvar':
             self.latent_dist = GaussianUnitVarLatentDistribution(n_latent,
                                                                  ctx,
                                                                  dr=0.0)
         else:
             raise Exception(
                 "Invalid distribution ==> {}".format(latent_distrib))
         self.decoder = TransformerDecoder(wd_embed_dim=wd_embed_dim,
                                           num_units=num_units,
                                           n_layers=transformer_layers,
                                           n_latent=n_latent,
                                           sent_size=max_sent_len,
                                           batch_size=batch_size,
                                           ctx=ctx)
         self.vocab_size = self.bert.word_embed[0].params.get(
             'weight').shape[0]
         self.out_embedding = gluon.nn.Embedding(
             input_dim=self.vocab_size,
             output_dim=wd_embed_dim,
             weight_initializer=mx.init.Uniform(0.1))
         self.inv_embed = InverseEmbed(batch_size,
                                       max_sent_len,
                                       self.wd_embed_dim,
                                       temp=wd_temp,
                                       ctx=self.model_ctx,
                                       params=self.out_embedding.params)
         self.ce_loss_fn = mx.gluon.loss.SoftmaxCrossEntropyLoss(
             axis=-1, from_logits=True)
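
The reconstruction loss above is built with from_logits=True, which means SoftmaxCrossEntropyLoss expects log-probabilities rather than raw logits, so predictions must first be passed through a log-softmax (as the transformer VAE forward passes in the other examples do). A small self-contained illustration with toy shapes:

import mxnet as mx

loss_fn = mx.gluon.loss.SoftmaxCrossEntropyLoss(axis=-1, from_logits=True)
logits = mx.nd.random.normal(shape=(2, 4, 6))        # (batch, sent_len, vocab)
labels = mx.nd.array([[1, 2, 3, 0], [4, 5, 0, 0]])   # token ids, shape (batch, sent_len)
log_prob = mx.nd.log_softmax(logits, axis=-1)
print(loss_fn(log_prob, labels))                     # one averaged token-level loss per example
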
Example #7
class BowNTM(HybridBlock):
    """
    Parameters
    ----------
    vocabulary : int size of the vocabulary
    enc_dim : int number of dimension of input encoder (first FC layer)
    n_latent : int number of dimensions of the latent dimension (i.e. number of topics)
    gen_layers : int (default = 3) number of generator layers (after sample); size is the same as n_latent
    batch_size : int (default None) provided only at training time (or when model is Hybridized) - otherwise will be inferred
    ctx : context device (default is mx.cpu())
    """
    def __init__(self,
                 vocabulary,
                 enc_dim,
                 n_latent,
                 embedding_size,
                 fixed_embedding=False,
                 latent_distrib='logistic_gaussian',
                 init_l1=0.0,
                 coherence_reg_penalty=0.0,
                 kappa=100.0,
                 target_sparsity=0.0,
                 batch_size=None,
                 wd_freqs=None,
                 seed_mat=None,
                 n_covars=0,
                 ctx=mx.cpu()):
        super(BowNTM, self).__init__()
        self.batch_size = batch_size
        self._orig_batch_size = batch_size
        self.n_latent = n_latent
        self.model_ctx = ctx
        self.vocab_size = len(vocabulary)
        self.coherence_reg_penalty = coherence_reg_penalty
        self.embedding_size = embedding_size
        self.target_sparsity = target_sparsity
        self.vocabulary = vocabulary
        if vocabulary.embedding:
            assert vocabulary.embedding.idx_to_vec[0].size == embedding_size
        with self.name_scope():
            self.l1_pen_const = self.params.get('l1_pen_const',
                                                shape=(1, ),
                                                init=mx.init.Constant(
                                                    [init_l1]),
                                                differentiable=False)
            ## Add in topic seed constraints
            self.seed_matrix = seed_mat
            self.embedding = gluon.nn.Dense(in_units=self.vocab_size,
                                            units=self.embedding_size,
                                            activation='tanh')
            self.encoder = gluon.nn.Dense(
                in_units=(self.embedding_size + n_covars),
                units=enc_dim,
                activation='softrelu')  ## just single FC layer 'encoder'
            if latent_distrib == 'logistic_gaussian':
                self.latent_dist = LogisticGaussianLatentDistribution(
                    n_latent, ctx)
            elif latent_distrib == 'vmf':
                self.latent_dist = HyperSphericalLatentDistribution(
                    n_latent, kappa=kappa, ctx=self.model_ctx)
            elif latent_distrib == 'gaussian':
                self.latent_dist = GaussianLatentDistribution(n_latent, ctx)
            elif latent_distrib == 'gaussian_unitvar':
                self.latent_dist = GaussianUnitVarLatentDistribution(
                    n_latent, ctx)
            else:
                raise Exception(
                    "Invalid distribution ==> {}".format(latent_distrib))
            self.decoder = gluon.nn.Dense(in_units=n_latent,
                                          units=self.vocab_size,
                                          activation=None)
            self.coherence_regularization = CoherenceRegularizer(
                coherence_reg_penalty)
        self.initialize(mx.init.Xavier(), ctx=self.model_ctx)
        if vocabulary.embedding:
            emb = vocabulary.embedding.idx_to_vec.transpose()
            emb_norm_val = mx.nd.norm(emb, keepdims=True, axis=0) + 1e-10
            emb_norm = emb / emb_norm_val
            self.embedding.weight.set_data(emb_norm)
            if fixed_embedding:
                self.embedding.collect_params().setattr('grad_req', 'null')
        ## Initialize and FIX decoder bias terms to corpus frequencies
        if wd_freqs is not None:
            freq_nd = wd_freqs + 1
            total = freq_nd.sum()
            log_freq = freq_nd.log() - freq_nd.sum().log()
            bias_param = self.decoder.collect_params().get('bias')
            bias_param.set_data(log_freq)
            bias_param.grad_req = 'null'
            self.out_bias = bias_param.data()

    def encode_data(self, data):
        """
        Encode data to the mean of the latent distribution defined by the input `data`
        """
        return self.latent_dist.mu_encoder(self.encoder(self.embedding(data)))

    def get_l1_penalty_term(self, F, l1_pen_const, batch_size):
        if F is mx.ndarray:
            dec_weights = self.decoder.params.get('weight').data()
        else:
            dec_weights = self.decoder.params.get('weight').var()
        return l1_pen_const * F.sum(F.abs(dec_weights))

    def add_coherence_reg_penalty(self, F, cur_loss):
        if self.coherence_reg_penalty > 0.0:
            if F is mx.ndarray:
                w = self.decoder.params.get('weight').data()
                emb = self.embedding.params.get('weight').data()
            else:
                w = self.decoder.params.get('weight').var()
                emb = self.embedding.params.get('weight').var()
            c = (self.coherence_regularization(w, emb) *
                 self.coherence_reg_penalty)
            return (cur_loss + c), c
        else:
            #return (cur_loss, None)
            return (cur_loss, F.zeros_like(cur_loss))

    def add_seed_constraint_loss(self, F, cur_loss):
        # G - number of seeded topics
        # S - number of seeds per topic
        # K - number of topics
        if self.seed_matrix is not None:
            if F is mx.ndarray:
                w = self.decoder.params.get('weight').data()
            else:
                w = self.decoder.params.get('weight').var()
            ts = F.take(w, self.seed_matrix)  ## should have shape (G, S, K)
            ts_sums = F.sum(ts, axis=1)  # now (G, K)
            ts_probs = F.softmax(ts_sums, axis=1)
            entropies = -F.sum(
                ts_probs *
                F.log(ts_probs))  ## want to minimize the entropy here
            ## Ensure seed terms have higher weights
            seed_means = F.mean(ts, axis=1)  # (G,K)
            total_means = F.mean(w, axis=0)  # (K,)
            pref_loss = F.relu(
                total_means - seed_means
            )  # penalty if mean weight for topic is greater than seed means
            # minimize weighted entropy over the seed means
            seed_pr = F.softmax(seed_means)
            per_topic_entropy = -F.sum(seed_pr * F.log(seed_pr), axis=0)
            seed_means_pr = F.sum(seed_pr, axis=0)
            per_topic_entropy = F.sum(seed_means_pr * per_topic_entropy)
            entropies = F.add(entropies, F.sum(pref_loss))
            entropies = F.add(entropies, per_topic_entropy)
            return (F.broadcast_add(cur_loss, entropies), entropies)
        else:
            return (cur_loss, F.zeros_like(cur_loss))
            #return (cur_loss, None)

    def general_entropy_min_loss(self, F, cur_loss):
        ## Sharpen each term's topic distribution (4th power of the row-wise softmax over the
        ## decoder weights) and penalize the resulting entropy-like sum to encourage peaked rows
        if F is mx.ndarray:
            w = self.decoder.params.get('weight').data()
        else:
            w = self.decoder.params.get('weight').var()
        w_term_probs = F.softmax(w, axis=1)**4.0
        #w_topic_probs = F.softmax(w, axis=0) ** 2.0
        entropies = -F.sum(w_term_probs * F.log(w_term_probs))
        #entropies = -F.sum(w_topic_probs * F.log(w_topic_probs))
        return (F.broadcast_add(cur_loss, entropies), entropies)

    def run_encode(self, F, in_data, batch_size):
        enc_out = self.encoder(in_data)
        #z_do = self.post_sample_dr_o(z)
        return self.latent_dist(enc_out, batch_size)

    def get_loss_terms(self, F, data, y, KL, l1_pen_const, batch_size):
        l1_pen = self.get_l1_penalty_term(F, l1_pen_const, batch_size)
        recon_loss = -F.sparse.sum(
            data * F.log(y + 1e-12), axis=0, exclude=True)
        i_loss = F.broadcast_plus(recon_loss, F.broadcast_plus(l1_pen, KL))
        ii_loss, coherence_loss = self.add_coherence_reg_penalty(F, i_loss)
        iii_loss, entropies = self.add_seed_constraint_loss(F, ii_loss)
        #iv_loss, entropies = self.general_entropy_min_loss(F, iii_loss)
        return iii_loss, recon_loss, l1_pen, entropies, coherence_loss

    def hybrid_forward(self, F, data, l1_pen_const=None):
        batch_size = data.shape[0] if F is mx.ndarray else self.batch_size
        emb_out = self.embedding(data)
        z, KL = self.run_encode(F, emb_out, batch_size)
        dec_out = self.decoder(z)
        y = F.softmax(dec_out, axis=1)
        iii_loss, recon_loss, l1_pen, entropies, coherence_loss = self.get_loss_terms(
            F, data, y, KL, l1_pen_const, batch_size)
        return iii_loss, KL, recon_loss, l1_pen, entropies, coherence_loss, y
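
The seed-constraint term gathers the decoder weight rows for each group of seed terms, sums them per topic, and penalizes the entropy of the resulting per-group distribution over topics so that each seed group commits to a single topic. A self-contained sketch of that core step (toy sizes; the additional preference and per-topic-entropy terms from add_seed_constraint_loss are omitted):

import mxnet as mx

vocab_size, n_topics = 12, 3
w = mx.nd.random.normal(shape=(vocab_size, n_topics))    # decoder weight: one row per term
seed_matrix = mx.nd.array([[0, 1, 2], [5, 6, 7]])        # G=2 seed groups, S=3 seed terms each
ts = mx.nd.take(w, seed_matrix)                          # (G, S, K) seed-term weights
ts_probs = mx.nd.softmax(mx.nd.sum(ts, axis=1), axis=1)  # (G, K) topic distribution per seed group
entropies = -mx.nd.sum(ts_probs * mx.nd.log(ts_probs))   # minimized so each group concentrates on one topic
print(entropies)
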
Example #8
 def __init__(self,
              bow_vocab_size,
              vocabulary,
              emb_dim,
              latent_distrib='vmf',
              num_units=512,
              hidden_size=512,
              num_heads=4,
              n_latent=256,
              max_sent_len=32,
              transformer_layers=2,
              kappa=100.0,
              batch_size=16,
              kld=0.1,
              wd_freqs=None,
              ctx=mx.cpu(),
              prefix=None,
              params=None):
     super(TransformerBowVEDTest, self).__init__(prefix=prefix,
                                                 params=params)
     self.kld_wt = kld
     self.n_latent = n_latent
     self.model_ctx = ctx
     self.max_sent_len = max_sent_len
     self.vocabulary = vocabulary
     self.batch_size = batch_size
     self.wd_embed_dim = emb_dim
     self.vocab_size = len(vocabulary.idx_to_token)
     self.bow_vocab_size = bow_vocab_size
     self.latent_distrib = latent_distrib
     self.num_units = num_units
     self.hidden_size = hidden_size
     self.num_heads = num_heads
     self.transformer_layers = transformer_layers
     self.kappa = kappa
     with self.name_scope():
         if latent_distrib == 'logistic_gaussian':
             self.latent_dist = LogisticGaussianLatentDistribution(n_latent,
                                                                   ctx,
                                                                   dr=0.0)
         elif latent_distrib == 'vmf':
             self.latent_dist = HyperSphericalLatentDistribution(
                 n_latent, kappa=kappa, ctx=self.model_ctx, dr=0.0)
         elif latent_distrib == 'gaussian':
             self.latent_dist = GaussianLatentDistribution(n_latent,
                                                           ctx,
                                                           dr=0.0)
         elif latent_distrib == 'gaussian_unitvar':
             self.latent_dist = GaussianUnitVarLatentDistribution(n_latent,
                                                                  ctx,
                                                                  dr=0.0,
                                                                  var=0.05)
         else:
             raise Exception(
                 "Invalid distribution ==> {}".format(latent_distrib))
         self.embedding = nn.Dense(in_units=self.bow_vocab_size,
                                   units=self.wd_embed_dim,
                                   activation='tanh')
         self.encoder = nn.Dense(in_units=self.wd_embed_dim,
                                 units=200,
                                 activation='softrelu')
         #self.encoder = TransformerEncoder(self.wd_embed_dim, self.num_units, hidden_size=hidden_size, num_heads=num_heads,
         #                                  n_layers=transformer_layers, n_latent=n_latent, sent_size = max_sent_len,
         #                                  batch_size = batch_size, ctx = ctx)
         self.decoder = gluon.nn.Dense(in_units=n_latent,
                                       units=self.bow_vocab_size,
                                       activation=None)
     self.initialize(mx.init.Xavier(), ctx=self.model_ctx)
     if self.vocabulary.embedding is not None:
         emb = vocabulary.embedding.idx_to_vec
         emb_norm_val = mx.nd.norm(emb, keepdims=True, axis=1) + 1e-10
         emb_norm = emb / emb_norm_val
         self.embedding.weight.set_data(emb_norm)
     if wd_freqs is not None:
         freq_nd = wd_freqs + 1
         total = freq_nd.sum()
         log_freq = freq_nd.log() - freq_nd.sum().log()
         bias_param = self.decoder.collect_params().get('bias')
         bias_param.set_data(log_freq)
         bias_param.grad_req = 'null'
         self.out_bias = bias_param.data()
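
This test variant replaces the transformer encoder with a two-layer feed-forward path over the bag-of-words input: a Dense 'tanh' embedding followed by a Dense 'softrelu' encoder. A self-contained sketch of that path with toy sizes, leaving out the latent distribution and decoder from the repository:

import mxnet as mx
from mxnet import gluon

bow_vocab_size, emb_dim, enc_units = 50, 16, 200
embedding = gluon.nn.Dense(in_units=bow_vocab_size, units=emb_dim, activation='tanh')
encoder = gluon.nn.Dense(in_units=emb_dim, units=enc_units, activation='softrelu')
for blk in (embedding, encoder):
    blk.initialize(mx.init.Xavier())
bow = mx.nd.random.uniform(shape=(4, bow_vocab_size))    # toy batch of bag-of-words vectors
enc = encoder(embedding(bow))                            # (4, enc_units), fed to the latent distribution in the model
print(enc.shape)
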
Example #9
class TransformerBowVED(Block):
    def __init__(self,
                 bow_vocab_size,
                 vocabulary,
                 emb_dim,
                 latent_distrib='vmf',
                 num_units=512,
                 hidden_size=512,
                 num_heads=4,
                 n_latent=256,
                 max_sent_len=32,
                 transformer_layers=2,
                 kappa=100.0,
                 batch_size=16,
                 kld=0.1,
                 wd_freqs=None,
                 ctx=mx.cpu(),
                 prefix=None,
                 params=None):
        super(TransformerBowVED, self).__init__(prefix=prefix, params=params)
        self.kld_wt = kld
        self.n_latent = n_latent
        self.model_ctx = ctx
        self.max_sent_len = max_sent_len
        self.vocabulary = vocabulary
        self.batch_size = batch_size
        self.wd_embed_dim = emb_dim
        self.vocab_size = len(vocabulary.idx_to_token)
        self.bow_vocab_size = bow_vocab_size
        self.latent_distrib = latent_distrib
        self.num_units = num_units
        self.hidden_size = hidden_size
        self.num_heads = num_heads
        self.transformer_layers = transformer_layers
        self.kappa = kappa
        with self.name_scope():
            if latent_distrib == 'logistic_gaussian':
                self.latent_dist = LogisticGaussianLatentDistribution(n_latent,
                                                                      ctx,
                                                                      dr=0.0)
            elif latent_distrib == 'vmf':
                self.latent_dist = HyperSphericalLatentDistribution(
                    n_latent, kappa=kappa, ctx=self.model_ctx, dr=0.0)
            elif latent_distrib == 'gaussian':
                self.latent_dist = GaussianLatentDistribution(n_latent,
                                                              ctx,
                                                              dr=0.0)
            elif latent_distrib == 'gaussian_unitvar':
                self.latent_dist = GaussianUnitVarLatentDistribution(n_latent,
                                                                     ctx,
                                                                     dr=0.0,
                                                                     var=0.05)
            else:
                raise Exception(
                    "Invalid distribution ==> {}".format(latent_distrib))
            self.embedding = nn.Embedding(self.vocab_size, self.wd_embed_dim)
            self.encoder = TransformerEncoder(self.wd_embed_dim,
                                              self.num_units,
                                              hidden_size=hidden_size,
                                              num_heads=num_heads,
                                              n_layers=transformer_layers,
                                              n_latent=n_latent,
                                              sent_size=max_sent_len,
                                              batch_size=batch_size,
                                              ctx=ctx)
            self.decoder = gluon.nn.Dense(in_units=n_latent,
                                          units=self.bow_vocab_size,
                                          activation=None)
        self.initialize(mx.init.Xavier(), ctx=self.model_ctx)
        if self.vocabulary.embedding is not None:
            emb = vocabulary.embedding.idx_to_vec
            emb_norm_val = mx.nd.norm(emb, keepdims=True, axis=1) + 1e-10
            emb_norm = emb / emb_norm_val
            self.embedding.weight.set_data(emb_norm)
        if wd_freqs is not None:
            freq_nd = wd_freqs + 1
            total = freq_nd.sum()
            log_freq = freq_nd.log() - freq_nd.sum().log()
            bias_param = self.decoder.collect_params().get('bias')
            bias_param.set_data(log_freq)
            bias_param.grad_req = 'null'
            self.out_bias = bias_param.data()

    def get_top_k_terms(self, k):
        """
        Returns the top K terms for each topic based on sensitivity analysis. Terms whose 
        probability increases the most for a unit increase in a given topic score/probability
        are those most associated with the topic. This is just the topic-term weights for a 
        linear decoder - but code here will work with arbitrary decoder.
        """
        z = mx.nd.ones(shape=(1, self.n_latent), ctx=self.model_ctx)
        jacobian = mx.nd.zeros(shape=(self.bow_vocab_size, self.n_latent),
                               ctx=self.model_ctx)
        z.attach_grad()
        for i in range(self.bow_vocab_size):
            with mx.autograd.record():
                y = self.decoder(z)
                yi = y[0][i]
            yi.backward()
            jacobian[i] = z.grad
        sorted_j = jacobian.argsort(axis=0, is_ascend=False)
        return sorted_j

    def __call__(self, wp_toks, bow):
        return super(TransformerBowVED, self).__call__(wp_toks, bow)

    def set_kl_weight(self, epoch, max_epochs):
        burn_in = int(max_epochs / 10)
        eps = 1e-6
        if epoch > burn_in:
            self.kld_wt = ((epoch - burn_in) / (max_epochs - burn_in)) + eps
        else:
            self.kld_wt = eps
        return self.kld_wt

    def encode(self, toks):
        embedded = self.embedding(toks)
        enc = self.encoder(embedded)
        return self.latent_dist.mu_encoder(enc)

    def forward(self, toks, bow):
        embedded = self.embedding(toks)
        enc = self.encoder(embedded)
        z, KL = self.latent_dist(enc, self.batch_size)
        y = self.decoder(z)
        y = mx.nd.softmax(y, axis=1)
        rr = bow * mx.nd.log(y + 1e-12)
        recon_loss = -mx.nd.sparse.sum(rr, axis=1)
        KL_loss = (KL * self.kld_wt)
        loss = recon_loss + KL_loss
        return loss, recon_loss, KL_loss, y
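
The forward pass above scores the softmax decoder output against the observed bag-of-words counts; the reconstruction term is the negative log-likelihood of those counts under the predicted term distribution. A self-contained sketch (a dense sum is shown; the sparse sum over axis 1 in the code is equivalent for a 2-D batch):

import mxnet as mx

bow = mx.nd.array([[2, 0, 1], [0, 3, 1]])                      # observed counts, (batch, bow_vocab_size)
y = mx.nd.softmax(mx.nd.random.normal(shape=(2, 3)), axis=1)   # decoder output as a term distribution
recon_loss = -mx.nd.sum(bow * mx.nd.log(y + 1e-12), axis=1)    # one negative log-likelihood per document
print(recon_loss)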