Пример #1
0
 def nce_loss(labels, weights, bias, predict):
     noise = uniform_sampler(labels, 1, nce_samples, True, vocab_size)
     loss = tf.nn.nce_loss(weights=weights,
                           biases=bias,
                           inputs=predict,
                           labels=labels,
                           num_sampled=nce_samples,
                           num_classes=vocab_size,
                           num_true=1,
                           sampled_values=noise)
     return tf.reduce_mean(loss)
Пример #2
0
    def __init__(self,
                 ctx_size,
                 vocab_size,
                 embed_dim,
                 embed_init=tx.random_uniform(minval=-0.01, maxval=0.01),
                 x_to_f_init=tx.random_uniform(minval=-0.01, maxval=0.01),
                 logit_init=tx.random_uniform(minval=-0.01, maxval=0.01),
                 embed_share=True,
                 use_gate=True,
                 use_hidden=False,
                 h_dim=100,
                 h_activation=tx.elu,
                 h_init=tx.he_normal_init(),
                 h_to_f_init=tx.random_uniform(minval=-0.01, maxval=0.01),
                 use_dropout=True,
                 embed_dropout=False,
                 keep_prob=0.95,
                 l2_loss=False,
                 l2_loss_coef=1e-5,
                 use_nce=False,
                 nce_samples=100):

        # GRAPH INPUTS
        run_inputs = tx.Input(ctx_size, dtype=tf.int32, name="input")
        loss_inputs = tx.Input(n_units=1, dtype=tf.int32, name="target")
        eval_inputs = loss_inputs

        # RUN GRAPH
        # if I create a scope here the Tensorboard graph will be a mess to read
        # because it groups everything by nested scope names
        # instead if I choose to create different scopes for train and eval only
        # the graph stays readable because it allows us to use the same names
        # under different scopes while still sharing variables
        var_reg = []
        with tf.name_scope("run"):
            feature_lookup = tx.Lookup(run_inputs,
                                       ctx_size, [vocab_size, embed_dim],
                                       embed_init,
                                       name="lookup")
            var_reg.append(feature_lookup.weights)
            feature_lookup = feature_lookup.as_concat()

            if use_gate or use_hidden:
                hl = tx.Linear(feature_lookup,
                               h_dim,
                               h_init,
                               bias=True,
                               name="h_linear")
                ha = tx.Activation(hl, h_activation, name="h_activation")
                h = tx.Compose(hl, ha, name="hidden")
                var_reg.append(hl.weights)

            features = feature_lookup
            if use_gate:
                gate_w = tx.Linear(h, ctx_size, bias=True)
                gate = tx.Gate(features, gate_input=gate_w)

                # gate = tx.Module([h, features], gate)

                features = gate
                var_reg.append(gate_w.weights)

            x_to_f = tx.Linear(features,
                               embed_dim,
                               x_to_f_init,
                               bias=True,
                               name="x_to_f")
            var_reg.append(x_to_f.weights)
            f_prediction = x_to_f

            if use_hidden:
                h_to_f = tx.Linear(h,
                                   embed_dim,
                                   h_to_f_init,
                                   bias=True,
                                   name="h_to_f")
                var_reg.append(h_to_f.weights)
                f_prediction = tx.Add(x_to_f, h_to_f, name="f_predicted")

            # RI DECODING ===============================================
            shared_weights = tf.transpose(
                feature_lookup.weights) if embed_share else None
            logit_init = logit_init if not embed_share else None
            run_logits = tx.Linear(f_prediction,
                                   vocab_size,
                                   logit_init,
                                   shared_weights,
                                   bias=True,
                                   name="logits")
            if not embed_share:
                var_reg.append(run_logits.weights)
            y_prob = tx.Activation(run_logits, tx.softmax)

        # TRAIN GRAPH ===============================================
        with tf.name_scope("train"):
            if use_dropout and embed_dropout:
                feature_lookup = feature_lookup.reuse_with(run_inputs)
                features = tx.Dropout(feature_lookup, probability=keep_prob)
            else:
                features = feature_lookup

            if use_gate or use_hidden:
                if use_dropout:
                    h = h.reuse_with(features)
                    h = tx.Dropout(h, probability=keep_prob)

                if use_gate:
                    gate_w = gate_w.reuse_with(h)
                    features = gate.reuse_with(layer=features,
                                               gate_input=gate_w)

                f_prediction = x_to_f.reuse_with(features)

                if use_hidden:
                    h_to_f = h_to_f.reuse_with(h)
                    if use_dropout:
                        h_to_f = tx.Dropout(h_to_f, probability=keep_prob)
                    f_prediction = tx.Add(f_prediction, h_to_f)
            else:
                f_prediction = f_prediction.reuse_with(features)

            train_logits = run_logits.reuse_with(f_prediction)

            if use_nce:
                # uniform gets good enough results if enough samples are used
                # but we can load the empirical unigram distribution
                # or learn the unigram distribution during training
                sampled_values = uniform_sampler(loss_inputs.tensor, 1,
                                                 nce_samples, True, vocab_size)
                train_loss = tf.nn.nce_loss(weights=tf.transpose(
                    train_logits.weights),
                                            biases=train_logits.bias,
                                            inputs=f_prediction.tensor,
                                            labels=loss_inputs.tensor,
                                            num_sampled=nce_samples,
                                            num_classes=vocab_size,
                                            num_true=1,
                                            sampled_values=sampled_values)
            else:
                one_hot = tx.dense_one_hot(column_indices=loss_inputs.tensor,
                                           num_cols=vocab_size)
                train_loss = tx.categorical_cross_entropy(
                    one_hot, train_logits.tensor)

            train_loss = tf.reduce_mean(train_loss)

            if l2_loss:
                losses = [tf.nn.l2_loss(var) for var in var_reg]
                train_loss = train_loss + l2_loss_coef * tf.add_n(losses)

        # EVAL GRAPH ===============================================
        with tf.name_scope("eval"):
            one_hot = tx.dense_one_hot(column_indices=eval_inputs.tensor,
                                       num_cols=vocab_size)
            eval_loss = tx.categorical_cross_entropy(one_hot,
                                                     run_logits.tensor)
            eval_loss = tf.reduce_mean(eval_loss)

        # SETUP MODEL CONTAINER ====================================
        super().__init__(run_inputs=run_inputs,
                         run_outputs=y_prob,
                         train_inputs=run_inputs,
                         train_outputs=y_prob,
                         eval_inputs=run_inputs,
                         eval_outputs=y_prob,
                         train_out_loss=train_loss,
                         train_in_loss=loss_inputs,
                         eval_out_score=eval_loss,
                         eval_in_score=eval_inputs)
from matplotlib.colors import BoundaryNorm
from matplotlib.ticker import MaxNLocator
import numpy as np
import pandas as pd

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

labels = np.array([[0], [2]])
num_samples = 2
num_classes = 10

tf.enable_eager_execution()

sampled_indices = uniform_sampler(true_classes=labels,
                                  num_true=1,
                                  num_sampled=num_samples,
                                  unique=False,
                                  range_max=num_classes,
                                  seed=None)

with tf.device("/gpu:0"):
    noise_ids, target_noise_prob, noise_prob = sampled_indices

print(noise_ids)
print(target_noise_prob / num_samples)
print(noise_prob / num_samples)

print()

v = 1000
k = 10  # [1,v-1]
batch_size = 128