Example #1
def calculate_softmax(prefix_words_id, hps):
    softmax_w = sharded_variable(
        'softmax_w', [hps.vocab_size, hps.projected_size], hps.num_shards)
    softmax_b = tf.get_variable('softmax_b', [hps.vocab_size])

    full_softmax_w = tf.reshape(
        tf.concat(1, softmax_w), [-1, hps.projected_size])
    full_softmax_w = full_softmax_w[:hps.vocab_size, :]

    logits = (tf.matmul(prefix_words_id, full_softmax_w, transpose_b=True) +
              softmax_b)
    softmax = tf.nn.softmax(logits)
    return softmax[len(prefix_words_id)]
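
This and the later examples all depend on a sharded_variable helper that is not reproduced in the snippets. Below is a minimal sketch of what such a helper could look like, assuming the rows are sharded round-robin across num_shards (which is what the concat-and-reshape reconstruction above and tf.nn.embedding_lookup's default 'mod' partition strategy imply); the shard shape and initializer are assumptions, not the original code:

import tensorflow as tf

def sharded_variable(name, shape, num_shards, dtype=tf.float32):
    # Hypothetical sketch: row v of the full [shape[0], shape[1]] matrix is
    # stored in shard v % num_shards at offset v // num_shards.
    shard_size = (shape[0] + num_shards - 1) // num_shards
    initializer = tf.random_uniform_initializer(-0.05, 0.05)  # assumed
    return [tf.get_variable('%s_%d' % (name, i), [shard_size, shape[1]],
                            initializer=initializer, dtype=dtype)
            for i in range(num_shards)]

Because shard_size * num_shards can exceed vocab_size, the reconstructed matrix is sliced back with full_softmax_w[:hps.vocab_size, :] in the examples.
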
Example #2
    def _forward(self, gpu, x, y):
        print("Setting up forward pass on GPU:%d" % gpu)
        hps = self.hps
        self.initial_states = []
        for i in range(hps.num_layers):
            with tf.device("/gpu:%d" % gpu):
                state = (
                    tf.Variable(tf.zeros([hps.batch_size, hps.state_size],
                                         dtype=getdtype(hps, True)),
                                trainable=False,
                                collections=[tf.GraphKeys.LOCAL_VARIABLES],
                                name="state_c_%d_%d" % (gpu, i),
                                dtype=getdtype(hps, True)),
                    tf.Variable(tf.zeros([hps.batch_size, hps.projected_size],
                                         dtype=getdtype(hps, True)),
                                trainable=False,
                                collections=[tf.GraphKeys.LOCAL_VARIABLES],
                                name="state_h_%d_%d" % (gpu, i),
                                dtype=getdtype(hps, True)),
                )
                self.initial_states += [state]

        emb_vars = sharded_variable("emb", [hps.vocab_size, hps.emb_size],
                                    hps.num_shards,
                                    dtype=getdtype(hps, False))

        x = tf.nn.embedding_lookup(emb_vars, x)  # [bs, steps, emb_size]
        if hps.keep_prob < 1.0:
            x = tf.nn.dropout(x, hps.keep_prob)

        inputs = [
            tf.squeeze(input=tf.cast(v, getdtype(hps, True)), axis=[1]) for v
            in tf.split(value=x, num_or_size_splits=hps.num_steps, axis=1)
        ]
        for i in range(hps.num_layers):
            with tf.variable_scope("lstm_%d" % i) as scope:
                if hps.num_of_groups > 1:
                    assert (hps.fact_size is None)
                    print("Using G-LSTM")
                    print("Using %d groups" % hps.num_of_groups)
                    cell = GLSTMCell(num_units=hps.state_size,
                                     num_proj=hps.projected_size,
                                     number_of_groups=hps.num_of_groups)
                else:
                    if hps.fact_size:
                        print("Using F-LSTM")
                        print("Using factorization: %d x %d x %d" %
                              (2 * hps.projected_size, int(
                                  hps.fact_size), 4 * hps.state_size))
                        cell = FLSTMCell(num_units=hps.state_size,
                                         num_proj=hps.projected_size,
                                         factor_size=int(hps.fact_size))
                    else:
                        print("Using LSTMP")
                        cell = LSTMCell(num_units=hps.state_size,
                                        num_proj=hps.projected_size)

                state = tf.contrib.rnn.LSTMStateTuple(
                    self.initial_states[i][0], self.initial_states[i][1])

                if hps.use_residual:
                    cell = ResidualWrapper(cell=cell)

                for t in range(hps.num_steps):
                    if t > 0:
                        scope.reuse_variables()
                    inputs[t], state = cell(inputs[t], state)
                    if hps.keep_prob < 1.0:
                        inputs[t] = tf.nn.dropout(inputs[t], hps.keep_prob)

                with tf.control_dependencies([
                        self.initial_states[i][0].assign(state[0]),
                        self.initial_states[i][1].assign(state[1])
                ]):
                    inputs[t] = tf.identity(inputs[t])

                # inputs = tf.reshape(tf.concat(1, inputs), [-1, hps.projected_size])
        inputs = tf.reshape(tf.concat(inputs, 1), [-1, hps.projected_size])

        # Initialization ignores the fact that softmax_w is transposed. That worked slightly better.
        softmax_w = sharded_variable("softmax_w",
                                     [hps.vocab_size, hps.projected_size],
                                     hps.num_shards)
        softmax_b = tf.get_variable("softmax_b", [hps.vocab_size])

        if hps.num_sampled == 0:
            full_softmax_w = tf.reshape(tf.concat(softmax_w, 1),
                                        [-1, hps.projected_size])
            full_softmax_w = full_softmax_w[:hps.vocab_size, :]

            logits = tf.matmul(tf.to_float(inputs),
                               full_softmax_w,
                               transpose_b=True) + softmax_b
            targets = tf.reshape(y, [-1])
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits, labels=targets)
        else:
            targets = tf.reshape(y, [-1, 1])
            loss = tf.nn.sampled_softmax_loss(softmax_w, softmax_b, targets,
                                              tf.to_float(inputs),
                                              hps.num_sampled, hps.vocab_size)
        #loss = tf.reduce_mean(loss * tf.reshape(w, [-1]))
        loss = tf.reduce_mean(loss)
        return loss
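
For context, here is a hedged sketch of how a scalar loss like the one returned by _forward is typically wired into a training op in TF 1.x graphs of this kind; the optimizer, learning rate, and clipping value are illustrative assumptions, and model, x_batch, and y_batch are assumed to exist in the surrounding script:

import tensorflow as tf

loss = model._forward(gpu=0, x=x_batch, y=y_batch)
tvars = tf.trainable_variables()
grads = tf.gradients(loss, tvars)
grads, _ = tf.clip_by_global_norm(grads, clip_norm=1.0)   # clip value assumed
optimizer = tf.train.AdagradOptimizer(learning_rate=0.2)  # optimizer assumed
train_op = optimizer.apply_gradients(
    list(zip(grads, tvars)),
    global_step=tf.train.get_or_create_global_step())
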
Example #3
    def _forward(self, gpu, x, y, w):
        hps = self.hps
        w = tf.to_float(w)
        self.initial_states = []
        for i in range(hps.num_layers):
            with tf.device("/gpu:%d" % gpu):
                v = tf.Variable(tf.zeros(
                    [hps.batch_size, hps.state_size + hps.projected_size]),
                                trainable=False,
                                collections=[tf.GraphKeys.LOCAL_VARIABLES],
                                name="state_%d_%d" % (gpu, i))
                self.initial_states += [v]

        emb_vars = sharded_variable("emb", [hps.vocab_size, hps.emb_size],
                                    hps.num_shards)

        x = tf.nn.embedding_lookup(emb_vars, x)  # [bs, steps, emb_size]
        if hps.keep_prob < 1.0:
            x = tf.nn.dropout(x, hps.keep_prob)

        inputs = [tf.squeeze(v, [1]) for v in tf.split(1, hps.num_steps, x)]

        for i in range(hps.num_layers):
            with tf.variable_scope("lstm_%d" % i):
                cell = LSTMCell(hps.state_size,
                                hps.emb_size,
                                num_proj=hps.projected_size)

            state = self.initial_states[i]
            for t in range(hps.num_steps):
                inputs[t], state = cell(inputs[t], state)
                if hps.keep_prob < 1.0:
                    inputs[t] = tf.nn.dropout(inputs[t], hps.keep_prob)

            with tf.control_dependencies(
                [self.initial_states[i].assign(state)]):
                inputs[t] = tf.identity(inputs[t])

        inputs = tf.reshape(tf.concat(1, inputs), [-1, hps.projected_size])

        # Initialization ignores the fact that softmax_w is transposed. That worked slightly better.
        softmax_w = sharded_variable("softmax_w",
                                     [hps.vocab_size, hps.projected_size],
                                     hps.num_shards)
        softmax_b = tf.get_variable("softmax_b", [hps.vocab_size])

        if hps.num_sampled == 0:
            full_softmax_w = tf.reshape(tf.concat(1, softmax_w),
                                        [-1, hps.projected_size])
            full_softmax_w = full_softmax_w[:hps.vocab_size, :]

            logits = tf.matmul(inputs, full_softmax_w,
                               transpose_b=True) + softmax_b
            # targets = tf.reshape(tf.transpose(self.y), [-1])
            targets = tf.reshape(y, [-1])
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits, targets)
        else:
            targets = tf.reshape(y, [-1, 1])
            loss = tf.nn.sampled_softmax_loss(softmax_w, softmax_b,
                                              tf.to_float(inputs), targets,
                                              hps.num_sampled, hps.vocab_size)

        loss = tf.reduce_mean(loss * tf.reshape(w, [-1]))
        return loss
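
The only difference from Example #2's reduction is the per-token weight w: positions with weight 0 (e.g. padding) contribute nothing to the sum, although tf.reduce_mean still divides by the total number of positions rather than by the number of real tokens. A tiny NumPy illustration with made-up values:

import numpy as np

token_loss = np.array([2.0, 1.5, 3.0, 0.7])  # per-position cross-entropy (made up)
w = np.array([1.0, 1.0, 1.0, 0.0])           # 0 marks a padded position
print(np.mean(token_loss * w))               # 1.625: only real tokens are summed, but the mean divides by 4
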
Example #4
    def __init__(self, hps):
        self.x = tf.placeholder(tf.int32, [1])
        self.ind = tf.placeholder(tf.int32, name="ind")
        # self.ind_len = tf.placeholder(tf.int32, name="ind_len")
        self.initial_state = tf.Variable(
            tf.zeros([hps.batch_size, hps.state_size + hps.projected_size]),
            trainable=False,
            collections=[tf.GraphKeys.LOCAL_VARIABLES],
            name="state_0_0")

        emb_vars = sharded_variable(
            "emb", [hps.vocab_size, hps.emb_size],
            hps.num_shards)  # vocab_size is too big for this model
        x = tf.nn.embedding_lookup(emb_vars,
                                   self.x)  # [batch_size, steps, emb_size]
        if hps.keep_prob < 1.0:
            x = tf.nn.dropout(x, hps.keep_prob)

        # [batch_size,emb_size]*steps
        #inputs = [tf.squeeze(v, [1]) for v in tf.split(x, hps.num_steps, 1)]
        inputs = x
        with tf.variable_scope("lstm_0"):
            cell = LSTMCell(hps.state_size,
                            hps.emb_size,
                            num_proj=hps.projected_size)

        state = self.initial_state
        # cell() returns the projected output h_t and the updated recurrent state.
        inputs, self.final_state = cell(inputs, state)
        if hps.keep_prob < 1.0:
            inputs = tf.nn.dropout(inputs, hps.keep_prob)
        inputs = tf.reshape(tf.concat(inputs, 1), [-1, hps.projected_size])
        # Initialization ignores the fact that softmax_w is transposed. That worked slightly better.
        softmax_w = sharded_variable("softmax_w",
                                     [hps.vocab_size, hps.projected_size],
                                     hps.num_shards)
        softmax_b = tf.get_variable("softmax_b", [hps.vocab_size])

        full_softmax_w = tf.reshape(tf.concat(softmax_w, 1),
                                    [-1, hps.projected_size])
        full_softmax_w = full_softmax_w[:hps.vocab_size, :]
        # [batch_size*steps,vocab]
        self.logits = tf.matmul(inputs, full_softmax_w,
                                transpose_b=True) + softmax_b
        self.logits = tf.squeeze(self.logits)
        """
        ind_logits = tf.reshape(self.logits, [hps.vocab_size, -1])
        ind_logits = tf.gather(ind_logits, self.ind)
        print  "ind_logits:", self.logits, ind_logits
        ind_logits = tf.reshape(ind_logits, [-1, self.ind_len])
        self.top_k = tf.minimum(self.ind_len, hps.arg_max)
        # for complete
        _, self.ind_index = tf.nn.top_k(ind_logits, self.top_k)
        print  "ind_index:", self.ind_index
        """
        # for predict
        _, self.index = tf.nn.top_k(self.logits, hps.arg_max)
        # for ema
        ema = tf.train.ExponentialMovingAverage(decay=0.999)
        variables_to_average = find_trainable_variables("LSTM")
        self.avg_dict = ema.variables_to_restore(variables_to_average)
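
A hedged sketch of how this single-step inference graph might be driven, assuming the checkpoint was trained with the same ExponentialMovingAverage; checkpoint_path and word_id are placeholders, not names from the original code:

import tensorflow as tf

# Illustrative only; `model` is an instance of the class above.
saver = tf.train.Saver(model.avg_dict)  # maps EMA shadow names to variables
with tf.Session() as sess:
    sess.run(tf.local_variables_initializer())  # the recurrent state is a LOCAL variable
    saver.restore(sess, checkpoint_path)
    top_ids, state = sess.run([model.index, model.final_state],
                              feed_dict={model.x: [word_id]})
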
Example #5
    def _forward(self, gpu, x, y, mode="train"):
        hps = self.hps
        self.initial_states = []
        # every layer has an initial state: tf.zeros([hps.batch_size, hps.state_size + hps.projected_size])
        for i in range(hps.num_layers):
            with tf.device("/gpu:%d" % gpu):
                v = tf.Variable(tf.zeros(
                    [hps.batch_size, hps.state_size + hps.projected_size]),
                                trainable=False,
                                collections=[tf.GraphKeys.LOCAL_VARIABLES],
                                name="state_%d_%d" % (gpu, i))
                #self.initial_states += [v]
                self.initial_states = v  # for layers = 1

        emb_vars = sharded_variable(
            "emb", [hps.vocab_size, hps.emb_size],
            hps.num_shards)  #vocab_size is too big for this model
        x = tf.nn.embedding_lookup(emb_vars,
                                   x)  # [batch_size, steps, emb_size]
        if hps.keep_prob < 1.0:
            x = tf.nn.dropout(x, hps.keep_prob)

        #  [batch_size,emb_size]*steps
        inputs = [tf.squeeze(v, [1]) for v in tf.split(x, hps.num_steps, 1)]

        for i in range(hps.num_layers):
            with tf.variable_scope("lstm_%d" % i):
                cell = LSTMCell(hps.state_size,
                                hps.emb_size,
                                num_proj=hps.projected_size)

            state = self.initial_states
            for t in range(hps.num_steps):  # unroll over time steps
                # cell() returns the projected output h_t and the updated recurrent state.
                inputs[t], state = cell(inputs[t], state)
                if hps.keep_prob < 1.0:
                    inputs[t] = tf.nn.dropout(inputs[t], hps.keep_prob)

            #with tf.control_dependencies([self.initial_states.assign(state)]):   # for bi-lstm? or two layer lstm
            #    inputs[t] = tf.identity(inputs[t])

        self.final_state = state
        # [batch_size*steps,projected_size]
        inputs = tf.reshape(tf.concat(inputs, 1), [-1, hps.projected_size])

        # Initialization ignores the fact that softmax_w is transposed. That worked slightly better.
        softmax_w = sharded_variable("softmax_w",
                                     [hps.vocab_size, hps.projected_size],
                                     hps.num_shards)
        softmax_b = tf.get_variable("softmax_b", [hps.vocab_size])

        if hps.num_sampled == 0:
            full_softmax_w = tf.reshape(tf.concat(softmax_w, 1),
                                        [-1, hps.projected_size])
            full_softmax_w = full_softmax_w[:hps.vocab_size, :]

            # [batch_size*steps,vocab]
            logits = tf.matmul(inputs, full_softmax_w,
                               transpose_b=True) + softmax_b
            print("train log logits:", logits)
            # targets = tf.reshape(tf.transpose(self.y), [-1])
            # index = tf.argmax(logits,axis=1)

            #_, index = tf.nn.top_k(logits, hps.arg_max)
            #print "index:",index

            targets = tf.reshape(y, [-1])
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=targets, logits=logits)
            #loss = tf.reduce_mean(loss * tf.reshape(w, [-1]))
            loss = tf.reduce_mean(loss)

            if mode == "predict_next":
                return loss, logits

        else:
            targets = tf.reshape(y, [-1, 1])

            loss = tf.nn.sampled_softmax_loss(softmax_w, softmax_b, targets,
                                              tf.to_float(inputs),
                                              hps.num_sampled, hps.vocab_size)

        loss = tf.reduce_mean(loss)
        return loss
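
When mode == "predict_next" (and hps.num_sampled == 0, so the full-softmax branch is taken), the method also returns the raw logits, which the caller can turn into next-word candidates. A hedged usage sketch; model, x_batch, y_batch, sess, and feeds are assumed to exist, and k=10 is an arbitrary choice:

import tensorflow as tf

loss, logits = model._forward(gpu=0, x=x_batch, y=y_batch, mode="predict_next")
probs = tf.nn.softmax(logits)            # next-word distribution, [batch_size * steps, vocab_size]
_, top_ids = tf.nn.top_k(logits, k=10)   # indices of the k most likely next words
dist, candidates = sess.run([probs, top_ids], feed_dict=feeds)
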
Example #6
    def _forward(self, gpu, x, y, w):
        hps = self.hps
        w = tf.to_float(w)
        self.initial_states = []
        for i in range(hps.num_layers):
            with tf.device('/gpu:%d' % gpu):
                v = tf.Variable(tf.zeros(
                    [self.batch_size, hps.state_size + hps.projected_size]),
                                trainable=False,
                                collections=['initial_state'],
                                name='state_%d_%d' % (gpu, i),
                                validate_shape=False)
                self.initial_states += [v]

        emb_vars = sharded_variable('emb', [hps.vocab_size, hps.emb_size],
                                    hps.num_shards)

        x = tf.nn.embedding_lookup(emb_vars, x)  # [bs, steps, emb_size]
        if hps.keep_prob < 1.0:
            x = tf.nn.dropout(x, hps.keep_prob)

        inputs = [
            tf.squeeze(v, [1]) for v in tf.split(x, hps.num_steps, axis=1)
        ]

        for i in range(hps.num_layers):
            with tf.variable_scope('lstm_%d' % i):
                cell = LSTMCell(hps.state_size,
                                hps.emb_size,
                                num_proj=hps.projected_size)

            state = self.initial_states[i]
            for t in range(hps.num_steps):
                inputs[t], state = cell(inputs[t], state)
                if hps.keep_prob < 1.0:
                    inputs[t] = tf.nn.dropout(inputs[t], hps.keep_prob)

            with tf.control_dependencies(
                [self.initial_states[i].assign(state)]):
                inputs[t] = tf.identity(inputs[t])

        inputs = tf.reshape(tf.concat(inputs, 1), [-1, hps.projected_size])
        tf.add_to_collection('hidden_output', inputs)

        # Initialization ignores the fact that softmax_w is transposed.
        # That worked slightly better.
        softmax_w = sharded_variable('softmax_w',
                                     [hps.vocab_size, hps.projected_size],
                                     hps.num_shards)
        softmax_b = tf.get_variable('softmax_b', [hps.vocab_size])

        full_softmax_w = tf.reshape(tf.concat(softmax_w, 1),
                                    [-1, hps.projected_size])
        full_softmax_w = full_softmax_w[:hps.vocab_size, :]

        logits = (tf.matmul(inputs, full_softmax_w, transpose_b=True) +
                  softmax_b)
        softmax = tf.nn.softmax(logits)
        tf.add_to_collection('softmax', softmax)
        if hps.num_sampled == 0:
            # targets = tf.reshape(tf.transpose(self.y), [-1])
            targets = tf.reshape(y, [-1])
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=targets, logits=logits)
        else:
            targets = tf.reshape(y, [-1, 1])
            loss = tf.nn.sampled_softmax_loss(weights=softmax_w,
                                              biases=softmax_b,
                                              labels=targets,
                                              inputs=tf.to_float(inputs),
                                              num_sampled=hps.num_sampled,
                                              num_classes=hps.vocab_size)

        loss = tf.reduce_mean(loss * tf.reshape(w, [-1]))
        return loss
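
Unlike the other variants, this one stashes intermediate tensors in graph collections ('hidden_output', 'softmax') and keeps the recurrent state in a custom 'initial_state' collection instead of LOCAL_VARIABLES. A hedged sketch of retrieving them later, assuming the graph above has been built and a Session sess with a populated feed_dict already exists:

import tensorflow as tf

softmax_t = tf.get_collection('softmax')[0]        # [batch_size * steps, vocab_size]
hidden_t = tf.get_collection('hidden_output')[0]   # [batch_size * steps, projected_size]
# Variables created with collections=['initial_state'] are not covered by the
# global/local initializers, so initialize them explicitly.
sess.run(tf.variables_initializer(tf.get_collection('initial_state')))
probs, hidden = sess.run([softmax_t, hidden_t], feed_dict=feed_dict)
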