Exemplo n.º 1
0
# Fragment of an LSTM language-model graph: projects manually unrolled LSTM
# outputs onto the vocabulary. `conf`, `embedding_matrix`, `data` and
# `next_word` are defined outside this fragment.

# Output projection parameters: hidden state -> vocabulary logits.
output_matrix = tf.get_variable("output", [conf.num_hidden_state, conf.vocab_size], 
                    tf.float32, initializer=xavier_initializer())
# NOTE(review): the 1-D bias is also Xavier-initialized rather than zero-initialized
# -- confirm this is intended.
output_bias = tf.get_variable("bias", [conf.vocab_size], 
                    tf.float32, initializer=xavier_initializer())

# embedding lookup
word_embeddings = tf.nn.embedding_lookup(embedding_matrix, data) # shape: (64, 29, 1, 100)
# Collapse to (batch_size, seq_length - 1, embed_size); the example shapes in the
# comments assume batch_size=64, seq_length=30, embed_size=100 -- TODO confirm.
word_embeddings = tf.reshape(word_embeddings, [conf.batch_size, conf.seq_length -1, conf.embed_size]) #shape: (64, 29, 100)
assert word_embeddings.shape == (conf.batch_size, conf.seq_length - 1, conf.embed_size)

# RNN unrolling
print("creating RNN")
lstm_outputs = []
with tf.variable_scope("rnn") as scope:
    cell = LSTMCell(conf.num_hidden_state)
    state = cell.zero_state(conf.batch_size, tf.float32)
    # One cell call per time step, threading `state` from step to step.
    for i in range(conf.seq_length - 1):
        if i > 0:
            # After the first step, reuse the variables created in this scope so
            # every time step shares the same LSTM weights.
            scope.reuse_variables()
        lstm_output, state = cell(word_embeddings[:, i, :], state)
        lstm_outputs.append(lstm_output)

# stack the outputs together, reshape, multiply
# Stacking on axis 1 yields (batch_size, seq_length - 1, num_hidden_state).
lstm_outputs = tf.stack(lstm_outputs, axis = 1)
# Flatten batch and time so a single matmul produces the logits for every position.
lstm_outputs = tf.reshape(lstm_outputs, [conf.batch_size * (conf.seq_length - 1), conf.num_hidden_state])
assert lstm_outputs.shape == (conf.batch_size * (conf.seq_length - 1), conf.num_hidden_state)
# Unnormalized vocabulary scores, one row per (batch, time) position.
predictions = tf.matmul(lstm_outputs, output_matrix) + output_bias

# reshape the labels
# Flattened to one label per row of `predictions`.
labels = tf.reshape(next_word, [conf.batch_size * (conf.seq_length - 1)])
Exemplo n.º 2
0
# tf.matmul(convW,encoder_final_state)

# Decoder
# Chooses the decoder cell. `mode`, `encoder_choice`, `encoder_outputs`,
# `enc_seqLen`, `decoder_hidden_units`, `batchSize` and `dec_vocab_size` are
# defined outside this fragment.
if mode == 1:
    # Attention decoder.
    # Swap the first two axes of the encoder outputs; presumably converts
    # time-major to batch-major for the attention memory -- TODO confirm.
    attention_states = tf.transpose(encoder_outputs, [1, 0, 2])
    attention_mechanism = tf.contrib.seq2seq.LuongAttention(
        decoder_hidden_units,
        attention_states,
        memory_sequence_length=enc_seqLen)

    decoder_cell = LSTMCell(decoder_hidden_units)
    decoder_cell = tf.contrib.seq2seq.AttentionWrapper(decoder_cell,
                                                       attention_mechanism,
                                                       attention_layer_size=2 *
                                                       decoder_hidden_units)
    # NOTE(review): this rebinds encoder_final_state to the wrapper's zero state,
    # so whatever the encoder produced under this name is discarded in
    # attention mode -- confirm this is intended.
    encoder_final_state = decoder_cell.zero_state(dtype=tf.float32,
                                                  batch_size=batchSize)
else:
    # Plain decoder: the cell width depends on the encoder variant.
    if encoder_choice == 0:
        # presumably a bidirectional encoder whose concatenated state is twice
        # as wide -- TODO confirm
        decoder_cell = LSTMCell(2 * decoder_hidden_units)
    else:
        decoder_cell = LSTMCell(decoder_hidden_units)

# Bias-free dense layer with dec_vocab_size output units.
projection_layer = layers_core.Dense(dec_vocab_size,
                                     use_bias=False,
                                     name="output_projection")

# Attention Is All You Need

#Training Helper
helper_1 = tf.contrib.seq2seq.TrainingHelper(inputs=dec_embedded_input,
                                             sequence_length=dec_seqLen,
Exemplo n.º 3
0
running-time. The output shape for different early_stop is the same; hence,
it is not clear what the value of the output is for the timesteps which were
not computed.
"""
import tensorflow as tf
from tensorflow.contrib.rnn import static_rnn
from tensorflow.contrib.rnn import LSTMCell
from time import time
import numpy as np

# Script fragment: builds a 200-step static_rnn over a fixed-shape input and
# passes a placeholder as `sequence_length`.
initializer = tf.random_uniform_initializer(-1, 1)
# Fixed-shape input; given the split below and zero_state(10), the axes look
# like (time=200, batch=10, features=2048).
seq_input = tf.placeholder(shape=[200, 10, 2048], dtype=tf.float32)
# Split along axis 0 into 200 slices of shape (1, 10, 2048) and drop the
# leading axis, giving the per-step list of (10, 2048) tensors static_rnn expects.
inputs = [ts[0] for ts in tf.split(seq_input, 200, axis=0)]
# NOTE(review): declared without a shape; static_rnn documents sequence_length
# as a vector of size [batch_size] -- confirm what is fed here.
early_stop = tf.placeholder(dtype=tf.int32)
cell = LSTMCell(2048, initializer=initializer)
initial_state = cell.zero_state(10, dtype=tf.float32)
outputs, states = static_rnn(
    cell,
    inputs,
    initial_state=initial_state,
    sequence_length=early_stop
    )


sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
saver = tf.train.Saver()


# Random input with the same shape as `seq_input`.
input_values = np.random.uniform(size=(200, 10, 2048)).astype('float32')
def build_inference_graph(params_dict):
    """Build the greedy-decoding (inference) graph of the seq2seq model.

    The graph embeds the document tokens, combines them with ``answer_masks``
    through a batched matmul, runs a stack of bidirectional LSTM layers and
    decodes greedily with a Luong-attention LSTM decoder for at most 16 steps.

    Relies on the module-level names ``glove``, ``START_TOKEN`` and
    ``END_TOKEN``, which are defined outside this function.

    Args:
        params_dict: mapping that provides ``"num_encoder_layers"``,
            ``"lstm_units"`` and ``"batch_size"``.

    Returns:
        dict with the placeholders to feed (``"keep_prob"``,
        ``"document_tokens"``, ``"answer_masks"``, ``"encoder_lengths"``) and
        the result tensors ``"decoder_outputs"`` (projected decoder outputs)
        and ``"prob_logits"`` (softmax of those outputs over the last axis).
    """
    # Todo: Check if load the glove is faster than import it from embedding
    # glove = np.load('GloVe/glove.npy')

    batch_size = params_dict["batch_size"]
    lstm_units = params_dict["lstm_units"]

    # Building the Embedding layer + placeholders
    keep_prob = tf.placeholder(tf.float32)
    embedding = tf.get_variable("embedding", initializer=glove, trainable=True)
    document_tokens = tf.placeholder(tf.int32, shape=[None, None], name="document_tokens")
    document_emb = tf.nn.embedding_lookup(embedding, document_tokens)
    answer_masks = tf.placeholder(tf.float32, shape=[None, None, None], name="answer_masks")
    encoder_lengths = tf.placeholder(tf.int32, shape=[None], name="encoder_lengths")
    # Output layer with one unit per row of the embedding table.
    projection = Dense(embedding.shape[0], use_bias=False)

    # The start-token vector must have the same batch size as the decoder's
    # initial state built below, so it is sized from params_dict like every
    # other batch-dependent tensor here instead of from a module-level global.
    helper = seq2seq.GreedyEmbeddingHelper(embedding,
                                           tf.fill([batch_size], START_TOKEN),
                                           END_TOKEN)

    # Building the Encoder
    # Batched matmul of the masks with the document embeddings; presumably
    # selects/pools the answer tokens -- TODO confirm against the data pipeline.
    encoder_inputs = tf.matmul(answer_masks, document_emb, name="encoder_inputs")

    output = encoder_inputs
    for n in range(params_dict["num_encoder_layers"]):
        cell_fw = LSTMCell(lstm_units, forget_bias=1.0, state_is_tuple=True)
        cell_bw = LSTMCell(lstm_units, forget_bias=1.0, state_is_tuple=True)
        cell_fw = DropoutWrapper(cell_fw, output_keep_prob=keep_prob)
        cell_bw = DropoutWrapper(cell_bw, output_keep_prob=keep_prob)

        state_fw = cell_fw.zero_state(batch_size, tf.float32)
        state_bw = cell_bw.zero_state(batch_size, tf.float32)

        (output_fw, output_bw), encoder_state = tf.nn.bidirectional_dynamic_rnn(
            cell_fw,
            cell_bw,
            output,
            initial_state_fw=state_fw,
            initial_state_bw=state_bw,
            sequence_length=encoder_lengths,
            dtype=tf.float32,
            scope='encoder_rnn_' + str(n))
        # Each layer consumes the concatenated fw/bw outputs of the previous one.
        output = tf.concat([output_fw, output_bw], axis=2)

    encoder_final_output = output
    # Merge the last layer's forward and backward states (index 0 = c, 1 = h)
    # into a single state of width 2 * lstm_units for the decoder.
    encoder_state_c = tf.concat((encoder_state[0][0], encoder_state[1][0]), -1)
    encoder_state_h = tf.concat((encoder_state[0][1], encoder_state[1][1]), -1)
    encoder_final_state = LSTMStateTuple(encoder_state_c, encoder_state_h)

    # Attention mechanism
    attention_mechanism = seq2seq.LuongAttention(
        num_units=lstm_units * 2,
        memory=encoder_final_output,
        memory_sequence_length=encoder_lengths)

    # Building the Decoder
    temp_cell = LSTMCell(lstm_units * 2, forget_bias=1.0)
    temp_cell = DropoutWrapper(temp_cell, output_keep_prob=keep_prob)
    decoder_cell = seq2seq.AttentionWrapper(
        cell=temp_cell,
        attention_mechanism=attention_mechanism,
        attention_layer_size=lstm_units * 2)

    # Start from the wrapper's zero state, with the inner cell state replaced
    # by the merged encoder state.
    decoder = seq2seq.BasicDecoder(
        cell=decoder_cell,
        helper=helper,
        initial_state=decoder_cell.zero_state(batch_size, tf.float32).clone(
            cell_state=encoder_final_state),
        output_layer=projection)

    decoder_outputs, _, _ = seq2seq.dynamic_decode(decoder, maximum_iterations=16)
    decoder_outputs = decoder_outputs.rnn_output

    # Softmax over the last axis turns the projected outputs into probabilities.
    prob_logits = tf.nn.softmax(decoder_outputs, axis=-1)

    return {
        "keep_prob": keep_prob,
        "document_tokens": document_tokens,
        "answer_masks": answer_masks,
        "encoder_lengths": encoder_lengths,
        "decoder_outputs": decoder_outputs,
        "prob_logits": prob_logits
    }
def build_train_graph(params_dict):
    """Build the teacher-forced training graph of the seq2seq model.

    Mirrors the inference graph (embedding lookup, stacked bidirectional LSTM
    encoder, Luong-attention LSTM decoder) but decodes with a TrainingHelper
    and adds a length-masked sequence loss. Uses the module-level ``glove``
    value as the embedding initializer.

    Args:
        params_dict: mapping that provides ``"num_encoder_layers"``,
            ``"lstm_units"`` and ``"batch_size"``.

    Returns:
        dict of the placeholders to feed plus ``"training_logits"``,
        ``"prob_logits"`` (softmax of the logits) and ``"loss"``.
    """
    int_matrix = dict(dtype=tf.int32, shape=[None, None])
    int_vector = dict(dtype=tf.int32, shape=[None])

    # --- Embedding table and placeholders -------------------------------
    keep_prob = tf.placeholder(tf.float32)
    embedding = tf.get_variable("embedding", initializer=glove, trainable=True)
    document_tokens = tf.placeholder(name="document_tokens", **int_matrix)
    document_emb = tf.nn.embedding_lookup(embedding, document_tokens)
    answer_masks = tf.placeholder(
        tf.float32, shape=[None, None, None], name="answer_masks")
    decoder_inputs = tf.placeholder(name="decoder_inputs", **int_matrix)
    decoder_labels = tf.placeholder(name="decoder_labels", **int_matrix)
    decoder_lengths = tf.placeholder(name="decoder_lengths", **int_vector)
    encoder_lengths = tf.placeholder(name="encoder_lengths", **int_vector)
    decoder_emb = tf.nn.embedding_lookup(embedding, decoder_inputs)
    # 1.0 for real target positions, 0.0 for padding; used to weight the loss.
    question_mask = tf.sequence_mask(decoder_lengths, dtype=tf.float32)
    projection = Dense(embedding.shape[0], use_bias=False)

    training_helper = seq2seq.TrainingHelper(
        inputs=decoder_emb, sequence_length=decoder_lengths, time_major=False)

    # --- Encoder: stacked bidirectional LSTM ------------------------------
    layer_input = tf.matmul(answer_masks, document_emb, name="encoder_inputs")

    for layer_idx in range(params_dict["num_encoder_layers"]):
        forward = LSTMCell(
            params_dict["lstm_units"], forget_bias=1.0, state_is_tuple=True)
        backward = LSTMCell(
            params_dict["lstm_units"], forget_bias=1.0, state_is_tuple=True)
        forward = DropoutWrapper(forward, output_keep_prob=keep_prob)
        backward = DropoutWrapper(backward, output_keep_prob=keep_prob)

        forward_init = forward.zero_state(params_dict["batch_size"], tf.float32)
        backward_init = backward.zero_state(params_dict["batch_size"], tf.float32)

        directional_outputs, encoder_state = tf.nn.bidirectional_dynamic_rnn(
            forward,
            backward,
            layer_input,
            initial_state_fw=forward_init,
            initial_state_bw=backward_init,
            sequence_length=encoder_lengths,
            dtype=tf.float32,
            scope='encoder_rnn_' + str(layer_idx))
        fw_out, bw_out = directional_outputs
        # The next layer reads both directions side by side.
        layer_input = tf.concat([fw_out, bw_out], axis=2)

    encoder_memory = layer_input
    # Last layer's states: index 0 is the forward direction, 1 the backward;
    # within each, element 0 is c and element 1 is h.
    fw_state, bw_state = encoder_state[0], encoder_state[1]
    merged_c = tf.concat((fw_state[0], bw_state[0]), -1)
    merged_h = tf.concat((fw_state[1], bw_state[1]), -1)
    encoder_final_state = LSTMStateTuple(merged_c, merged_h)

    # --- Attention + decoder ----------------------------------------------
    attention_mechanism = seq2seq.LuongAttention(
        num_units=params_dict["lstm_units"] * 2,
        memory=encoder_memory,
        memory_sequence_length=encoder_lengths)

    inner_cell = DropoutWrapper(
        LSTMCell(params_dict["lstm_units"] * 2, forget_bias=1.0),
        output_keep_prob=keep_prob)
    decoder_cell = seq2seq.AttentionWrapper(
        cell=inner_cell,
        attention_mechanism=attention_mechanism,
        attention_layer_size=params_dict["lstm_units"] * 2)

    # Zero attention state with the inner cell state seeded from the encoder.
    zero_state = decoder_cell.zero_state(params_dict["batch_size"], tf.float32)
    training_decoder = seq2seq.BasicDecoder(
        cell=decoder_cell,
        helper=training_helper,
        initial_state=zero_state.clone(cell_state=encoder_final_state),
        output_layer=projection)

    decoded, _, _ = seq2seq.dynamic_decode(
        decoder=training_decoder,
        impute_finished=True,
        maximum_iterations=tf.reduce_max(decoder_lengths))

    training_logits = decoded.rnn_output
    # Softmax over the last axis gives per-token probabilities.
    prob_logits = tf.nn.softmax(training_logits, axis=-1)

    loss = seq2seq.sequence_loss(
        logits=training_logits,
        targets=decoder_labels,
        weights=question_mask,
        name="loss")

    graph_handles = {
        "keep_prob": keep_prob,
        "document_tokens": document_tokens,
        "answer_masks": answer_masks,
        "encoder_lengths": encoder_lengths,
        "decoder_inputs": decoder_inputs,
        "decoder_labels": decoder_labels,
        "decoder_lengths": decoder_lengths,
        "training_logits": training_logits,
        "prob_logits": prob_logits,
        "loss": loss
    }
    return graph_handles
Exemplo n.º 6
0
def runMoreLstm(path=None, epochs=10, saveResult=True):
	"""Continue training the saved 'lstm_ped' LSTM language model.

	Rebuilds the graph from the hyper-parameters returned by
	``loadInfo('lstm_ped', path)``, restores the session with ``loadSession``,
	trains for ``epochs`` additional epochs while evaluating on the test data
	in lock-step, then updates ``info`` and plots both perplexity curves.

	Args:
		path: forwarded to ``loadInfo`` / ``loadSession``.
		epochs: number of additional epochs to run.
		saveResult: when true, ``save(sess, info)`` is called after training.
	"""
	trainData, validData, testData, wordId = loadWordIdsFromFiles()
	trainData = np.array(trainData, np.float32)
	# validData = np.array(validData, np.float32)
	testData = np.array(testData, np.float32)
	vocabSz = len(wordId)

	info = loadInfo('lstm_ped', path)
	learnRate = info['learning rate']
	batchSz = info['batch size']
	embedSz = info['embed size']
	rnnSz = info['rnn size']
	winSz = info['win size']
	numWin = (trainData.shape[0] - 1) // (batchSz * winSz)
	# each batch has winSz * numWin words
	batchLen = winSz * numWin

	testNumWin = (testData.shape[0] - 1) // (batchSz * winSz)
	testBatchLen = winSz * testNumWin

	inp = tf.placeholder(tf.int32, shape=[batchSz, winSz])
	ans = tf.placeholder(tf.int32, shape=[batchSz * winSz])

	E = tf.Variable(tf.random_normal([vocabSz, embedSz], stddev=0.1))
	embed = tf.nn.embedding_lookup(E, inp)

	rnn = LSTMCell(rnnSz)
	initialState = rnn.zero_state(batchSz, tf.float32)
	output, nextState = tf.nn.dynamic_rnn(rnn, embed, initial_state=initialState)
	output = tf.reshape(output, [batchSz * winSz, rnnSz])

	W = tf.Variable(tf.random_normal([rnnSz, vocabSz], stddev=.1))
	B = tf.Variable(tf.random_normal([vocabSz], stddev=.1))
	logits = tf.matmul(output, W) + B

	ents = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=ans)
	loss = tf.reduce_sum(ents)
	train = tf.train.GradientDescentOptimizer(learnRate).minimize(loss)

	# Slot 0 holds the perplexity stored in `info`; slots 1..epochs are filled
	# with summed losses below and converted to perplexities after training.
	trainPerp = np.zeros(epochs + 1, dtype=np.float32)
	trainPerp[0] = info['train perplexity']
	testPerp = np.zeros(epochs + 1, dtype=np.float32)
	testPerp[0] = info['test perplexity']
	with tf.Session() as sess:
		loadSession(sess, 'lstm_ped', path)
		startTime = time.time()
		epoch = 0
		print('epoch:', end=' ')
		while epoch < epochs:
			epoch += 1
			win = 0
			# Each epoch starts both streams from the zero state.
			state = sess.run(initialState)
			testState = sess.run(initialState)
			# print(state, testState)
			winStart, winEnd = 0, winSz
			while win < numWin:
				# Row i of the batch reads the i-th contiguous slice of the corpus;
				# the targets are the same windows shifted by one word.
				inInp = np.array([trainData[i * batchLen + winStart:i * batchLen + winEnd] for i in range(batchSz)])
				inAns = np.reshape(np.array([trainData[i * batchLen + winStart + 1: i * batchLen + winEnd + 1] for i in range(batchSz)]), batchSz * winSz)
				# Feed the carried state into the RNN's *initial* state. Feeding
				# `nextState` would only override the fetched output and leave the
				# RNN starting from zeros on every window.
				_, state, outLoss = sess.run([train, nextState, loss], {inp: inInp, ans: inAns, initialState: state})
				trainPerp[epoch] += outLoss
				# NOTE(review): test windows are only evaluated while win < numWin, so
				# fewer than testNumWin windows are scored if numWin < testNumWin.
				if win < testNumWin:
					inInp = np.array([testData[i * testBatchLen + winStart:i * testBatchLen + winEnd] for i in range(batchSz)])
					inAns = np.reshape(np.array([testData[i * testBatchLen + winStart + 1: i * testBatchLen + winEnd + 1] for i in range(batchSz)]), batchSz * winSz)
					testState, testOutLoss = sess.run([nextState, loss], {inp: inInp, ans: inAns, initialState: testState})
					testPerp[epoch] += testOutLoss
				winStart, winEnd = winEnd, winEnd + winSz
				win += 1
			print(epoch + info['epochs'], end=' ')
		# Perplexity = exp(summed loss / word count); the denominator is the data
		# size rounded down to a multiple of batchSz * batchLen.
		trainPerp[1:] = np.exp(trainPerp[1:] / (trainData.shape[0] // (batchSz * batchLen) * (batchSz * batchLen)))
		testPerp[1:] = np.exp(testPerp[1:] / (testData.shape[0] // (batchSz * testBatchLen) * (batchSz * testBatchLen)))
		print(f'\nelapsed: {time.time() - startTime}')
		print('train perplexity:', trainPerp[-1])
		print('test perplexity:', testPerp[-1])

		info['epochs'] += epochs
		info['train perplexity'] = trainPerp[-1]
		info['test perplexity'] = testPerp[-1]
		if saveResult:
			save(sess, info)
	drawPerplexity(trainPerp, testPerp, info['epochs'] - epochs)
Exemplo n.º 7
0
    def create_critic_network(self, Scope):
        """Build the critic graph (lower LSTM + upper bidirectional RNN) for one scope.

        Every variable scope opened here uses ``reuse=True``, so the LSTM
        variables must already exist under these scope names (presumably
        created elsewhere in the class -- verify).

        :param Scope: scope name; when its last character is ``'e'`` the lookup
            uses ``self.wordvector``, otherwise ``self.target_wordvector``.
        :return: tuple ``(inputs, action, action_pos, lenth, lenth_up, out)``:
            the five placeholders followed by the prediction tensor.
        """
        # All placeholders have a leading dimension of 1.
        inputs = tf.placeholder(shape=[1, self.max_lenth],
                                dtype=tf.int32,
                                name="inputs")
        action = tf.placeholder(shape=[1, self.max_lenth],
                                dtype=tf.int32,
                                name="action")
        action_pos = tf.placeholder(shape=[1, None],
                                    dtype=tf.int32,
                                    name="action_pos")
        # NOTE(review): `lenth` is returned but not otherwise used in this graph.
        lenth = tf.placeholder(shape=[1], dtype=tf.int32, name="lenth")
        lenth_up = tf.placeholder(shape=[1], dtype=tf.int32, name="lenth_up")

        #Lower network
        if Scope[-1] == 'e':
            vec = tf.nn.embedding_lookup(self.wordvector, inputs)
            print("active")
        else:
            vec = tf.nn.embedding_lookup(self.target_wordvector, inputs)
            print("target")
        # state_is_tuple=False keeps the LSTM state as a single tensor, which
        # is what allows the element-wise reset multiplication below.
        cell = LSTMCell(self.dim, initializer=self.init, state_is_tuple=False)
        self.state_size = cell.state_size
        actions = tf.to_float(action)
        h = cell.zero_state(1, tf.float32)
        print('h:', h)
        embedding = []
        for step in range(self.max_lenth):
            with tf.variable_scope("Lower/" + Scope, reuse=True):
                o, h = cell(vec[:, step, :], h)
            # Keep the output of the single batch element.
            embedding.append(o[0])
            # Scale the state by (1 - action): it becomes zero when the action at
            # this step is 1 (assuming actions are 0/1 -- TODO confirm).
            h = h * (1.0 - actions[0, step])

        #Upper network
        embedding = tf.stack(embedding)
        # Select the per-step outputs at the indices in action_pos.
        embedding = tf.gather(embedding, action_pos, name="Upper_input")
        with tf.variable_scope("Upper", reuse=True):
            # NOTE(review): the same `cell` object serves as both the forward and
            # the backward cell.
            out, _ = tf.nn.bidirectional_dynamic_rnn(cell,
                                                     cell,
                                                     embedding,
                                                     lenth_up,
                                                     dtype=tf.float32,
                                                     scope=Scope)

        if self.isAttention:
            # Attention pooling: concatenate both directions, score every step with
            # tanh(FC(out)) @ v_T, softmax the scores and take the weighted sum.
            out = tf.concat(out, 2)
            out = out[0, :, :]
            tmp = tflearn.fully_connected(out,
                                          self.dim,
                                          scope=Scope,
                                          name="att")
            tmp = tflearn.tanh(tmp)
            with tf.variable_scope(Scope):
                v_T = tf.get_variable("v_T",
                                      dtype=tf.float32,
                                      shape=[self.dim, 1],
                                      trainable=True)
            a = tflearn.softmax(tf.matmul(tmp, v_T))
            out = tf.reduce_sum(out * a, 0)
            # Restore the leading dimension of 1.
            out = tf.expand_dims(out, 0)
        else:
            #out = embedding[:, -1, :]
            # Forward output at the last step joined with the backward output at
            # the first step.
            out = tf.concat((out[0][:, -1, :], out[1][:, 0, :]), 1)

        out = tflearn.dropout(out, self.keep_prob)
        # Final layer with self.grained output units.
        out = tflearn.fully_connected(out,
                                      self.grained,
                                      scope=Scope + "/pred",
                                      name="get_pred")
        return inputs, action, action_pos, lenth, lenth_up, out
Exemplo n.º 8
0
def core_rnn_net(config, images_ph):
    """Unroll the recurrent attention core for ``num_glimpses`` steps.

    At every step the current location is passed to the glimpse network, the
    glimpse is fed to an LSTM cell, and the cell output is passed to the
    location network, whose sampled location becomes the next step's input.

    Args:
        config: provides ``"cell_dim"``, ``"num_glimpses"`` and ``"loc_dim"``
            through ``config.get`` and is forwarded to ``GlimpseNet`` / ``LocNet``.
        images_ph: image batch; documented by the original author as
            (batch_size, image_size, image_size, num_channels).

    Returns:
        outputs: list of length num_glimpses with the LSTM output of each step.
        loc_means: list of length num_glimpses with the first value returned
            by the location network at each step.
        loc_samples: list of length num_glimpses with the second value returned
            by the location network (the location used for the next step).
    """
    cell_dim = config.get("cell_dim", None)
    num_glimpses = config.get("num_glimpses", None)
    loc_dim = config.get("loc_dim", None)
    batch_size = tf.shape(images_ph)[0]

    with tf.variable_scope("glimpse_net", reuse=None):
        glimpse_net = GlimpseNet(config, images_ph)

    with tf.variable_scope("loc_net", reuse=None):
        loc_net = LocNet(config)

    # Core RNN, implemented with a plain Python loop: the location produced at
    # step t is fed back into the glimpse network at step t + 1, and the
    # per-step location means / samples are cached for the loss computation.

    # The first location is drawn uniformly from [-1, 1).
    current_loc = tf.random_uniform(
        shape=[batch_size, loc_dim], minval=-1, maxval=1)
    cell = LSTMCell(num_units=cell_dim)
    lstm_state = cell.zero_state(batch_size, dtype=tf.float32)

    outputs, loc_means, loc_samples = [], [], []
    for _ in range(num_glimpses):
        glimpse = glimpse_net(current_loc)
        hidden, lstm_state = cell(glimpse, lstm_state)
        loc_mean, current_loc = loc_net(hidden)

        outputs.append(hidden)
        loc_means.append(loc_mean)
        loc_samples.append(current_loc)

    return outputs, loc_means, loc_samples
Exemplo n.º 9
0
def stacked_bidirectional_rnn(num_units,
                              num_layers,
                              inputs,
                              seq_lengths,
                              batch_size,
                              is_train,
                              output_keep_prob,
                              reuse=False):
    """Stack ``num_layers`` bidirectional LSTM layers on top of ``inputs``.

    Each layer lives in its own ``"Layer<i>"`` variable scope and receives the
    concatenated forward/backward outputs of the previous layer.

    :param num_units: int, hidden units of every LSTM cell
    :param num_layers: int, the number of layers
    :param inputs: Tensor, the input sequence,
        shape: [batch_size, max_time_step, num_feature]
    :param seq_lengths: list or 1-D Tensor with one sequence length per batch
        element
    :param batch_size: int, used to build the zero initial states
    :param is_train: forwarded to ``SwitchableDropoutWrapper``
    :param output_keep_prob: forwarded to ``SwitchableDropoutWrapper`` as
        ``output_keep_prob``
    :param reuse: forwarded to ``tf.variable_scope`` for every layer
    :return: the concatenated forward/backward outputs of the last layer
    :raises ValueError: if ``inputs`` is not a rank-3 tensor

    Original author's note (translated): with
    ``tf.variable_scope(None, default_name="bidirectional-rnn")``, using
    ``default_name`` lets TF resolve naming conflicts automatically.
    """
    # TODO: add time_major parameter, and using batch_size = tf.shape(inputs)[0], and more assert

    rank = len(inputs.get_shape().as_list())
    if rank != 3:
        raise ValueError("the inputs must be 3-dimentional Tensor")

    layer_input = inputs
    for layer_idx in range(num_layers):
        with tf.variable_scope("Layer%d" % layer_idx, reuse=reuse):
            # Forward and backward cells, each wrapped with switchable dropout.
            fw_cell = LSTMCell(num_units)
            bw_cell = LSTMCell(num_units)
            fw_cell = SwitchableDropoutWrapper(
                fw_cell, is_train, output_keep_prob=output_keep_prob)
            bw_cell = SwitchableDropoutWrapper(
                bw_cell, is_train, output_keep_prob=output_keep_prob)

            fw_zero = fw_cell.zero_state(batch_size, dtype=tf.float32)
            bw_zero = bw_cell.zero_state(batch_size, dtype=tf.float32)

            # Only the per-step outputs are used; the final states are dropped.
            direction_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
                fw_cell,
                bw_cell,
                layer_input,
                sequence_length=seq_lengths,
                initial_state_fw=fw_zero,
                initial_state_bw=bw_zero,
                dtype=tf.float32)
            layer_input = tf.concat(direction_outputs, 2)

    return layer_input
    def __init__(self, is_testing):
        super().__init__()
        self.is_testing = is_testing

        print("Preparing data...")
        # Load and encode data (Disk -> Memory), see more details in encode_data()
        # Also see data_loader(), the next processing stage.
        self.train, self.valid, self.test, self.vocab = self.encode_data(bAbI('en-valid-10k'))

        print("Creating graph...")
        with tf.Graph().as_default(), tf.device('/cpu:0'):
            regularizer = layers.l2_regularizer(1e-4)  # regularizer applied to fully-connected network

            # allow_soft_placement=True: if cannot find specific device, allow tf to choose the device
            self.session = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
            self.global_step = tf.Variable(initial_value=0, trainable=False)
            self.optimizer = tf.train.AdamOptimizer(learning_rate=2e-4)

            self.facts_ph = tf.placeholder(tf.int32, shape=(None, None))  # (bs*#facts, seq)
            self.facts_pos_ph = tf.placeholder(tf.int32, shape=(None,))  # (bs*#facts, )
            self.question_ph = tf.placeholder(tf.int32, shape=(None, None))  # (bs, seq)
            self.answers_ph = tf.placeholder(tf.int32, shape=(None,))  # (bs, )
            self.edge_indices_ph = tf.placeholder(tf.int32, shape=(None, 2))
            self.fact_segments_ph = tf.placeholder(tf.int32, shape=(None,))
            self.edge_segments_ph = tf.placeholder(tf.int32, shape=(None,))
            self.q_seq_length_ph = tf.placeholder(tf.int32, shape=(None,))
            self.f_seq_length_ph = tf.placeholder(tf.int32, shape=(None,))
            self.task_indices_ph = tf.placeholder(tf.int32, shape=(None,))
            self.edge_keep_prob_ph = tf.placeholder(tf.float32, shape=())
            self.is_training_ph = tf.placeholder(tf.bool)

            # device: CPU:0
            placeholders = [self.facts_ph, self.facts_pos_ph, self.question_ph, self.answers_ph, self.edge_indices_ph,
                            self.fact_segments_ph, self.edge_segments_ph, self.q_seq_length_ph, self.f_seq_length_ph,
                            self.task_indices_ph, self.edge_keep_prob_ph]

            # each element of train_queue is a training batch
            self.train_queue = tf.FIFOQueue(self.qsize, [ph.dtype for ph in placeholders], name='train-queue')
            # each element of val_queue is a validation batch
            self.val_queue = tf.FIFOQueue(self.qsize, [ph.dtype for ph in placeholders], name='val-queue')

            self.train_enqueue_op = self.train_queue.enqueue(placeholders)
            self.train_qsize_op = self.train_queue.size()
            # record the size of the train_queue every batch
            tf.summary.scalar('queues/train', self.train_qsize_op)

            self.val_enqueue_op = self.val_queue.enqueue(placeholders)
            self.val_qsize_op = self.val_queue.size()
            # record the size of the val_queue every batch
            tf.summary.scalar('queues/val', self.val_qsize_op)

            def avg_n(x):
                return tf.reduce_mean(tf.stack(x, axis=0), axis=0)

            towers = []
            with tf.variable_scope(tf.get_variable_scope()):
                for device_nr, device in enumerate(self.devices):
                    with tf.device('/cpu:0'):
                        if self.is_testing:
                            facts_ph, facts_pos_ph, question_ph, answers_ph, edge_indices_ph, fact_segments_ph, edge_segments_ph, q_seq_length_ph, f_seq_length_ph, task_indices_ph, edge_keep_prob = placeholders
                        else:
                            facts_ph, facts_pos_ph, question_ph, answers_ph, edge_indices_ph, fact_segments_ph, edge_segments_ph, q_seq_length_ph, f_seq_length_ph, task_indices_ph, edge_keep_prob = tf.cond(
                                self.is_training_ph,
                                true_fn=lambda: self.train_queue.dequeue(),
                                false_fn=lambda: self.val_queue.dequeue(),
                            )
                            # device: CPU:0, CPU:0, CPU:0 (In a 3 GPU machine, these placeholders are in triplicate.)
                            vars = (facts_ph, facts_pos_ph, question_ph, answers_ph, edge_indices_ph, fact_segments_ph,
                                    edge_segments_ph, q_seq_length_ph, f_seq_length_ph, task_indices_ph, edge_keep_prob)

                            for v, ph in zip(vars, placeholders):
                                v.set_shape(ph.get_shape())
                        # device: CPU:0, CPU:0, CPU:0
                        facts_emb = layers.embed_sequence(facts_ph, self.vocab.size(), self.emb_size,
                                                          scope='word-embeddings')
                        # device: CPU:0, CPU:0, CPU:0
                        questions_emb = layers.embed_sequence(question_ph, self.vocab.size(), self.emb_size,
                                                              scope='word-embeddings', reuse=True)

                    with tf.device(device), tf.name_scope("device-%s" % device_nr):
                        # 4 layers FC
                        def mlp(x, scope, n_hidden):
                            with tf.variable_scope(scope):
                                for i in range(3):
                                    x = layers.fully_connected(x, n_hidden, weights_regularizer=regularizer)
                                return layers.fully_connected(x, n_hidden, weights_regularizer=regularizer,
                                                              activation_fn=None)

                        # get the final hidden state for the sentences(facts), f_encoding shape: (bs*#facts, state_size)
                        _, (_, f_encoding) = tf.nn.dynamic_rnn(tf.nn.rnn_cell.LSTMCell(32), facts_emb, dtype=tf.float32,
                                                               sequence_length=f_seq_length_ph, scope='fact-encoder')

                        # shape: (bs, ) (the same as answers_ph); each element is an integer drawn uniformly at
                        # random from [0, self.num_facts)
                        random_pos_offsets = tf.random_uniform(tf.shape(answers_ph), minval=0, maxval=self.num_facts,
                                                               dtype=tf.int32)
                        # Generate random offset. Note that for a specific task, the offset is the same.
                        fact_pos = facts_pos_ph + tf.gather(random_pos_offsets, fact_segments_ph)
                        # Considering the offset, the depth for the positional one-hot encoding should be 2*num_facts
                        facts_pos_encoding = tf.one_hot(fact_pos, 2 * self.num_facts)

                        # concatenate the encoding of content and position; device: GPU:0, GPU:1, GPU:2
                        f_encoding = tf.concat([f_encoding, facts_pos_encoding], axis=1)

                        # Need not to encode position for questions, just get the features of their content
                        # q_encoding shape: (bs, state_size); device: GPU:0, GPU:1, GPU: 2
                        _, (_, q_encoding) = tf.nn.dynamic_rnn(tf.nn.rnn_cell.LSTMCell(32), questions_emb,
                                                               dtype=tf.float32, sequence_length=q_seq_length_ph,
                                                               scope='question-encoder')

                        # MLP of 3 layers FC, used to process the output of a graph
                        # num output of last layer is vocab.size(), so as to get the logits
                        def graph_fn(x):
                            with tf.variable_scope('graph-fn'):
                                x = layers.fully_connected(x, self.n_hidden, weights_regularizer=regularizer)
                                x = layers.fully_connected(x, self.n_hidden, weights_regularizer=regularizer)
                                return layers.fully_connected(x, self.vocab.size(), activation_fn=None,
                                                              weights_regularizer=regularizer)

                        # concatenate the fact_encoding and the question_encoding
                        x = tf.concat([f_encoding, tf.gather(q_encoding, fact_segments_ph)], 1)

                        # x0 represents "fact embedding given the question"
                        # (by concatenate the question embedding with them)
                        # device: GPU:0, GPU:1, GPU:2
                        x0 = mlp(x, 'pre', self.n_hidden)

                        # generate the question encoding for every edge
                        # edge_features shape: (bs*(#facts**2), LSTM state_size)
                        edge_features = tf.gather(q_encoding, edge_segments_ph)

                        x = x0
                        outputs = []
                        log_losses = []
                        with tf.variable_scope('steps'):
                            lstm_cell = LSTMCell(self.n_hidden)
                            state = lstm_cell.zero_state(tf.shape(x)[0], tf.float32)

                            for step in range(self.n_steps):
                                x = message_passing(x, edge_indices_ph, edge_features,
                                                    lambda x: mlp(x, 'message-fn', self.n_hidden), edge_keep_prob)

                                x = mlp(tf.concat([x, x0], axis=1), 'post-fn', self.n_hidden)
                                # x=hidden state, state=<cell state, hidden state>
                                # device: (GPU:0)*5, (GPU:1)*5, (GPU:2)*5 (5 is the time step)
                                x, state = lstm_cell(x, state)
                                with tf.variable_scope('graph-sum'):
                                    # In every step, get the sum of output vectors of Nodes for every task(Graph)
                                    # i.e. graph_sum shape: (bs, n_hidden)
                                    graph_sum = tf.segment_sum(x, fact_segments_ph)
                                    out = graph_fn(graph_sum)  # shape: (bs, vocab_size)
                                    outputs.append(out)
                                    # softmax loss, scalar Tensor
                                    log_loss=tf.reduce_mean(
                                        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=answers_ph, logits=out))
                                    # log_losses is a list of scalar Tensor, each one means the loss in a time step
                                    log_losses.append(log_loss)

                                # reuse the Variables in LSTM across different time step
                                tf.get_variable_scope().reuse_variables()
                        # scalar Tensor: the sum of all regularization loss terms
                        reg_loss = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
                        # avg_n(log_losses) gets the mean loss for every step, i.e. "loss" is a scalar Tensor
                        # device: GPU:0, GPU:1, GPU:2
                        loss = avg_n(log_losses) + reg_loss

                        # device: GPU:0, GPU:1, GPU:2
                        stat={
                            'loss': loss,  # scalar Tensor
                            'grads': self.optimizer.compute_gradients(loss),
                            'log_losses': tf.stack(log_losses),  # (n_steps, )
                            'answers': answers_ph,  # (batch_size, )
                            'outputs': tf.stack(outputs),  # (n_steps, batch_size, vocab_size)
                            'task_indices': task_indices_ph  # (batch_size, )
                        }
                        towers.append(stat)
                        print('line 159: ')
                        print('"' + tf.get_variable_scope().name + '"')
                        # reuse the Variables in embedding, encoder, and some MLPs across different device
                        tf.get_variable_scope().reuse_variables()

            # device of the following 4 vars is CPU:0
            self.loss = avg_n([t['loss'] for t in towers])
            self.out = tf.concat([t['outputs'] for t in towers], axis=1)
            self.answers = tf.concat([t['answers'] for t in towers], axis=0)
            self.task_indices = tf.concat([t['task_indices'] for t in towers], axis=0)

            tf.summary.scalar('losses/total', self.loss)
            tf.summary.scalar('losses/reg', reg_loss)
            log_losses = avg_n([t['log_losses'] for t in towers])
            for i in range(self.n_steps):
                tf.summary.scalar('steps/%d/losses/log' % i, log_losses[i])

            avg_gradients = util.average_gradients([t['grads'] for t in towers])

            # global_step increases by 1 after the gradient is updated
            self.train_step = self.optimizer.apply_gradients(avg_gradients, global_step=self.global_step)

            self.session.run(tf.global_variables_initializer())
            self.saver = tf.train.Saver()
            util.print_vars(tf.trainable_variables())

            self.train_writer = tf.summary.FileWriter('/tmp/tensorboard/bAbI/%s/train/%s' % (self.revision, self.name),
                                                      self.session.graph)
            self.test_writer = tf.summary.FileWriter('/tmp/tensorboard/bAbI/%s/test/%s' % (self.revision, self.name),
                                                     self.session.graph)

            self.summaries = tf.summary.merge_all()

        print("Starting data loaders...")
        train_mp_queue = mp.Manager().Queue(maxsize=self.qsize)
        val_mp_queue = mp.Manager().Queue(maxsize=self.qsize)

        # After loaded data from disk(done in the code `self.encode_data(bAbI('en-valid-10k'))`),
        # use 4+1=5 Processes to construct batches and encode them, then enqueue them onto corresponding queue.
        # see more details in random_batch() and encode_batch()
        data_loader_processes = [mp.Process(target=self.data_loader, args=(train_mp_queue, True)) for i in range(4)]
        val_data_loader_processes = [mp.Process(target=self.data_loader, args=(val_mp_queue, False)) for i in range(1)]

        # start the processes
        for p in data_loader_processes + val_data_loader_processes:
            p.daemon = True
            p.start()

        # Use 2 threads to transfer data from train_mp_queue(val_mp_queue) to train_queue(val_queue).
        # Note that batch in train_mp_queue is ndarray of numpy,
        # and these two thread change every batch into Tensors and enqueue it onto train_queue.
        # see the placeholders defined before for the format of each batch.
        queue_putter_threads = [
            threading.Thread(target=self.queue_putter, args=(train_mp_queue, self.train_enqueue_op, 'train', 1000)),
            threading.Thread(target=self.queue_putter, args=(val_mp_queue, self.val_enqueue_op, 'val', 1)),
        ]
        # start data transferring
        for t in queue_putter_threads:
            t.daemon = True
            t.start()

        train_qsize, val_qsize = 0, 0
        print("Waiting for queue to fill...")
        while train_qsize < self.qsize or val_qsize < self.qsize:
            # update the size of the queues of training and validation
            train_qsize = self.session.run(self.train_qsize_op)
            val_qsize = self.session.run(self.val_qsize_op)
            print('train_qsize: %d, val_qsize: %d' % (train_qsize, val_qsize), flush=True)
            time.sleep(1)
Exemplo n.º 11
0
    def __init__(self, is_testing):
        """Build the TensorFlow graph, session and summary writers for the sudoku model.

        Encodes the sudoku data set, creates the session and optimizer, builds
        one model "tower" per entry in ``self.devices`` (towers share their
        variables), averages the tower losses and gradients, and finally sets up
        the training op, saver, summaries and TensorBoard writers.

        Reads ``self.revision``, ``self.message``, ``self.edges``,
        ``self.batch_size``, ``self.emb_size``, ``self.devices``,
        ``self.edge_keep_prob``, ``self.n_hidden`` and ``self.n_steps``, none of
        which are assigned here (presumably class-level attributes -- confirm
        against the class definition).

        Args:
            is_testing: When truthy, every tower reads from ``self.test`` with an
                edge keep probability of 1.0; otherwise the ``self.mode``
                placeholder selects between ``self.train`` and ``self.valid``
                at run time.

        Raises:
            ValueError: If ``self.edges`` is neither ``'sudoku'`` nor ``'full'``.
        """
        super().__init__()
        self.is_testing = is_testing
        # Everything is built in a fresh graph; ops default to the CPU unless a
        # tower-specific device scope below overrides it.
        with tf.Graph().as_default(), tf.device('/cpu:0'):
            # L2 weight regularizer passed to the MLP layers defined in mlp().
            regularizer = layers.l2_regularizer(1e-4)
            self.name = "%s %s" % (self.revision, self.message)
            self.train, self.valid, self.test = self.encode_data(sudoku())

            print("Building graph...")
            self.session = tf.Session(config=tf.ConfigProto(
                allow_soft_placement=True))
            self.global_step = tf.Variable(initial_value=0, trainable=False)
            self.optimizer = tf.train.AdamOptimizer(learning_rate=2e-4)

            # String placeholder; the value "train" selects the training
            # iterator, anything else the validation iterator (see tf.cond below).
            self.mode = tf.placeholder(tf.string)

            # Edge list over the 81 cells of a single puzzle.
            if self.edges == 'sudoku':
                edges = self.sudoku_edges()
            elif self.edges == 'full':
                # Fully connected graph without self-loops.
                edges = [(i, j) for i in range(81) for j in range(81)
                         if not i == j]
            else:
                raise ValueError('edges must be sudoku or full')

            # Replicate the edge list for every puzzle in the batch, offsetting
            # node ids by b * 81 so each puzzle addresses its own 81 rows in the
            # flattened (bs * 81, ...) node tensor built further down.
            edge_indices = tf.constant([(i + (b * 81), j + (b * 81))
                                        for b in range(self.batch_size)
                                        for i, j in edges], tf.int32)
            n_edges = tf.shape(edge_indices)[0]
            # All-zero edge features of width 1, one row per edge.
            edge_features = tf.zeros((n_edges, 1), tf.float32)
            # (row, col) index of each of the 81 cells, repeated per batch entry.
            positions = tf.constant([[(i, j) for i in range(9)
                                      for j in range(9)]
                                     for b in range(self.batch_size)],
                                    tf.int32)  # (bs, 81, 2)
            rows = layers.embed_sequence(positions[:, :, 0],
                                         9,
                                         self.emb_size,
                                         scope='row-embeddings',
                                         unique=True)  # bs, 81, emb_size
            cols = layers.embed_sequence(positions[:, :, 1],
                                         9,
                                         self.emb_size,
                                         scope='cols-embeddings',
                                         unique=True)  # bs, 81, emb_size

            def avg_n(x):
                """Return the element-wise mean of a list of same-shaped tensors."""
                return tf.reduce_mean(tf.stack(x, axis=0), axis=0)

            # One dict of tensors per device; see towers.append below for keys.
            towers = []
            with tf.variable_scope(tf.get_variable_scope()):
                for device_nr, device in enumerate(self.devices):
                    # Input pipeline and embeddings are placed on the CPU.
                    with tf.device('/cpu:0'):

                        if self.is_testing:
                            (quizzes, answers
                             ), edge_keep_prob = self.test.get_next(), 1.0
                        else:
                            # Training mode uses self.edge_keep_prob; validation
                            # always uses a keep probability of 1.0.
                            (quizzes, answers), edge_keep_prob = tf.cond(
                                tf.equal(self.mode, "train"),
                                true_fn=lambda:
                                (self.train.get_next(), self.edge_keep_prob),
                                false_fn=lambda: (self.valid.get_next(), 1.0))

                        # Embed the cell values (vocabulary of 10 ids).
                        x = layers.embed_sequence(
                            quizzes,
                            10,
                            self.emb_size,
                            scope='nr-embeddings',
                            unique=True)  # bs, 81, emb_size
                        # Per-cell feature: [value emb, row emb, col emb].
                        x = tf.concat([x, rows, cols], axis=2)
                        # Flatten to one row per node: (bs * 81, 3 * emb_size).
                        x = tf.reshape(x, (-1, 3 * self.emb_size))

                    with tf.device(device), tf.name_scope("device-%s" %
                                                          device_nr):

                        def mlp(x, scope):
                            """Apply four fully connected layers of width ``self.n_hidden``.

                            The first three use the layer's default activation;
                            the last one is linear (``activation_fn=None``).
                            Variables live under variable scope ``scope``.
                            """
                            with tf.variable_scope(scope):
                                for i in range(3):
                                    x = layers.fully_connected(
                                        x,
                                        self.n_hidden,
                                        weights_regularizer=regularizer)
                                return layers.fully_connected(
                                    x,
                                    self.n_hidden,
                                    weights_regularizer=regularizer,
                                    activation_fn=None)

                        x = mlp(x, 'pre-fn')
                        # Keep the initial node encoding; it is concatenated
                        # back in at every step.
                        x0 = x
                        n_nodes = tf.shape(x)[0]
                        outputs = []  # per-step logits
                        log_losses = []  # per-step scalar cross-entropy
                        with tf.variable_scope('steps'):
                            lstm_cell = LSTMCell(self.n_hidden)
                            state = lstm_cell.zero_state(n_nodes, tf.float32)

                            for step in range(self.n_steps):
                                # message_passing is defined elsewhere; it is
                                # given the node states, the edge list and
                                # features, the 'message-fn' MLP and the edge
                                # keep probability (presumably edge dropout --
                                # confirm against its definition).
                                x = message_passing(
                                    x, edge_indices, edge_features,
                                    lambda x: mlp(x, 'message-fn'),
                                    edge_keep_prob)
                                x = mlp(tf.concat([x, x0], axis=1), 'post-fn')
                                # The LSTM carries per-node state across steps.
                                x, state = lstm_cell(x, state)

                                with tf.variable_scope('graph-sum'):
                                    # Per-node logits over 10 classes, regrouped
                                    # per puzzle. This layer is created without
                                    # a weights_regularizer.
                                    out = tf.reshape(
                                        layers.fully_connected(
                                            x,
                                            num_outputs=10,
                                            activation_fn=None), (-1, 81, 10))
                                    outputs.append(out)
                                    # Mean cross-entropy over all cells: a scalar.
                                    log_losses.append(
                                        tf.reduce_mean(
                                            tf.nn.
                                            sparse_softmax_cross_entropy_with_logits(
                                                labels=answers, logits=out)))

                                # After the first step, reuse the variables under
                                # 'steps' so that all steps share weights.
                                tf.get_variable_scope().reuse_variables()

                        # Sum of every regularization loss registered in the
                        # graph so far.
                        reg_loss = sum(
                            tf.get_collection(
                                tf.GraphKeys.REGULARIZATION_LOSSES))
                        # Mean of the per-step losses plus the regularization term.
                        loss = avg_n(log_losses) + reg_loss

                        towers.append({
                            'loss':
                            loss,
                            'grads':
                            self.optimizer.compute_gradients(loss),
                            'log_losses':
                            tf.stack(log_losses),  # (n_steps,)
                            'quizzes':
                            quizzes,  # (bs, 81) integer ids
                            'answers':
                            answers,  # (bs, 81) integer labels
                            'outputs':
                            tf.stack(outputs)  # n_steps, bs, 81, 10
                        })

                        # Later towers reuse the variables created by the first.
                        tf.get_variable_scope().reuse_variables()

            # Combine the towers: average the losses, concatenate along batch.
            self.loss = avg_n([t['loss'] for t in towers])
            self.out = tf.concat([t['outputs'] for t in towers],
                                 axis=1)  # n_steps, bs, 81, 10
            # Predicted class per cell and step: (n_steps, bs, 81).
            self.predicted = tf.cast(tf.argmax(self.out, axis=3), tf.int32)
            self.answers = tf.concat([t['answers'] for t in towers], axis=0)
            self.quizzes = tf.concat([t['quizzes'] for t in towers], axis=0)

            tf.summary.scalar('losses/total', self.loss)
            # NOTE: reg_loss is the tensor left over from the last tower built
            # in the loop above.
            tf.summary.scalar('losses/reg', reg_loss)
            log_losses = avg_n([t['log_losses'] for t in towers])

            for step in range(self.n_steps):
                # Boolean (bs, 81): which cells are predicted correctly.
                equal = tf.equal(self.answers, self.predicted[step])

                # Fraction of correctly predicted cells.
                digit_acc = tf.reduce_mean(tf.to_float(equal))
                tf.summary.scalar('steps/%d/digit-acc' % step, digit_acc)

                # Fraction of puzzles with all 81 cells correct.
                puzzle_acc = tf.reduce_mean(
                    tf.to_float(tf.reduce_all(equal, axis=1)))
                tf.summary.scalar('steps/%d/puzzle-acc' % step, puzzle_acc)

                tf.summary.scalar('steps/%d/losses/log' % step,
                                  log_losses[step])

            # Average the per-tower gradients and apply them once;
            # global_step is passed to apply_gradients so it advances per update.
            avg_gradients = util.average_gradients(
                [t['grads'] for t in towers])
            self.train_step = self.optimizer.apply_gradients(
                avg_gradients, global_step=self.global_step)

            self.session.run(tf.global_variables_initializer())
            self.saver = tf.train.Saver()
            util.print_vars(tf.trainable_variables())

            self.train_writer = tf.summary.FileWriter(
                '/tmp/tensorboard/sudoku/%s/train/%s' %
                (self.revision, self.name), self.session.graph)
            self.test_writer = tf.summary.FileWriter(
                '/tmp/tensorboard/sudoku/%s/test/%s' %
                (self.revision, self.name), self.session.graph)
            self.summaries = tf.summary.merge_all()
Exemplo n.º 12
0
def startLstm(epochs=10, saveResult=True):
	"""Train a word-level LSTM language model and report train/test perplexity.

	The word-id streams returned by ``loadWordIdsFromFiles`` are each split
	into ``batchSz`` contiguous rows. Every step feeds one ``winSz``-wide
	window of each row, with the same window shifted by one word as target.
	The model is embedding -> LSTM -> dense projection, trained with plain
	gradient descent on the summed cross-entropy.

	While ``win < testNumWin`` a test window is scored (no training op is
	run for it) right after each training step, so the test loss is measured
	on the weights as they evolve during the epoch.

	The LSTM state produced for one window is fed back as ``initialState``
	of the next window, so context is carried across windows within an
	epoch. Both the train and the test state are reset to zeros at the
	start of every epoch.

	Args:
		epochs: Number of passes over the training data; must be >= 1.
		saveResult: When true, ``save(sess, info)`` is called with the
			hyper-parameters and the final perplexities.

	Raises:
		ValueError: If ``epochs`` < 1, or if either data set is too short
			to fill a single ``batchSz * winSz`` window.
	"""
	if epochs < 1:
		raise ValueError('epochs must be at least 1')

	trainData, _, testData, wordId = loadWordIdsFromFiles()
	# NOTE(review): ids are kept as float32 exactly as before, although they
	# are fed into int32 placeholders below -- confirm whether the loader's
	# output can be stored as int32 directly.
	trainData = np.array(trainData, np.float32)
	testData = np.array(testData, np.float32)
	vocabSz = len(wordId)

	learnRate = 0.001
	embedSz = 128
	rnnSz, batchSz, winSz = 512, 10, 5
	# Number of windows each row can supply; the "- 1" leaves room for the
	# shifted-by-one targets of the very last window.
	numWin = (trainData.shape[0] - 1) // (batchSz * winSz)
	# each batch row has winSz * numWin words
	batchLen = winSz * numWin

	testNumWin = (testData.shape[0] - 1) // (batchSz * winSz)
	testBatchLen = winSz * testNumWin

	if numWin < 1 or testNumWin < 1:
		raise ValueError('train and test data must each hold more than %d words' % (batchSz * winSz))

	# Test windows are only scored while the training loop is running, so at
	# most numWin of them are seen per epoch.
	testWinScored = min(numWin, testNumWin)

	inp = tf.placeholder(tf.int32, shape=[batchSz, winSz])
	ans = tf.placeholder(tf.int32, shape=[batchSz, winSz])

	E = tf.Variable(tf.random_normal([vocabSz, embedSz], stddev=0.1))
	embed = tf.nn.embedding_lookup(E, inp)

	rnn = LSTMCell(rnnSz)
	initialState = rnn.zero_state(batchSz, tf.float32)
	output, nextState = tf.nn.dynamic_rnn(rnn, embed, initial_state=initialState)

	W = tf.Variable(tf.random_normal([rnnSz, vocabSz], stddev=.1))
	B = tf.Variable(tf.random_normal([vocabSz], stddev=.1))
	# Contract the rnnSz axis: (batchSz, winSz, rnnSz) x (rnnSz, vocabSz).
	logits = tf.tensordot(output, W, [[2], [0]]) + B

	ents = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=ans)
	# Summed (not averaged) so the per-epoch total can be normalised by the
	# number of scored tokens when computing perplexity.
	loss = tf.reduce_sum(ents)
	train = tf.train.GradientDescentOptimizer(learnRate).minimize(loss)

	def windowBatch(data, rowLen, start):
		"""Return (inputs, targets) for the window at ``start`` in every row."""
		stop = start + winSz
		inputs = np.array([data[row * rowLen + start:row * rowLen + stop] for row in range(batchSz)])
		targets = np.array([data[row * rowLen + start + 1:row * rowLen + stop + 1] for row in range(batchSz)])
		return inputs, targets

	trainPerp = np.zeros(epochs, dtype=np.float32)
	testPerp = np.zeros(epochs, dtype=np.float32)
	with tf.Session() as sess:
		startTime = time.time()
		sess.run(tf.global_variables_initializer())
		print('epoch:', end=' ')
		for epoch in range(epochs):
			# Start every epoch from the all-zero LSTM state.
			inState = sess.run(initialState)
			testState = sess.run(initialState)
			for win in range(numWin):
				winStart = win * winSz
				inInp, inAns = windowBatch(trainData, batchLen, winStart)
				# Feed the previous window's final state into initialState.
				# Feeding nextState (the output tensor) would only echo the
				# fed value back and leave the LSTM starting from zeros.
				_, inState, outLoss = sess.run([train, nextState, loss], {inp: inInp, ans: inAns, initialState: inState})
				trainPerp[epoch] += outLoss
				if win < testNumWin:
					inInp, inAns = windowBatch(testData, testBatchLen, winStart)
					testState, testOutLoss = sess.run([nextState, loss], {inp: inInp, ans: inAns, initialState: testState})
					testPerp[epoch] += testOutLoss
			print(epoch + 1, end=' ')
		# Perplexity = exp(total cross-entropy / number of scored tokens).
		trainPerp = np.exp(trainPerp / (numWin * batchSz * winSz))
		testPerp = np.exp(testPerp / (testWinScored * batchSz * winSz))
		print(f'\nelapsed: {time.time() - startTime}')
		print('train perplexity:', trainPerp[-1])
		print('test perplexity:', testPerp[-1])

		info = {
			'style': 'lstm', 'batch size': batchSz, 'embed size': embedSz, 'rnn size': rnnSz, 'win size': winSz,
			'learning rate': learnRate, 'epochs': epochs, 'train perplexity': trainPerp[-1], 'test perplexity': testPerp[-1],
		}
		if saveResult:
			save(sess, info)
	drawPerplexity(trainPerp, testPerp)