示例#1
0
文件: lstm.py 项目: JoyceYa/edward
def lstm_cell(x, h, c, name=None, reuse=False):
  """Run a single LSTM step and return the updated hidden and cell states.

  Args:
    x: Input tensor for this timestep; its last static dimension sizes the
      input kernel.
    h: Previous hidden state; its last static dimension is the unit count.
    c: Previous cell state.
    name: Optional variable-scope name; "lstm" is the default name.
    reuse: Passed to `tf.variable_scope` to control variable reuse.

  Returns:
    A tuple `(h, c)` holding the new hidden state and the new cell state.
  """
  input_dim = x.shape[-1].value
  num_units = h.shape[-1].value
  # One kernel column block per gate: input, forget, output, candidate.
  gate_width = num_units * 4

  with tf.variable_scope(name, default_name="lstm",
                         values=[x, h, c], reuse=reuse):
    input_kernel = tf.get_variable(
        "kernel/input", [input_dim, gate_width],
        dtype=tf.float32,
        initializer=tf.orthogonal_initializer(1.0))
    hidden_kernel = tf.get_variable(
        "kernel/hidden", [num_units, gate_width],
        dtype=tf.float32,
        initializer=tf.orthogonal_initializer(1.0))
    bias = tf.get_variable(
        "bias", [gate_width],
        dtype=tf.float32,
        initializer=tf.constant_initializer(0.0))

  pre_activation = (tf.matmul(x, input_kernel)
                    + tf.matmul(h, hidden_kernel)
                    + bias)
  in_gate, forget_gate, out_gate, candidate = tf.split(
      pre_activation, 4, axis=1)

  in_gate = tf.sigmoid(in_gate)
  # The constant 1.0 is added to the forget pre-activation before the sigmoid.
  forget_gate = tf.sigmoid(forget_gate + 1.0)
  out_gate = tf.sigmoid(out_gate)
  candidate = tf.tanh(candidate)

  next_c = forget_gate * c + in_gate * candidate
  next_h = out_gate * tf.tanh(next_c)
  return next_h, next_c
示例#2
0
 def testGain(self):
   """Checks that `gain` only rescales the orthogonal initializer's output.

   Two initializers with the same seed are evaluated in separate graphs, one
   with the default gain and one with gain 3.14; dividing the second result
   by 3.14 must reproduce the first, for both float32 and float64.
   """
   shape = (10, 10)
   for dtype in [tf.float32, tf.float64]:
     init1 = tf.orthogonal_initializer(seed=1, dtype=dtype)
     init2 = tf.orthogonal_initializer(gain=3.14, seed=1, dtype=dtype)
     with self.test_session(graph=tf.Graph(), use_gpu=True):
       t1 = init1(shape).eval()
     with self.test_session(graph=tf.Graph(), use_gpu=True):
       t2 = init2(shape).eval()
     # Assert instead of returning: a `return` here ended the test after the
     # float32 iteration and discarded the comparison result, so the test
     # could never fail. The default tolerances are used because 1e-15 is
     # below float32 resolution.
     self.assertAllClose(t1, t2 / 3.14)
示例#3
0
文件: gan.py 项目: my-wiki/my-demo
 def define_generator(
         self, z, out_dim=2, num_hidden_neuron=256, num_layers=2):
     """Build the generator network inside the 'generator' variable scope.

     Args:
         z: Input tensor fed to the first hidden layer.
         out_dim: Number of units of the final linear layer.
         num_hidden_neuron: Number of units of every hidden layer.
         num_layers: Number of hidden layers stacked before the output.

     Returns:
         The output tensor of the final (activation-free) layer.
     """
     with tf.variable_scope('generator'):
         activations = z
         # Hidden stack: leaky-ReLU layers with orthogonal weights (gain 1.4).
         for _ in range(num_layers):
             activations = fully_connected(
                 activations,
                 num_hidden_neuron,
                 activation_fn=self.leakyrelu,
                 weights_initializer=tf.orthogonal_initializer(gain=1.4))
         # Linear read-out layer.
         generated = fully_connected(
             activations,
             out_dim,
             activation_fn=None,
             weights_initializer=tf.orthogonal_initializer(gain=1.4))
     return generated
示例#4
0
def recurrent_layer(tensor, cell=None, hidden_dims=128, sequence_length=None, decoder_fn=None,
                    activation=tf.nn.tanh, initializer=tf.orthogonal_initializer(), initial_state=None,
                    keep_prob=1.0,
                    return_final_state=False, return_next_cell_input=True, **opts):
    """Run a recurrent cell over `tensor`, optionally as a seq2seq decoder.

    Args:
        tensor: Inputs passed to `tf.nn.dynamic_rnn` (ignored when
            `decoder_fn` is given, where `inputs=None` is used).
        cell: RNN cell to run; a `BasicRNNCell` with `hidden_dims` units and
            `activation` is created when this is None.
        hidden_dims: Unit count of the default cell.
        sequence_length: Forwarded to the dynamic RNN / decoder call.
        decoder_fn: When not None, `seq2seq.dynamic_rnn_decoder` is used
            instead of `tf.nn.dynamic_rnn`.
        activation: Activation of the default cell.
        initializer: Accepted for interface compatibility; not read here.
        initial_state: Initial state for `tf.nn.dynamic_rnn` (not used on
            the decoder path).
        keep_prob: Values below 1.0 wrap the cell in a `DropoutWrapper`.
        return_final_state: Return the final state instead of the outputs.
        return_next_cell_input: Accepted for interface compatibility; not
            read here.
        **opts: If a truthy "name" entry is present, the cell is added to
            the graph collection of that name.

    Returns:
        The final state when `return_final_state` is true, otherwise the
        per-step outputs.
    """
    rnn_cell = cell
    if rnn_cell is None:
        rnn_cell = tf.contrib.rnn.BasicRNNCell(hidden_dims, activation=activation)

    if keep_prob < 1.0:
        # The same keep probability is applied to cell inputs and outputs.
        dropout_keep = _global_keep_prob(keep_prob)
        rnn_cell = tf.contrib.rnn.DropoutWrapper(rnn_cell, dropout_keep, dropout_keep)

    collection_name = opts.get("name")
    if collection_name:
        tf.add_to_collection(collection_name, rnn_cell)

    if decoder_fn is not None:
        # TODO: turn off sequence_length?
        outputs, final_state, _ = seq2seq.dynamic_rnn_decoder(
            rnn_cell, decoder_fn, inputs=None, sequence_length=sequence_length)
    else:
        outputs, final_state = tf.nn.dynamic_rnn(
            rnn_cell, tensor,
            sequence_length=sequence_length,
            initial_state=initial_state,
            dtype=tf.float32)

    return final_state if return_final_state else outputs
def make_tf_Linv(layer, V_shape, c_shape, lr, act=tf.nn.tanh):
  """Build the graph for layer-local training of the inverse weights V and c.

  The forward weights W and b are fed through placeholders; V and c are
  trained so that act(act(x W + b) V + c) reconstructs the input x.

  Args:
    layer: Layer index; used in the name scope and the summary names.
    V_shape: Shape of V. W is expected as [V_shape[1], V_shape[0]] and the
      input as [None, V_shape[1]].
    c_shape: Shape of c.
    lr: Learning rate of the RMSProp optimizer.
    act: Activation function used in both directions.

  Returns:
    A tuple (training op, merged loss and gradient-norm summaries).
  """
  layer_tag = str(layer)
  with tf.name_scope('layer'+layer_tag+'_inv') as scope:

    # The scope string is prepended to the variable names explicitly.
    V = tf.get_variable(
        scope+'V', shape=V_shape, dtype=tf.float32,
        initializer=tf.orthogonal_initializer(0.95))
    c = tf.get_variable(
        scope+'c', shape=c_shape, dtype=tf.float32,
        initializer=tf.constant_initializer(0.))

    W = tf.placeholder(tf.float32, shape=[V_shape[1], V_shape[0]], name='W')
    b = tf.placeholder(tf.float32, shape=[1, V_shape[0]], name='b')
    x_0 = tf.placeholder(tf.float32, shape=[None, V_shape[1]], name='input')

    forward = act(tf.matmul(x_0, W) + b)
    reconstruction = act(tf.matmul(forward, V) + c)
    loss = 0.5*tf.reduce_mean((reconstruction - x_0)**2, name='loss')

    loss_summary = tf.summary.scalar('log_loss'+layer_tag, tf.log(loss))
    # The histograms are created but are not part of the merged summary
    # returned below.
    tf.summary.histogram('V'+layer_tag, V)
    tf.summary.histogram('c'+layer_tag, c)

    optimizer = tf.train.RMSPropOptimizer(lr)
    grads_and_vars = optimizer.compute_gradients(loss, var_list=[V, c])

    # var.name looks like 'namescope/V:0'; index -3 picks the single-letter
    # variable name for the summary tag.
    grad_summaries = []
    for grad, var in grads_and_vars:
      grad_summaries.append(
          tf.summary.scalar('norm_grad'+var.name[-3], tf.nn.l2_loss(grad)))

    clipped = []
    for grad, var in grads_and_vars:
      clipped.append((tf.clip_by_norm(grad, 100.), var))

    train_op = optimizer.apply_gradients(clipped)
    return train_op, tf.summary.merge([loss_summary] + grad_summaries)
示例#6
0
    def __init__(self, params=params, dyn='FCC'):
        """Build the GAN graph, its gradient fields and a session.

        Args:
            params: Mapping of hyperparameters. The keys read here are
                'batch_size', 'z_dim', 'x_dim', 'gen_learning_rate',
                'disc_learning_rate', 'findiff_step' and 'gamma'. The
                default is the `params` object that was in scope when this
                method was defined.
            dyn: Name of the dynamics. 'FCC' binds `self.F` to `self.FCC`;
                any other value binds it to `self._F`.
        """
        # Start from a fresh default graph.
        tf.reset_default_graph()

        # Real samples come from the class's own sampler.
        data = self.sample_mog(params['batch_size'])

        # Latent noise: standard normal with 'z_dim' components per sample.
        noise = ds.Normal(tf.zeros(params['z_dim']), 
                          tf.ones(params['z_dim'])).sample(params['batch_size'])
        # Construct generator and discriminator nets; every
        # slim.fully_connected layer created inside the scope gets an
        # orthogonal weight initializer with gain 1.4.
        with slim.arg_scope([slim.fully_connected], weights_initializer=tf.orthogonal_initializer(gain=1.4)):
            samples = self.generator(noise, output_dim=params['x_dim'])
            real_score = self.discriminator(data)
            # Second discriminator call shares variables with the first.
            fake_score = self.discriminator(samples, reuse=True)
            
        # Saddle objective: cross-entropy of real scores against ones plus
        # cross-entropy of fake scores against zeros.
        loss = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(logits=real_score, labels=tf.ones_like(real_score)) +
            tf.nn.sigmoid_cross_entropy_with_logits(logits=fake_score, labels=tf.zeros_like(fake_score)))

        # Trainable variables are looked up by the "generator" and
        # "discriminator" scope names.
        gen_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "generator")
        disc_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "discriminator")
        gen_shapes = [tuple(v.get_shape().as_list()) for v in gen_vars]
        disc_shapes = [tuple(v.get_shape().as_list()) for v in disc_vars]

        # Generator gradient (taken on -loss, i.e. the generator ascends the loss)
        g_opt = tf.train.GradientDescentOptimizer(learning_rate=params['gen_learning_rate'])
        g_grads = g_opt.compute_gradients(-loss, var_list=gen_vars)

        # Discriminator gradient
        d_opt = tf.train.GradientDescentOptimizer(learning_rate=params['disc_learning_rate'])
        d_grads = d_opt.compute_gradients(loss, var_list=disc_vars)

        # Squared Norm of Gradient: d/dx 1/2||F||^2 = J^T F
        # (g[0] is the gradient tensor of each (gradient, variable) pair.)
        grads_norm_sep = [tf.reduce_sum(g[0]**2) for g in g_grads+d_grads]
        grads_norm = 0.5*tf.reduce_sum(grads_norm_sep)

        # Gradient of Squared Norm
        JTF = tf.gradients(grads_norm, xs=gen_vars+disc_vars)

        # The session is created and the variables initialized right away.
        sess = tf.Session()
        sess.run(tf.global_variables_initializer())

        # Keep every graph handle on the instance.
        self.params = params
        self.data = data
        self.samples = samples
        self.gen_vars = gen_vars
        self.disc_vars = disc_vars
        self.gen_shapes = gen_shapes
        self.disc_shapes = disc_shapes
        self.Fg = g_grads
        self.Fd = d_grads
        self.JTF = JTF
        self.sess = sess
        self.findiff_step = params['findiff_step']
        self.gamma = params['gamma']
        self.dyn = dyn

        # Select the vector-field implementation exposed as self.F.
        if dyn == 'FCC':
            self.F = self.FCC
        else:
            self.F = self._F
def conv_layer(inputs, filters, kernel_size, strides, gain=1.0):
    """Apply a ReLU 2-D convolution with an orthogonally initialized kernel.

    Args:
        inputs: Input tensor for `tf.layers.conv2d`.
        filters: Number of output filters.
        kernel_size: Kernel size forwarded to `tf.layers.conv2d`.
        strides: Single stride value used for both spatial dimensions.
        gain: Gain of the orthogonal kernel initializer.

    Returns:
        The output tensor of the convolution.
    """
    stride_pair = (strides, strides)
    kernel_init = tf.orthogonal_initializer(gain=gain)
    return tf.layers.conv2d(
        inputs=inputs,
        filters=filters,
        kernel_size=kernel_size,
        strides=stride_pair,
        activation=tf.nn.relu,
        kernel_initializer=kernel_init)
示例#8
0
def get_variable_initializer(hparams):
  """Return the variable initializer selected by `hparams.initializer`.

  Args:
    hparams: Object exposing `initializer` (a name) and `initializer_gain`.

  Returns:
    None when `hparams.initializer` is falsy, otherwise the TensorFlow
    initializer matching the name.

  Raises:
    ValueError: If the initializer name is not recognized.
  """
  name = hparams.initializer
  if not name:
    return None

  mlperf_log.transformer_print(key=mlperf_log.MODEL_HP_INITIALIZER_GAIN,
                               value=hparams.initializer_gain,
                               hparams=hparams)

  # The choice is only logged outside eager mode.
  if not tf.contrib.eager.in_eager_mode():
    tf.logging.info("Using variable initializer: %s", name)

  if name == "orthogonal":
    return tf.orthogonal_initializer(gain=hparams.initializer_gain)
  if name == "uniform":
    bound = 0.1 * hparams.initializer_gain
    return tf.random_uniform_initializer(-bound, bound)
  if name == "normal_unit_scaling":
    return tf.variance_scaling_initializer(
        hparams.initializer_gain, mode="fan_avg", distribution="normal")
  if name == "uniform_unit_scaling":
    return tf.variance_scaling_initializer(
        hparams.initializer_gain, mode="fan_avg", distribution="uniform")
  if name == "xavier":
    return tf.contrib.layers.xavier_initializer()
  raise ValueError("Unrecognized initializer: %s" % name)
示例#9
0
文件: gan.py 项目: my-wiki/my-demo
    def define_discriminator(
            self, x, num_hidden_neuron=256, num_layers=2, reuse=False):
        """Build the discriminator inside the 'discriminator' variable scope.

        The network ends in a single logit; the sigmoid of that logit is
        returned alongside it (real = 1, fake = 0).

        Args:
            x: Input tensor to classify.
            num_hidden_neuron: Number of units of every hidden layer.
            num_layers: Number of hidden layers.
            reuse: When true, the scope's existing variables are reused.

        Returns:
            A tuple `(logit, sigmoid(logit))`.
        """
        with tf.variable_scope('discriminator') as disc_scope:
            if reuse:
                disc_scope.reuse_variables()

            features = x
            for _ in range(num_layers):
                features = fully_connected(
                    features,
                    num_hidden_neuron,
                    activation_fn=self.leakyrelu,
                    weights_initializer=tf.orthogonal_initializer(gain=1.4))
            # Logistic-regression style output: one unit, no activation.
            logit = fully_connected(
                features,
                1,
                activation_fn=None,
                weights_initializer=tf.orthogonal_initializer(gain=1.4))
        probability = tf.nn.sigmoid(logit)
        return logit, probability
示例#10
0
def define_graph(glove_embeddings_arr):
    """
    Define the tensorflow graph that forms the model: an embedding lookup,
    a stacked GRU with output dropout, an attention layer and a 2-unit
    fully connected output.

    The input placeholder is of size [batch_size, 40], as each review is
    restricted to its first 40 words. The following naming convention is
    used:
        Input placeholder: name="input_data"
        labels placeholder: name="labels"
        accuracy tensor: name="accuracy"
        loss tensor: name="loss"

    ARGS: glove_embeddings_arr - array converted to a float32 tensor and
    used as the initial value of the embedding variable.

    RETURN: input placeholder, labels placeholder, dropout_keep_prob, optimizer, accuracy and loss
    tensors"""

    # Input data
    input_data = tf.placeholder(tf.int32,(batch_size, 40),name='input_data') # shape (batch_size, 40)
    labels = tf.placeholder(tf.float32,(batch_size, 2),name='labels') # shape (batch_size, 2)

    # Keep probability as a scalar placeholder that falls back to 0.5 when
    # nothing is fed, so callers only need to feed it to override dropout.
    dropout_keep_prob = tf.placeholder_with_default(0.5, shape=())

    # Embedding: the variable is initialized from the supplied array
    embedding = tf.Variable(tf.convert_to_tensor(glove_embeddings_arr, dtype=tf.float32))
    embed = tf.nn.embedding_lookup(embedding,input_data)

    # rnn_cell: a GRU cell wrapped with dropout on its outputs
    def rnn_cell():
        gru = tf.contrib.rnn.GRUCell(rnn_size)
        drop = tf.contrib.rnn.DropoutWrapper(gru, output_keep_prob = dropout_keep_prob)
        return drop
    
    # Stack of rnn_layers GRU cells; variables created in this scope default
    # to the orthogonal initializer
    with tf.variable_scope('init_name', initializer=tf.orthogonal_initializer()):
        cell = tf.contrib.rnn.MultiRNNCell([rnn_cell() for _ in range(rnn_layers)])
        outputs, final_state = tf.nn.dynamic_rnn(cell, embed, dtype = "float32")

    # Attention layer (helper defined elsewhere in the file)
    attention_output = attention(outputs, attention_size)
    
    # Fully connected layer mapping the attention output to 2 logits
    W = tf.Variable(tf.truncated_normal([attention_output.get_shape()[1].value, 2], stddev=0.1)) # (attention width, 2)
    b = tf.Variable(tf.constant(0., shape=[2])) # (2,)
    logits = tf.nn.xw_plus_b(attention_output, W, b)
    logits = tf.squeeze(logits)

    # compute cross entropy (element-wise sigmoid cross entropy, averaged)
    loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=labels), name = "loss")
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss)
    # accuracy: mean element-wise agreement between rounded sigmoid outputs and labels
    accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.round(tf.sigmoid(logits)), labels), tf.float32), name = "accuracy")

    return input_data, labels, dropout_keep_prob, optimizer, accuracy, loss
示例#11
0
def _get_variable_initializer(hparams):
  """Map `hparams.initializer` to the matching TensorFlow initializer.

  Args:
    hparams: Object exposing `initializer` (a name) and `initializer_gain`.

  Returns:
    The initializer selected by the name.

  Raises:
    ValueError: If the initializer name is not recognized.
  """
  kind = hparams.initializer

  if kind == "orthogonal":
    return tf.orthogonal_initializer(gain=hparams.initializer_gain)

  if kind == "uniform":
    limit = 0.1 * hparams.initializer_gain
    return tf.random_uniform_initializer(-limit, limit)

  # Both "*_unit_scaling" names use fan-average variance scaling and differ
  # only in the sampling distribution.
  if kind == "normal_unit_scaling" or kind == "uniform_unit_scaling":
    distribution = "normal" if kind == "normal_unit_scaling" else "uniform"
    return tf.variance_scaling_initializer(
        hparams.initializer_gain, mode="fan_avg", distribution=distribution)

  raise ValueError("Unrecognized initializer: %s" % kind)
示例#12
0
 def testShapesValues(self):
   """Checks output shape and orthogonality of `tf.orthogonal_initializer`.

   Each result is flattened to 2-D (all leading axes merged); the Gram
   matrix taken over the smaller side must be the identity.
   """
   shapes = [(10, 10), (10, 9, 8), (100, 5, 5), (50, 40), (40, 50)]
   for dtype in [tf.float32, tf.float64]:
     for shape in shapes:
       init = tf.orthogonal_initializer(dtype=dtype)
       with self.test_session(graph=tf.Graph(), use_gpu=True):
         values = init(shape).eval()
         # Check the shape
         self.assertAllEqual(shape, values.shape)
         # Check orthogonality by computing the inner product
         flat = values.reshape((np.prod(values.shape[:-1]), values.shape[-1]))
         num_rows, num_cols = flat.shape
         if num_rows > num_cols:
           gram = np.dot(flat.T, flat)
           expected = np.eye(num_cols)
         else:
           gram = np.dot(flat, flat.T)
           expected = np.eye(num_rows)
         self.assertAllClose(gram, expected)
示例#13
0
def define_graph(glove_embeddings_arr):
    """Define the model graph: embedding, stacked GRU, attention, dense output.

    ARGS: glove_embeddings_arr - array converted to a float32 tensor and
    used as the initial value of the embedding variable.

    RETURN: input placeholder, labels placeholder, keep_prob placeholder,
    optimizer op, accuracy tensor and loss tensor.
    """

    # Input data
    input_data = tf.placeholder(tf.int32,(batch_size, 40),name='input_data') # shape (batch_size, 40)
    labels = tf.placeholder(tf.float32,(batch_size, 2),name='labels') # shape (batch_size, 2)
    # Dropout keep probability; has no default, so it must always be fed
    keep_prob = tf.placeholder(tf.float32,name='keep_prob')

    # Embedding
    embedding = tf.Variable(tf.convert_to_tensor(glove_embeddings_arr, dtype=tf.float32)) # note the data structure used here
    embed = tf.nn.embedding_lookup(embedding,input_data)

    # rnn_cell: a GRU cell wrapped with dropout on its outputs
    def rnn_cell():
        gru = tf.contrib.rnn.GRUCell(rnn_size)
        drop = tf.contrib.rnn.DropoutWrapper(gru, output_keep_prob = keep_prob)
        return drop

    # Stack of rnn_layers GRU cells; variables created in this scope default
    # to the orthogonal initializer
    with tf.variable_scope('init_name', initializer=tf.orthogonal_initializer()):
        cell = tf.contrib.rnn.MultiRNNCell([rnn_cell() for _ in range(rnn_layers)])
        outputs, final_state = tf.nn.dynamic_rnn(cell, embed, dtype = "float32")
    
    # Attention layer (helper defined elsewhere in the file)
    attention_output = attention(outputs, attention_size)
    
    # Fully connected layer mapping the attention output to 2 logits
    W = tf.Variable(tf.truncated_normal([attention_output.get_shape()[1].value, 2], stddev=0.1)) # (attention width, 2)
    b = tf.Variable(tf.constant(0., shape=[2])) # (2,)
    logits = tf.nn.xw_plus_b(attention_output, W, b)
    logits = tf.squeeze(logits)

    # compute cross entropy (element-wise sigmoid cross entropy, averaged)
    loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=labels), name = "loss")
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss)
    # accuracy: mean element-wise agreement between rounded sigmoid outputs and labels
    accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.round(tf.sigmoid(logits)), labels), tf.float32), name = "accuracy")

    return input_data, labels, keep_prob, optimizer, accuracy, loss
示例#14
0
def get_variable_initializer(hparams):
  """Get the variable initializer named by `hparams.initializer`.

  Args:
    hparams: Object exposing `initializer` (a name) and `initializer_gain`.

  Returns:
    None when `hparams.initializer` is falsy, otherwise the TensorFlow
    initializer matching the name.

  Raises:
    ValueError: If the initializer name is not recognized.
  """
  if not hparams.initializer:
    return None

  tf.logging.info("Using variable initializer: %s", hparams.initializer)

  def _fan_avg_scaling(distribution):
    # Variance scaling over the fan average, scaled by the configured gain.
    return tf.variance_scaling_initializer(
        hparams.initializer_gain, mode="fan_avg", distribution=distribution)

  choice = hparams.initializer
  if choice == "orthogonal":
    selected = tf.orthogonal_initializer(gain=hparams.initializer_gain)
  elif choice == "uniform":
    half_range = 0.1 * hparams.initializer_gain
    selected = tf.random_uniform_initializer(-half_range, half_range)
  elif choice == "normal_unit_scaling":
    selected = _fan_avg_scaling("normal")
  elif choice == "uniform_unit_scaling":
    selected = _fan_avg_scaling("uniform")
  else:
    raise ValueError("Unrecognized initializer: %s" % choice)
  return selected
示例#15
0
  def __init__(self, component):
    """Initializes weights and layers.

    Validates that the component has no fixed features and exactly the two
    linked features 'sources' and 'targets', then creates the arc weights,
    the source weights and the root vector.

    Args:
      component: Parent ComponentBuilderBase object.
    """
    super(BiaffineDigraphNetwork, self).__init__(component)

    num_fixed = len(self._fixed_feature_dims.items())
    check.Eq(num_fixed, 0, 'Expected no fixed features')
    num_linked = len(self._linked_feature_dims.items())
    check.Eq(num_linked, 2, 'Expected two linked features')

    for required_feature in ('sources', 'targets'):
      check.In(required_feature, self._linked_feature_dims,
               'Missing required linked feature')
    self._source_dim = self._linked_feature_dims['sources']
    self._target_dim = self._linked_feature_dims['targets']

    self._weights = []
    arc_weights = tf.get_variable(
        'weights_arc', [self._source_dim, self._target_dim], tf.float32,
        tf.orthogonal_initializer())
    source_weights = tf.get_variable(
        'weights_source', [self._source_dim], tf.float32,
        tf.zeros_initializer())
    root_vector = tf.get_variable(
        'root', [self._source_dim], tf.float32, tf.zeros_initializer())
    self._weights.extend([arc_weights, source_weights, root_vector])

    # All three variables are both parameters and regularized weights.
    self._params.extend(self._weights)
    self._regularized_weights.extend(self._weights)

    # Add runtime hooks for pre-computed weights.
    for hook in (self._get_root_weights, self._get_root_bias):
      self._derived_params.append(hook)

    # Negative Layer.dim indicates that the dimension is dynamic.
    adjacency_layer = network_units.Layer(component, 'adjacency', -1)
    self._layers.append(adjacency_layer)
def make_tf_L(layer, W_shape, b_shape, lr, act=tf.nn.tanh):
  """Build the graph for layer-local training of the forward weights W and b.

  TODO: implement initialization as input option.

  Args:
    layer (int): which layer; used in the name scope and the summary names.
    W_shape: shape of W. The input placeholder is [None, W_shape[0]] and the
      target placeholder is [None, W_shape[1]].
    b_shape: shape of b.
    lr: learning rate of the gradient-descent optimizer.
    act: activation function applied to x W + b.

  Returns:
    A tuple (training op, merged loss and gradient-norm summaries of this
    layer).
  """
  layer_tag = str(layer)
  with tf.name_scope('layer'+layer_tag+'_ff') as scope:

    # The scope string is prepended to the variable names explicitly.
    W = tf.get_variable(
        scope+'W', shape=W_shape, dtype=tf.float32,
        initializer=tf.orthogonal_initializer(0.95))
    b = tf.get_variable(
        scope+'b', shape=b_shape, dtype=tf.float32,
        initializer=tf.constant_initializer(0.))

    x_0 = tf.placeholder(tf.float32, shape=[None, W_shape[0]], name='input')
    y = tf.placeholder(tf.float32, shape=[None, W_shape[1]], name='output')

    prediction = act(tf.matmul(x_0, W) + b)
    loss = 0.5*tf.reduce_mean((prediction - y)**2, name='loss')

    loss_summary = tf.summary.scalar('log_loss'+layer_tag, tf.log(loss))
    # The histograms are created but are not part of the merged summary
    # returned below.
    tf.summary.histogram('W'+layer_tag, W)
    tf.summary.histogram('b'+layer_tag, b)

    # Plain gradient descent is used here; an RMSProp variant was tried
    # previously (original note: rmsprop works *way* better than adam for
    # local loss functions, unclear why).
    optimizer = tf.train.GradientDescentOptimizer(lr)
    grads_and_vars = optimizer.compute_gradients(loss, var_list=[W, b])

    # var.name looks like 'namescope/W:0'; index -3 picks the single-letter
    # variable name for the summary tag.
    grad_summaries = []
    for grad, var in grads_and_vars:
      grad_summaries.append(
          tf.summary.scalar('norm_grad'+var.name[-3], tf.nn.l2_loss(grad)))

    # Each gradient is clipped to a norm of at most 100.
    clipped = []
    for grad, var in grads_and_vars:
      clipped.append((tf.clip_by_norm(grad, 100.), var))

    train_op = optimizer.apply_gradients(clipped)
    return train_op, tf.summary.merge([loss_summary] + grad_summaries)
示例#17
0
 def create_gru_cell(self):
     """Return a GRU cell with `params.rnn_units` units and orthogonal kernels."""
     return tf.nn.rnn_cell.GRUCell(
         params.rnn_units,
         kernel_initializer=tf.orthogonal_initializer())
示例#18
0
 def get_rnn_cell():
     """Create an LSTM cell sized by `self.options['rnn_size']`.

     The cell uses tuple state and an orthogonal initializer.
     """
     lstm = tf.contrib.rnn.LSTMCell(
         num_units=self.options['rnn_size'],
         state_is_tuple=True,
         initializer=tf.orthogonal_initializer())
     return lstm
示例#19
0
def fc_layer(inputs, units, activation_fn=tf.nn.relu, gain=1.0):
    """Apply a dense layer whose kernel is orthogonally initialized.

    Args:
        inputs: Input tensor for `tf.layers.dense`.
        units: Number of output units.
        activation_fn: Activation passed to the layer.
        gain: Gain of the orthogonal kernel initializer.

    Returns:
        The output tensor of the dense layer.
    """
    kernel_init = tf.orthogonal_initializer(gain)
    return tf.layers.dense(
        inputs=inputs,
        units=units,
        activation=activation_fn,
        kernel_initializer=kernel_init)
 def cells(reuse=False):
     """Create an LSTM cell of `size_layer` units with orthogonal weights."""
     lstm = tf.nn.rnn_cell.LSTMCell(
         size_layer,
         initializer=tf.orthogonal_initializer(),
         reuse=reuse)
     return lstm
示例#21
0
    def build_train(self):

        # this line of code is just a message to inform that batch size should be set to 1 only
        batch_size = 1

        inputs = {}
        outputs = {}

        #******************** Define Proposal Module ******************#

        ## dim1: batch, dim2: video sequence length, dim3: video feature dimension
        ## video feature sequence

        # forward video feature sequence
        video_feat_fw = tf.placeholder(
            tf.float32, [None, None, self.options['video_feat_dim']],
            name='video_feat_fw')
        inputs['video_feat_fw'] = video_feat_fw

        # backward video feature sequence
        video_feat_bw = tf.placeholder(
            tf.float32, [None, None, self.options['video_feat_dim']],
            name='video_feat_bw')
        inputs['video_feat_bw'] = video_feat_bw

        ## proposal data, densely annotated, in forward direction
        proposal_fw = tf.placeholder(tf.int32,
                                     [None, None, self.options['num_anchors']],
                                     name='proposal_fw')
        inputs['proposal_fw'] = proposal_fw

        ## proposal data, densely annotated, in backward direction
        proposal_bw = tf.placeholder(tf.int32,
                                     [None, None, self.options['num_anchors']],
                                     name='proposal_bw')
        inputs['proposal_bw'] = proposal_bw

        ## proposal to feed into captioning module, i choose high tiou proposals for training captioning module, forward pass
        proposal_caption_fw = tf.placeholder(tf.int32, [None, None],
                                             name='proposal_caption_fw')
        inputs['proposal_caption_fw'] = proposal_caption_fw

        ## proposal to feed into captioning module, i choose high tiou proposals for training captioning module, backward pass
        proposal_caption_bw = tf.placeholder(tf.int32, [None, None],
                                             name='proposal_caption_bw')
        inputs['proposal_caption_bw'] = proposal_caption_bw

        ## weighting for positive/negative labels (solve imbalance data problem)
        proposal_weight = tf.placeholder(tf.float32,
                                         [self.options['num_anchors'], 2],
                                         name='proposal_weight')
        inputs['proposal_weight'] = proposal_weight

        rnn_cell_video_fw = tf.contrib.rnn.LSTMCell(
            num_units=self.options['rnn_size'],
            state_is_tuple=True,
            initializer=tf.orthogonal_initializer())
        rnn_cell_video_bw = tf.contrib.rnn.LSTMCell(
            num_units=self.options['rnn_size'],
            state_is_tuple=True,
            initializer=tf.orthogonal_initializer())

        if self.options['rnn_drop'] > 0:
            print('using dropout in rnn!')

        rnn_drop = tf.placeholder(tf.float32)
        inputs['rnn_drop'] = rnn_drop

        rnn_cell_video_fw = tf.contrib.rnn.DropoutWrapper(
            rnn_cell_video_fw,
            input_keep_prob=1.0 - rnn_drop,
            output_keep_prob=1.0 - rnn_drop)
        rnn_cell_video_bw = tf.contrib.rnn.DropoutWrapper(
            rnn_cell_video_bw,
            input_keep_prob=1.0 - rnn_drop,
            output_keep_prob=1.0 - rnn_drop)

        with tf.variable_scope('proposal_module') as proposal_scope:
            '''video feature sequence encoding: forward pass
            '''
            with tf.variable_scope('video_encoder_fw') as scope:
                #sequence_length = tf.reduce_sum(video_feat_mask, axis=-1)
                sequence_length = tf.expand_dims(tf.shape(video_feat_fw)[1],
                                                 axis=0)
                initial_state = rnn_cell_video_fw.zero_state(
                    batch_size=batch_size, dtype=tf.float32)

                rnn_outputs_fw, _ = tf.nn.dynamic_rnn(
                    cell=rnn_cell_video_fw,
                    inputs=video_feat_fw,
                    sequence_length=sequence_length,
                    initial_state=initial_state,
                    dtype=tf.float32)

            rnn_outputs_fw_reshape = tf.reshape(rnn_outputs_fw,
                                                [-1, self.options['rnn_size']],
                                                name='rnn_outputs_fw_reshape')

            # predict proposal at each time step: use fully connected layer to output scores for every anchors
            with tf.variable_scope('predict_proposal_fw') as scope:
                logit_output_fw = tf.contrib.layers.fully_connected(
                    inputs=rnn_outputs_fw_reshape,
                    num_outputs=self.options['num_anchors'],
                    activation_fn=None)
            '''video feature sequence encoding: backward pass
            '''
            with tf.variable_scope('video_encoder_bw') as scope:
                #sequence_length = tf.reduce_sum(video_feat_mask, axis=-1)
                sequence_length = tf.expand_dims(tf.shape(video_feat_bw)[1],
                                                 axis=0)
                initial_state = rnn_cell_video_bw.zero_state(
                    batch_size=batch_size, dtype=tf.float32)

                rnn_outputs_bw, _ = tf.nn.dynamic_rnn(
                    cell=rnn_cell_video_bw,
                    inputs=video_feat_bw,
                    sequence_length=sequence_length,
                    initial_state=initial_state,
                    dtype=tf.float32)

            rnn_outputs_bw_reshape = tf.reshape(rnn_outputs_bw,
                                                [-1, self.options['rnn_size']],
                                                name='rnn_outputs_bw_reshape')

            # predict proposal at each time step: use fully connected layer to output scores for every anchors
            with tf.variable_scope('predict_proposal_bw') as scope:
                logit_output_bw = tf.contrib.layers.fully_connected(
                    inputs=rnn_outputs_bw_reshape,
                    num_outputs=self.options['num_anchors'],
                    activation_fn=None)

        # calculate multi-label loss: use weighted binary cross entropy objective
        proposal_fw_reshape = tf.reshape(proposal_fw,
                                         [-1, self.options['num_anchors']],
                                         name='proposal_fw_reshape')
        proposal_fw_float = tf.to_float(proposal_fw_reshape)
        proposal_bw_reshape = tf.reshape(proposal_bw,
                                         [-1, self.options['num_anchors']],
                                         name='proposal_bw_reshape')
        proposal_bw_float = tf.to_float(proposal_bw_reshape)

        # weighting positive samples
        weight0 = tf.reshape(proposal_weight[:, 0],
                             [-1, self.options['num_anchors']])
        # weighting negative samples
        weight1 = tf.reshape(proposal_weight[:, 1],
                             [-1, self.options['num_anchors']])

        # tile weight batch_size times
        weight0 = tf.tile(weight0, [tf.shape(logit_output_fw)[0], 1])
        weight1 = tf.tile(weight1, [tf.shape(logit_output_fw)[0], 1])

        # get weighted sigmoid xentropy loss
        loss_term_fw = tf.nn.weighted_cross_entropy_with_logits(
            targets=proposal_fw_float,
            logits=logit_output_fw,
            pos_weight=weight0)
        loss_term_bw = tf.nn.weighted_cross_entropy_with_logits(
            targets=proposal_bw_float,
            logits=logit_output_bw,
            pos_weight=weight0)

        loss_term_fw_sum = tf.reduce_sum(loss_term_fw,
                                         axis=-1,
                                         name='loss_term_fw_sum')
        loss_term_bw_sum = tf.reduce_sum(loss_term_bw,
                                         axis=-1,
                                         name='loss_term_bw_sum')

        proposal_fw_loss = tf.reduce_sum(loss_term_fw_sum) / (
            float(self.options['num_anchors']) *
            tf.to_float(tf.shape(video_feat_fw)[1]))
        proposal_bw_loss = tf.reduce_sum(loss_term_bw_sum) / (
            float(self.options['num_anchors']) *
            tf.to_float(tf.shape(video_feat_bw)[1]))
        proposal_loss = (proposal_fw_loss + proposal_bw_loss) / 2.

        # summary data, for visualization using Tensorboard
        tf.summary.scalar('proposal_fw_loss', proposal_fw_loss)
        tf.summary.scalar('proposal_bw_loss', proposal_bw_loss)
        tf.summary.scalar('proposal_loss', proposal_loss)

        # outputs from proposal module
        outputs['proposal_fw_loss'] = proposal_fw_loss
        outputs['proposal_bw_loss'] = proposal_bw_loss
        outputs['proposal_loss'] = proposal_loss

        #*************** Define Captioning Module *****************#

        ## caption data: densely annotate sentences for each time step of a video, use mask data to mask out time steps when no caption should be output
        caption = tf.placeholder(tf.int32,
                                 [None, None, self.options['caption_seq_len']],
                                 name='caption')
        caption_mask = tf.placeholder(
            tf.int32, [None, None, self.options['caption_seq_len']],
            name='caption_mask')
        inputs['caption'] = caption
        inputs['caption_mask'] = caption_mask

        proposal_caption_fw_reshape = tf.reshape(
            proposal_caption_fw, [-1], name='proposal_caption_fw_reshape')
        proposal_caption_bw_reshape = tf.reshape(
            proposal_caption_bw, [-1], name='proposal_caption_bw_reshape')

        # use correct or 'nearly correct' proposal output as input to the captioning module
        boolean_mask = tf.greater(proposal_caption_fw_reshape,
                                  0,
                                  name='boolean_mask')

        # guarantee that at least one pos has True value
        boolean_mask = tf.cond(
            tf.equal(tf.reduce_sum(tf.to_int32(boolean_mask)), 0), lambda: tf.
            concat([boolean_mask[:-1], tf.constant([True])], axis=-1),
            lambda: boolean_mask)

        # select input video state
        feat_len = tf.shape(video_feat_fw)[1]
        forward_indices = tf.boolean_mask(tf.range(feat_len), boolean_mask)
        event_feats_fw = tf.boolean_mask(rnn_outputs_fw_reshape, boolean_mask)
        backward_indices = tf.boolean_mask(proposal_caption_bw_reshape,
                                           boolean_mask)
        event_feats_bw = tf.gather_nd(
            rnn_outputs_bw_reshape, tf.expand_dims(backward_indices, axis=-1))

        start_ids = feat_len - 1 - backward_indices
        end_ids = forward_indices

        event_c3d_seq, _ = self.get_c3d_seq(video_feat_fw[0], start_ids,
                                            end_ids,
                                            self.options['max_proposal_len'])
        context_feats_fw = tf.gather_nd(rnn_outputs_fw_reshape,
                                        tf.expand_dims(start_ids, axis=-1))
        context_feats_bw = tf.gather_nd(
            rnn_outputs_bw_reshape,
            tf.expand_dims(feat_len - 1 - end_ids, axis=-1))

        # proposal feature sequences
        proposal_feats = event_c3d_seq

        # corresponding caption ground truth (batch size  = 1)
        caption_proposed = tf.boolean_mask(caption[0],
                                           boolean_mask,
                                           name='caption_proposed')
        caption_mask_proposed = tf.boolean_mask(caption_mask[0],
                                                boolean_mask,
                                                name='caption_mask_proposed')

        # the number of proposal-caption pairs for training
        n_proposals = tf.shape(caption_proposed)[0]

        rnn_cell_caption = tf.contrib.rnn.LSTMCell(
            num_units=self.options['rnn_size'],
            state_is_tuple=True,
            initializer=tf.orthogonal_initializer())

        rnn_cell_caption = tf.contrib.rnn.DropoutWrapper(
            rnn_cell_caption,
            input_keep_prob=1.0 - rnn_drop,
            output_keep_prob=1.0 - rnn_drop)

        def get_rnn_cell():
            return tf.contrib.rnn.LSTMCell(
                num_units=self.options['rnn_size'],
                state_is_tuple=True,
                initializer=tf.orthogonal_initializer())

        # multi-layer LSTM
        multi_rnn_cell_caption = tf.contrib.rnn.MultiRNNCell(
            [get_rnn_cell() for _ in range(self.options['num_rnn_layers'])],
            state_is_tuple=True)

        caption_loss = 0
        with tf.variable_scope('caption_module') as caption_scope:

            batch_size = n_proposals

            # initialize memory cell and hidden output, note that the returned state is a tuple containing all states for each cell in MultiRNNCell
            state = multi_rnn_cell_caption.zero_state(batch_size=batch_size,
                                                      dtype=tf.float32)

            proposal_feats_reshape = tf.reshape(
                proposal_feats, [-1, self.options['video_feat_dim']],
                name='proposal_feats_reshape')

            event_hidden_feats = tf.concat([event_feats_fw, event_feats_bw],
                                           axis=-1)

            event_hidden_feats_tile = tf.tile(
                event_hidden_feats, [1, self.options['max_proposal_len']])
            event_hidden_feats_reshape = tf.reshape(
                event_hidden_feats_tile, [-1, 2 * self.options['rnn_size']])
            ''' 
            The caption data should be prepared in equal length, namely, with length of 'caption_seq_len'
            ## use caption mask data to mask out loss from sequence after end of token (<END>)
            Only the first loop create variable, the other loops reuse them
            '''
            for i in range(self.options['caption_seq_len'] - 1):

                if i > 0:
                    caption_scope.reuse_variables()

                # word embedding
                word_embed = self.build_caption_embedding(caption_proposed[:,
                                                                           i])

                # calculate attention over proposal feature elements
                # state[:, 1] return all hidden states for all cells in MultiRNNCell
                h_state = tf.concat([s[1] for s in state], axis=-1)
                h_state_tile = tf.tile(h_state,
                                       [1, self.options['max_proposal_len']])
                h_state_reshape = tf.reshape(h_state_tile, [
                    -1,
                    self.options['num_rnn_layers'] * self.options['rnn_size']
                ])

                feat_state_concat = tf.concat([
                    proposal_feats_reshape, h_state_reshape,
                    event_hidden_feats_reshape
                ],
                                              axis=-1,
                                              name='feat_state_concat')
                #feat_state_concat = tf.concat([tf.reshape(tf.tile(word_embed, [1, self.options['max_proposal_len']]), [-1, self.options['word_embed_size']]), proposal_feats_reshape, h_state_reshape, event_hidden_feats_reshape], axis=-1, name='feat_state_concat')

                # use a two-layer network to model attention over video feature sequence when predicting next word (dynamic)
                with tf.variable_scope('attention') as attention_scope:
                    attention_layer1 = tf.contrib.layers.fully_connected(
                        inputs=feat_state_concat,
                        num_outputs=self.options['attention_hidden_size'],
                        activation_fn=tf.nn.tanh,
                        weights_initializer=tf.contrib.layers.
                        xavier_initializer())
                    attention_layer2 = tf.contrib.layers.fully_connected(
                        inputs=attention_layer1,
                        num_outputs=1,
                        activation_fn=None,
                        weights_initializer=tf.contrib.layers.
                        xavier_initializer())

                # reshape to match
                attention_reshape = tf.reshape(
                    attention_layer2, [-1, self.options['max_proposal_len']],
                    name='attention_reshape')
                attention_score = tf.nn.softmax(attention_reshape,
                                                dim=-1,
                                                name='attention_score')
                attention = tf.reshape(
                    attention_score, [-1, 1, self.options['max_proposal_len']],
                    name='attention')

                # attended video feature
                attended_proposal_feat = tf.matmul(
                    attention, proposal_feats, name='attended_proposal_feat')
                attended_proposal_feat_reshape = tf.reshape(
                    attended_proposal_feat,
                    [-1, self.options['video_feat_dim']],
                    name='attended_proposal_feat_reshape')

                if self.options['no_context']:
                    proposal_feats_full = attended_proposal_feat_reshape
                else:
                    if self.options['context_gating']:
                        # model a gate to weight each element of context and feature
                        attended_proposal_feat_reshape = tf.nn.tanh(
                            attended_proposal_feat_reshape)
                        with tf.variable_scope('context_gating'):
                            '''
                            context_feats_transform = tf.contrib.layers.fully_connected(
                                inputs=event_hidden_feats,
                                num_outputs=self.options['video_feat_dim'],
                                activation_fn=None,
                                weights_initializer=tf.contrib.layers.xavier_initializer()
                            )
                            '''

                            context_feats_transform = event_hidden_feats

                            proposal_feats_transform = tf.contrib.layers.fully_connected(
                                inputs=attended_proposal_feat_reshape,
                                num_outputs=2 * self.options['rnn_size'],
                                activation_fn=tf.nn.tanh,
                                weights_initializer=tf.contrib.layers.
                                xavier_initializer())

                            # context gating
                            gate = tf.contrib.layers.fully_connected(
                                inputs=tf.concat([
                                    word_embed, h_state,
                                    context_feats_transform,
                                    proposal_feats_transform
                                ],
                                                 axis=-1),
                                num_outputs=2 * self.options['rnn_size'],
                                activation_fn=tf.nn.sigmoid,
                                weights_initializer=tf.contrib.layers.
                                xavier_initializer())
                            gated_context_feats = tf.multiply(
                                context_feats_transform, gate)
                            gated_proposal_feats = tf.multiply(
                                proposal_feats_transform, 1. - gate)
                            proposal_feats_full = tf.concat(
                                [gated_context_feats, gated_proposal_feats],
                                axis=-1)

                    else:
                        proposal_feats_full = tf.concat([
                            event_hidden_feats, attended_proposal_feat_reshape
                        ],
                                                        axis=-1)

                # proposal feature embedded into word space
                proposal_feat_embed = self.build_video_feat_embedding(
                    proposal_feats_full)

                # get next state
                caption_output, state = multi_rnn_cell_caption(
                    tf.concat([proposal_feat_embed, word_embed], axis=-1),
                    state)

                # predict next word
                with tf.variable_scope('logits') as logits_scope:
                    logits = tf.contrib.layers.fully_connected(
                        inputs=caption_output,
                        num_outputs=self.options['vocab_size'],
                        activation_fn=None)

                labels = caption_proposed[:, i + 1]  # predict next word

                # loss term
                loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=logits, labels=labels)
                output_mask = tf.to_float(caption_mask_proposed[:, i])
                loss = tf.reduce_sum(tf.multiply(loss, output_mask))

                caption_loss = caption_loss + loss

        # mean loss for each word
        caption_loss = caption_loss / (tf.to_float(batch_size) * tf.to_float(
            tf.reduce_sum(caption_mask_proposed)) + 1)

        tf.summary.scalar('caption_loss', caption_loss)
        reg_loss = tf.add_n([
            tf.nn.l2_loss(v) for v in tf.trainable_variables()
            if not v.name.startswith('caption_module/word_embed')
        ])
        total_loss = self.options[
            'weight_proposal'] * proposal_loss + self.options[
                'weight_caption'] * caption_loss
        tf.summary.scalar('total_loss', total_loss)

        outputs['caption_loss'] = caption_loss
        outputs['loss'] = total_loss
        outputs['reg_loss'] = reg_loss
        outputs['n_proposals'] = n_proposals

        return inputs, outputs
示例#22
0
 def conv(inputs, nf, ks, strides, gain=1.0):
     """Apply a ReLU-activated 2-D convolution with an orthogonal kernel init.

     Args:
         inputs: tensor passed to ``tf.layers.conv2d`` as ``inputs``.
         nf: value passed as ``filters``.
         ks: value passed as ``kernel_size``.
         strides: single stride, used for both entries of the stride pair.
         gain: gain forwarded to ``tf.orthogonal_initializer``.

     Returns:
         The result of ``tf.layers.conv2d``.
     """
     kernel_init = tf.orthogonal_initializer(gain=gain)
     # ``layer_count`` is not a parameter; it is resolved from an outer scope.
     layer_name = "enc_net_layer%s" % (layer_count)
     return tf.layers.conv2d(
         inputs=inputs,
         filters=nf,
         kernel_size=ks,
         strides=(strides, strides),
         activation=tf.nn.relu,
         kernel_initializer=kernel_init,
         name=layer_name,
         reuse=tf.AUTO_REUSE)
示例#23
0
def weight_variable(shape,
                    name,
                    init_method=None,
                    dtype=tf.float32,
                    init_para=None,
                    seed=1234,
                    trainable=True):
    """Create a TensorFlow variable using the requested initialization.

    Args:
        shape: list of int, shape of the weights.
        name: name passed to ``tf.get_variable``.
        init_method: string selecting the initialization method. One of
            ``None``/'zero', 'normc', 'normal', 'truncated_normal',
            'uniform', 'constant', 'xavier' or 'orthogonal'.
        dtype: dtype handed to the initializer.
        init_para: dict of method-specific parameters:
            'normc' reads ``stddev``; 'normal' and 'truncated_normal' read
            ``mean`` and ``stddev``; 'uniform' reads ``minval`` and
            ``maxval``; 'constant' reads ``val``; 'xavier' reads ``uniform``.
            It is not read for 'zero' or 'orthogonal'.
        seed: random seed forwarded to the random initializers.
        trainable: forwarded to ``tf.get_variable``.

    Returns:
        The variable returned by ``tf.get_variable``.

    Raises:
        ValueError: if ``init_method`` is not one of the supported names.
    """
    # 'normc' uses a project helper whose result is handed to
    # tf.get_variable directly, so it is handled before the generic path.
    if init_method == "normc":
        var = normc_initializer(shape,
                                stddev=init_para['stddev'],
                                seed=seed,
                                dtype=dtype)
        return tf.get_variable(initializer=var, name=name, trainable=trainable)

    # A single if/elif chain: previously the None/'zero' case was a separate
    # `if`, so it always fell through to the final `else` and raised.
    if init_method is None or init_method == 'zero':
        # The initializer object takes only a dtype; the shape is supplied
        # when the initializer is called below.
        initializer = tf.zeros_initializer(dtype=dtype)

    elif init_method == "normal":
        initializer = tf.random_normal_initializer(mean=init_para["mean"],
                                                   stddev=init_para["stddev"],
                                                   seed=seed,
                                                   dtype=dtype)

    elif init_method == "truncated_normal":
        initializer = tf.truncated_normal_initializer(
            mean=init_para["mean"],
            stddev=init_para["stddev"],
            seed=seed,
            dtype=dtype)

    elif init_method == "uniform":
        initializer = tf.random_uniform_initializer(minval=init_para["minval"],
                                                    maxval=init_para["maxval"],
                                                    seed=seed,
                                                    dtype=dtype)

    elif init_method == "constant":
        initializer = tf.constant_initializer(value=init_para["val"],
                                              dtype=dtype)

    elif init_method == "xavier":
        initializer = tf.contrib.layers.xavier_initializer(
            uniform=init_para['uniform'], seed=seed, dtype=dtype)

    elif init_method == 'orthogonal':
        initializer = tf.orthogonal_initializer(gain=1.0,
                                                seed=seed,
                                                dtype=dtype)

    else:
        raise ValueError("Unsupported initialization method!")

    # Calling the initializer with the shape yields the initial-value tensor.
    var = tf.get_variable(initializer=initializer(shape),
                          name=name,
                          trainable=trainable)

    return var
示例#24
0
 def lstm_cell():
     """Return a new LSTMCell built from the outer-scope ``n_hidden``.

     The cell is given an orthogonal initializer.
     """
     orthogonal_init = tf.orthogonal_initializer()
     return LSTMCell(n_hidden, initializer=orthogonal_init)
示例#25
0
 def lstm_cell(self):
     """Return a new LSTMCell built from ``self.cell_size``.

     The cell is given an orthogonal initializer.
     """
     orthogonal_init = tf.orthogonal_initializer()
     cell = tf.nn.rnn_cell.LSTMCell(self.cell_size,
                                    initializer=orthogonal_init)
     return cell
示例#26
0
def train_rnn(monkey,
              beta0=0.0,
              beta1=0.0,
              beta2=0.0,
              stddev_state=0.0,
              stddev_out=0.0,
              activation='tanh',
              rnn_init='default',
              num_neurons=100,
              learning_rate=0.0001,
              num_iters=2000,
              save_model_path='./saves/',
              tb_path='./tensorboard/',
              load_prev=False,
              load_model_path=None):
  """Build, train and simulate a noisy RNN fit to a monkey's EMG data.

  Args:
    monkey: 'D' or 'C'; selects which .mat file is loaded.
    beta0: weight of the squared-activation penalty on the RNN output.
    beta1: regularization hyperparameter for l2_loss(A)
    beta2: regularization hyperparameter for l2_loss(C)
    stddev_state: stddev of injected noise in state variable
    stddev_out: scale of the Gaussian noise added to the targets at every
      iteration (it is multiplied by the variance of the target data).
    activation: nonlinearity for the RNN: 'tanh', 'linear' or 'softplus'.
      Any other value is handed to the cell unchanged (e.g. a callable such
      as lambda x: x).
    rnn_init: initializer for the RNN variable scope: 'orth', 'xavier',
      'normal', or 'default'/None for no custom initializer.
    num_neurons: state dimension
    learning_rate: learning rate for Adam
    num_iters: training iterations
    save_model_path: where to save the TF model using tf.train.Saver()
    tb_path: tensorboard path
    load_prev: whether or not to load the previous TF variables
    load_model_path: If load_prev=True, where to load the previous model

  Returns:
    (y_tf, x_tf): Y_hat and the RNN output evaluated on the feed of the last
    training iteration.

  Raises:
    ValueError: if monkey or rnn_init is not one of the supported values.
  """

  # Fail fast on unsupported options instead of hitting a NameError later.
  data_files = {'D': 'drakeFeb.mat', 'C': 'cousFeb.mat'}
  if monkey not in data_files:
    raise ValueError("monkey must be 'D' or 'C', got %r" % (monkey,))
  # None behaves like 'default' (no custom initializer), so it stays valid.
  if rnn_init not in ('orth', 'xavier', 'normal', 'default', None):
    raise ValueError("Unsupported rnn_init: %r" % (rnn_init,))

  # TODO: just load *_preprocessed.mat data.
  # TODO: fix, '../' or './' depending on whether running from wrapper or not
  try:
    data = sio.loadmat('./' + data_files[monkey])
  except IOError:
    # Only a failed file open triggers the fallback to the parent directory.
    data = sio.loadmat('../' + data_files[monkey])

  # Set activation
  if activation == 'tanh':
    activation = tf.tanh
  elif activation == 'linear':
    activation = tf.identity
  elif activation == 'softplus':
    activation = tf.nn.softplus

  # Preprocess data
  # NOTE(review): the struct is read under key 'D' for both monkeys --
  # confirm that cousFeb.mat also stores its data under 'D'.
  emg = preprocess_array(data['D'][0, 0]['EMG'])
  time_axis, time_inds1, time_inds2 = get_time_axis(data['D'][0, 0]['KIN'])
  y_data1 = emg[time_axis]
  p = y_data1.shape[-1]

  # Build inputs
  m = 2
  u_data1 = create_input_array(y_data1.shape)

  # Augmented data
  # For regularizing the network -- it must fit actual and augmented data
  period = int(np.round(np.diff(time_inds2).mean()))
  y_cat1 = augmented_data(emg, time_inds1, period=period, tiles=10)
  y_cat1 = y_cat1[::25]
  y_cat2 = augmented_data(emg, time_inds2, period=period, tiles=10)
  y_cat2 = y_cat2[::25]

  u_cat1 = create_input_array(y_cat1.shape)
  u_cat2 = create_input_array(y_cat2.shape)

  sequence_length = [y_data1.shape[0], y_cat1.shape[0], y_cat2.shape[0]]

  # Three groups of four batch columns each, zero-padded to the longest
  # sequence: real data, then the two augmented sets.
  y_data = np.zeros((np.max(sequence_length), 4*3, p))
  u_data = np.zeros((np.max(sequence_length), 4*3, m))

  y_data[:sequence_length[0], 0:4, :] = y_data1
  y_data[:sequence_length[1], 4:8, :] = y_cat1
  y_data[:sequence_length[2], 8:12, :] = y_cat2

  u_data[:sequence_length[0], 0:4, :] = u_data1
  u_data[:sequence_length[1], 4:8, :] = u_cat1
  u_data[:sequence_length[2], 8:12, :] = u_cat2

  total_data_points = np.sum([v*4 for v in sequence_length])

  # Tensorflow graph
  tf.reset_default_graph()
  #tf.set_random_seed(1234)

  n = num_neurons
  batch_size = y_data.shape[1]

  x0 = tf.Variable(tf.random_normal([batch_size, n], stddev=0.1), name='x0')

  C = tf.get_variable('C', shape=[n, p], initializer=tf.contrib.layers.xavier_initializer())
  #C = tf.Variable(tf.random_normal([n, p], stddev=1/np.sqrt(n)), name='C')
  d = tf.get_variable('d', shape=[1, p], initializer=tf.constant_initializer(0))
  #d = tf.Variable(tf.constant(0.01, shape=[1, p]), name='d')

  U = tf.placeholder(tf.float32, [u_data.shape[0], batch_size, m], name='U')
  Y = tf.placeholder(tf.float32, [y_data.shape[0], batch_size, p], name='Y')

  noise_state = tf.placeholder(tf.float32, name='stddev_state')

  time_steps = tf.shape(U)[0]

  # set initializer for rnn matrix
  if rnn_init == 'orth':
    rnn_initializer = tf.orthogonal_initializer(0.95)
  elif rnn_init == 'xavier':
    rnn_initializer = tf.contrib.layers.xavier_initializer()
  elif rnn_init == 'normal':
    # stddev must be passed by keyword: the first positional argument of
    # tf.random_normal_initializer is the mean, not the stddev.
    rnn_initializer = tf.random_normal_initializer(stddev=1/np.sqrt(n))
  else:
    # 'default' / None: let dynamic_rnn create its own scope.
    rnn_initializer = None

  # get a tf var scope to set the rnn initializer.
  if rnn_initializer is not None:
    with tf.variable_scope('RNN', initializer=rnn_initializer) as scope:
      pass
  else:
    scope = None

  #cell = tf.nn.rnn_cell.BasicRNNCell(n, activation=activation)
  cell = BasicRNNCellNoise(n, activation=activation, stddev=noise_state)
  # One length per batch column, in the same 4/4/4 grouping as the data.
  batch_sequence_lengths = (4*[sequence_length[0]] +
                            4*[sequence_length[1]] +
                            4*[sequence_length[2]])
  output, state = tf.nn.dynamic_rnn(cell, U,
                                    sequence_length=batch_sequence_lengths,
                                    initial_state=x0, dtype=tf.float32,
                                    time_major=True, scope=scope)

  Y_hat = tf.reshape(output, (time_steps*batch_size, n))
  Y_hat = tf.matmul(Y_hat, C) + d
  Y_hat = tf.reshape(Y_hat, (time_steps, batch_size, p), name='Y_hat')

  # Get RNN variables
  with tf.variable_scope('RNN/BasicRNNCellNoise/Linear', reuse=True):
    Mat = tf.get_variable('Matrix') #note: calling an initializer here will not give it a new one.
    A = tf.gather(tf.get_variable('Matrix'), range(m, m+n))
    B = tf.gather(tf.get_variable('Matrix'), range(0, m))
    b = tf.get_variable('Bias')

  # Training ops
  # take L2 loss only over data points. note that dynamic_rnn zeros out output, but not y_hat because we have the bias vector d
  cost_term0 = tf.reduce_sum((output[:sequence_length[0], :4, :])**2)
  cost_term0 += tf.reduce_sum((output[:sequence_length[1], 4:8, :])**2)
  cost_term0 += tf.reduce_sum((output[:sequence_length[2], 8:, :])**2)
  cost_term0 = beta0*0.5*cost_term0/total_data_points

  cost_term1 = tf.reduce_sum((Y_hat[:sequence_length[0], :4, :] - Y[:sequence_length[0], :4, :])**2)
  cost_term1 += tf.reduce_sum((Y_hat[:sequence_length[1], 4:8, :] - Y[:sequence_length[1], 4:8, :])**2)
  cost_term1 += tf.reduce_sum((Y_hat[:sequence_length[2], 8:, :] - Y[:sequence_length[2], 8:, :])**2)
  cost_term1 = 0.5*cost_term1/total_data_points

  cost_term2 = beta1*tf.nn.l2_loss(A)
  cost_term3 = beta2*tf.nn.l2_loss(C)
  cost = cost_term0 + cost_term1 + cost_term2 + cost_term3

  train_op = tf.train.AdamOptimizer(learning_rate=learning_rate)
  gvs = train_op.compute_gradients(cost)
  # The summary ops below are created for their side effect of being
  # registered with merge_all().
  sg = [tf.summary.scalar('norm_grad'+var.name[:-2], 2*tf.nn.l2_loss(grad)) for grad, var in gvs] # var.name = 'namescope/V:0' and we want just 'V'
  clipped_gvs = [(tf.clip_by_norm(grad, 100000.), var) for grad, var in gvs]
  sg_clip = [tf.summary.scalar('norm_grad_clipped'+var.name[:-2], 2*tf.nn.l2_loss(grad)) for grad, var in clipped_gvs] # var.name = 'namescope/V:0' and we want just 'V'

  opt_op = train_op.apply_gradients(clipped_gvs)

  # Summary ops
  tf.summary.scalar('log_loss', tf.log(cost))
  tf.summary.scalar('log_cost0', tf.log(cost_term0))
  tf.summary.scalar('log_cost1', tf.log(cost_term1))
  tf.summary.scalar('log_cost2', tf.log(cost_term2))
  tf.summary.scalar('log_cost3', tf.log(cost_term3))

  merged_summary_op = tf.summary.merge_all()

  # Saver ops
  saver = tf.train.Saver()

  # Train
  with tf.Session() as sess:
    summary_writer = tf.summary.FileWriter(tb_path)
    sess.run(tf.global_variables_initializer())
    # TODO: fix restore. new tf version saves files differently?
    # Guard against load_model_path=None, which os.path.exists cannot take.
    if load_prev and load_model_path is not None and os.path.exists(load_model_path):
      saver.restore(sess, load_model_path)

    for i in range(num_iters):
      feed_dict = {Y: y_data + np.random.randn(*y_data.shape)*y_data.var()*stddev_out,
                   U: u_data,
                   noise_state: stddev_state}
      _, loss_val, summary_str = sess.run([opt_op, cost, merged_summary_op], feed_dict=feed_dict)

      if i % 200 == 0:
        summary_writer.add_summary(summary_str, i)

      if i % 1000 == 0:
        # Single pre-formatted string so the output is the same whether
        # print is a statement or a function.
        print('  iter: %04d   Loss: %.6f' % (i, loss_val))

    print('  iter: %04d   Loss: %.6f' % (num_iters, loss_val))
    saver.save(sess, save_model_path)

    print('  Finished')

    # Simulate
    y_tf, x_tf = sess.run([Y_hat, output], feed_dict=feed_dict)

    summary_writer.close()

  return y_tf, x_tf
示例#27
0
    def _attention_step(self, doc):
        """Embed one document with word- and sentence-level attention.

        The graph applies, in order: word self-attention, word "target"
        attention with a learned query, sentence self-attention, and
        sentence "target" attention with a second learned query.

        Args:
            doc: integer tensor of word ids; assumed to be 2-D
                (lines x words) with 0 used as padding -- TODO confirm
                against the caller.

        Returns:
            doc_embed: the final attention output after dropout and after
            squeezing axis 0 twice.
        """

        # Non-zero entries per line, number of non-empty lines, longest line.
        words_per_line = tf.math.count_nonzero(doc, 1)
        num_lines = tf.math.count_nonzero(words_per_line)
        max_words_ = tf.reduce_max(words_per_line)
        # Crop to the first num_lines rows and max_words_ columns.
        # NOTE(review): this presumes non-empty lines come first -- confirm.
        doc_input_reduced = doc[:num_lines, :max_words_]
        num_words = words_per_line[:num_lines]

        # word embeddings: look ids up in a variable initialized from
        # self.embedding_matrix
        word_embeds = tf.gather(
            tf.get_variable('embeddings',
                            initializer=self.embedding_matrix,
                            dtype=tf.float32), doc_input_reduced)
        # self.dropout is the second positional argument of tf.nn.dropout
        # (keep_prob in the TF 1.x signature).
        word_embeds = tf.nn.dropout(word_embeds, self.dropout)

        # masking: mask_base is 1.0 at real word positions and 0.0 at padding;
        # mask is tiled over the feature axis, mask2 over the heads and keys.
        mask_base = tf.cast(tf.sequence_mask(num_words, max_words_),
                            tf.float32)
        mask = tf.tile(tf.expand_dims(mask_base, 2),
                       [1, 1, self.attention_size])
        mask2 = tf.tile(tf.expand_dims(mask_base, 2),
                        [self.attention_heads, 1, max_words_])

        # word self attention: Q, K, V are kernel-size-1 convolutions of the
        # word embeddings with self.attention_size filters each
        Q = tf.layers.conv1d(
            word_embeds,
            self.attention_size,
            1,
            padding='same',
            activation=self.activation,
            kernel_initializer=tf.contrib.layers.xavier_initializer())
        K = tf.layers.conv1d(
            word_embeds,
            self.attention_size,
            1,
            padding='same',
            activation=self.activation,
            kernel_initializer=tf.contrib.layers.xavier_initializer())
        V = tf.layers.conv1d(
            word_embeds,
            self.attention_size,
            1,
            padding='same',
            activation=self.activation,
            kernel_initializer=tf.contrib.layers.xavier_initializer())

        # Zero the projections at padded positions.
        Q = tf.where(tf.equal(mask, 0), tf.zeros_like(Q), Q)
        K = tf.where(tf.equal(mask, 0), tf.zeros_like(K), K)
        V = tf.where(tf.equal(mask, 0), tf.zeros_like(V), V)

        # Split the feature axis into heads and stack the heads on axis 0.
        Q_ = tf.concat(tf.split(Q, self.attention_heads, axis=2), axis=0)
        K_ = tf.concat(tf.split(K, self.attention_heads, axis=2), axis=0)
        V_ = tf.concat(tf.split(V, self.attention_heads, axis=2), axis=0)

        # Dot-product scores scaled by sqrt of K_'s static last dimension.
        outputs = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1]))
        outputs = outputs / (K_.get_shape().as_list()[-1]**0.5)
        # Scores that are exactly 0 are replaced by -1000 before the softmax.
        outputs = tf.where(tf.equal(outputs, 0),
                           tf.ones_like(outputs) * -1000, outputs)
        outputs = tf.nn.dropout(tf.nn.softmax(outputs), self.dropout)
        # Zero the attention weights wherever mask2 is 0.
        word_self = tf.where(tf.equal(mask2, 0), tf.zeros_like(outputs),
                             outputs)
        outputs = tf.matmul(word_self, V_)
        # Undo the head stacking: split on axis 0, concat on the feature axis.
        outputs = tf.concat(tf.split(outputs, self.attention_heads, axis=0),
                            axis=2)
        outputs = tf.where(tf.equal(mask, 0), tf.zeros_like(outputs), outputs)

        # word target attention: a learned query 'word_Q', tiled once per
        # line, attends over the word self-attention outputs
        Q = tf.get_variable('word_Q', (1, 1, self.attention_size), tf.float32,
                            tf.orthogonal_initializer())
        Q = tf.tile(Q, [num_lines, 1, 1])

        Q_ = tf.concat(tf.split(Q, self.attention_heads, axis=2), axis=0)
        K_ = tf.concat(tf.split(outputs, self.attention_heads, axis=2), axis=0)
        V_ = tf.concat(tf.split(outputs, self.attention_heads, axis=2), axis=0)

        outputs = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1]))
        outputs = outputs / (K_.get_shape().as_list()[-1]**0.5)
        # Same exact-zero -> -1000 replacement as in word self attention.
        outputs = tf.where(tf.equal(outputs, 0),
                           tf.ones_like(outputs) * -1000, outputs)
        word_target = tf.nn.dropout(tf.nn.softmax(outputs), self.dropout)
        outputs = tf.matmul(word_target, V_)
        outputs = tf.concat(tf.split(outputs, self.attention_heads, axis=0),
                            axis=2)
        # Swap the first two axes so the per-line results form one sequence.
        sent_embeds = tf.transpose(outputs, [1, 0, 2])
        sent_embeds = tf.nn.dropout(sent_embeds, self.dropout)

        # sent self attention: same projection scheme as for words, applied
        # to the sentence embeddings (no masking in this stage)
        Q = tf.layers.conv1d(
            sent_embeds,
            self.attention_size,
            1,
            padding='same',
            activation=self.activation,
            kernel_initializer=tf.contrib.layers.xavier_initializer())
        K = tf.layers.conv1d(
            sent_embeds,
            self.attention_size,
            1,
            padding='same',
            activation=self.activation,
            kernel_initializer=tf.contrib.layers.xavier_initializer())
        V = tf.layers.conv1d(
            sent_embeds,
            self.attention_size,
            1,
            padding='same',
            activation=self.activation,
            kernel_initializer=tf.contrib.layers.xavier_initializer())

        Q_ = tf.concat(tf.split(Q, self.attention_heads, axis=2), axis=0)
        K_ = tf.concat(tf.split(K, self.attention_heads, axis=2), axis=0)
        V_ = tf.concat(tf.split(V, self.attention_heads, axis=2), axis=0)

        outputs = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1]))
        outputs = outputs / (K_.get_shape().as_list()[-1]**0.5)
        sent_self = tf.nn.dropout(tf.nn.softmax(outputs), self.dropout)
        outputs = tf.matmul(sent_self, V_)
        outputs = tf.concat(tf.split(outputs, self.attention_heads, axis=0),
                            axis=2)

        # sent target attention: a learned query 'sent_Q' attends over the
        # sentence self-attention outputs
        Q = tf.get_variable('sent_Q', (1, 1, self.attention_size), tf.float32,
                            tf.orthogonal_initializer())

        Q_ = tf.concat(tf.split(Q, self.attention_heads, axis=2), axis=0)
        K_ = tf.concat(tf.split(outputs, self.attention_heads, axis=2), axis=0)
        V_ = tf.concat(tf.split(outputs, self.attention_heads, axis=2), axis=0)

        outputs = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1]))
        outputs = outputs / (K_.get_shape().as_list()[-1]**0.5)
        sent_target = tf.nn.dropout(tf.nn.softmax(outputs), self.dropout)
        outputs = tf.matmul(sent_target, V_)
        outputs = tf.concat(tf.split(outputs, self.attention_heads, axis=0),
                            axis=2)
        # Drop the two leading axes (squeeze axis 0 twice).
        doc_embed = tf.nn.dropout(tf.squeeze(outputs, [0]), self.dropout)
        doc_embed = tf.squeeze(doc_embed, [0])

        return doc_embed
示例#28
0
    def bulid_train(self, network, value_network=None):
        """Create the training placeholders, losses and optimizer ops.

        Besides the attributes assigned here, the method reads
        ``self.value_in``, ``self.action``, ``self.actions``,
        ``self.epsilon``, ``self.beta``, ``self.beta2``,
        ``self.target_network``, ``self.predictor_network``, ``self.name``,
        ``self.max_grad_norm`` and ``self.rnd_lr``; they must be set before
        this is called.

        Args:
            network: tensor fed to the "Value" dense layer when
                ``value_network`` is None.
            value_network: optional tensor fed to the "Value" dense layer
                instead of ``network``.

        Sets:
            Placeholders ``advantage``, ``old_value``, ``returns``,
            ``returns_in``, ``prevneglogp`` and ``lr``; tensors ``value``,
            ``value_in``, ``actor_loss``, ``clipfrac``, ``entropy``,
            ``vclipfrac``, ``critic_loss``, ``critic_in_loss``,
            ``rnd_loss`` and ``loss``; and the ops ``train`` and
            ``train_rnd``.
        """
        self.advantage = tf.placeholder(tf.float32, [None], name="Advantage")
        self.old_value = tf.placeholder(tf.float32, [None], name="Old_value")
        self.returns = tf.placeholder(tf.float32, [None], name="Returns")
        self.returns_in = tf.placeholder(tf.float32, [None],
                                         name="Returns_intrinsic")
        self.prevneglogp = tf.placeholder(tf.float32, [None], name="Old_pi_a")
        self.lr = tf.placeholder(tf.float32, [], name="Learning_rate")

        # Identity check: `== None` would invoke the argument's own __eq__,
        # which is not a reliable None test for tensor-like objects.
        value_input = network if value_network is None else value_network
        self.value = tf.layers.dense(
            value_input,
            1,
            kernel_initializer=tf.orthogonal_initializer(),
            name="Value")
        # Take column 0 of the value head output.
        self.value = self.value[:, 0]
        # self.value_in is not created here; it has to exist already.
        self.value_in = self.value_in[:, 0]

        with tf.variable_scope('Actor_loss'):
            pi_a = self.action.neglogp(self.actions)
            ratio = tf.exp(self.prevneglogp - pi_a)
            actor_loss = ratio * -self.advantage
            clipped_loss = tf.clip_by_value(ratio, 1 - self.epsilon,
                                            1 + self.epsilon) * -self.advantage
            # Element-wise maximum of the unclipped and clipped losses.
            self.actor_loss = tf.reduce_mean(
                tf.maximum(actor_loss, clipped_loss))
            # Fraction of samples whose ratio left the clipping range.
            self.clipfrac = tf.reduce_mean(
                tf.to_float(tf.greater(tf.abs(ratio - 1.0), self.epsilon)))

        with tf.variable_scope('Entropy'):
            self.entropy = tf.reduce_mean(self.action.entropy())
        with tf.variable_scope('Critic_loss'):
            critic_loss1 = tf.squared_difference(self.returns, self.value)
            # Same error, with the value change from old_value clipped to
            # +/- epsilon.
            critic_loss2 = tf.squared_difference(
                self.returns, self.old_value + tf.clip_by_value(
                    self.value - self.old_value, -self.epsilon, self.epsilon))
            self.vclipfrac = tf.reduce_mean(
                tf.to_float(
                    tf.greater(tf.abs(self.value - self.old_value),
                               self.epsilon)))
            self.critic_loss = tf.reduce_mean(
                tf.maximum(critic_loss1, critic_loss2)) * 0.5

            self.critic_in_loss = tf.reduce_mean(
                tf.squared_difference(self.returns_in, self.value_in)) * 0.5

        with tf.variable_scope('RND_loss'):
            # No gradient flows into target_network; only the predictor is
            # trained by this loss.
            self.rnd_loss = tf.reduce_mean(
                tf.square(
                    tf.stop_gradient(self.target_network) -
                    self.predictor_network))

        with tf.variable_scope('Total_loss'):
            self.loss = self.actor_loss - self.entropy * self.beta2 + (
                self.critic_loss + self.critic_in_loss) * self.beta

        params = tf.trainable_variables(self.name)
        with tf.variable_scope('train'):
            trainer = tf.train.AdamOptimizer(learning_rate=self.lr,
                                             epsilon=1e-5)
            grads_and_var = trainer.compute_gradients(self.loss, params)
            grads, var = zip(*grads_and_var)
            # Global-norm clipping is skipped when max_grad_norm is None.
            if self.max_grad_norm is not None:
                grads, _ = tf.clip_by_global_norm(grads, self.max_grad_norm)
            grads_and_var = list(zip(grads, var))
            self.train = trainer.apply_gradients(grads_and_var)

        with tf.variable_scope('train_rnd'):
            self.train_rnd = tf.train.AdamOptimizer(
                learning_rate=self.rnd_lr).minimize(self.rnd_loss)
    def __init__(self, v_lr, pi_lr, model_dir, delta=1e-3):
        """Build the value network, the policy and their training ops.

        Creates the input placeholders, a two-layer dense value head trained
        with Adam on the mean squared advantage, a policy obtained from
        ``build_gaussian_network``, a rescaled conjugate-gradient policy
        update, the merged summaries, and finally a session whose variables
        are initialized before ``load`` is called with ``model_dir``.

        Args:
            v_lr: learning rate given to the value function's Adam optimizer.
            pi_lr: learning rate given to the policy's Adam optimizer.
            model_dir: forwarded to ``load`` together with the session.
            delta: constant used in the ``sqrt(2 * delta / (grad . conj))``
                factor that rescales every policy update direction.
        """
        # Inputs: 10 state features, 1 action column and 1 reward column per row.
        self.state = tf.placeholder(tf.float32, [None, 10], name='state')
        self.action = tf.placeholder(tf.float32, [None, 1], name='action')
        self.reward = tf.placeholder(tf.float32, [None, 1], name='reward')

        # Advantage function definition
        print(' [*] Building advantage function...')
        # Both dense layers of the value head share the orthogonal kernel init.
        kwargs = {'kernel_initializer': tf.orthogonal_initializer()}
        with tf.variable_scope('value'):
            h1 = tf.layers.dense(self.state,
                                 128,
                                 activation=tf.nn.relu,
                                 name='h1',
                                 **kwargs)
            self.value = tf.layers.dense(h1,
                                         1,
                                         activation=None,
                                         name='value',
                                         **kwargs)
            # Advantage is the fed reward minus the predicted value.
            self.advantage = self.reward - self.value

            self.v_loss = tf.reduce_mean(tf.square(self.advantage))
        # Only variables created under the 'value' scope are updated by v_train.
        v_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                   scope='value')
        self.v_train = tf.train.AdamOptimizer(v_lr).minimize(self.v_loss,
                                                             var_list=v_vars)

        # Policy function definition
        print(' [*] Building policy function...')
        # build_gaussian_network is defined outside this block; from its use
        # below it returns (distribution-like object, list of variables).
        self.policy, pi_vars = build_gaussian_network(self.state,
                                                      1,
                                                      scope='policy')
        # NOTE(review): old_policy is built in the same 'policy' scope with
        # reuse=True, so it presumably shares variables with self.policy, and
        # the assign op below is commented out -- confirm this is intended.
        old_policy, old_vars = build_gaussian_network(self.state,
                                                      1,
                                                      scope='policy',
                                                      trainable=False,
                                                      reuse=True)
        with tf.name_scope('policy_ops'):
            # self.assign_op = [old.assign(new) for old, new in zip(old_vars, pi_vars)]
            self.sample_op = self.policy.sample(1)
        with tf.name_scope('surrogate_loss'):
            # Probability ratio of the fed action under new vs. old policy.
            ratio = self.policy.prob(self.action) / old_policy.prob(
                self.action)
            surrogate = ratio * self.advantage
            self.pi_loss = -tf.reduce_mean(surrogate)

        # Convert Adam gradient to natural gradient
        print(' [*] Building natural gradient...')
        with tf.variable_scope('policy_optim'):
            kl = tf.distributions.kl_divergence(old_policy, self.policy)
            optim = tf.train.AdamOptimizer(pi_lr)
            # Gradients are taken of `surrogate` itself, not of self.pi_loss
            # (which is its negated mean).
            pi_grads_and_vars = optim.compute_gradients(surrogate,
                                                        var_list=pi_vars)
            pi_grads = [pair[0] for pair in pi_grads_and_vars]
            kl_grads = tf.gradients(kl, pi_vars)

            conj_grads = []
            for grad, kl_grad, var in zip(pi_grads, kl_grads, pi_vars):
                # build_conjugate_gradient and EPSILON come from outside this
                # block; EPSILON is added to the denominator below.
                conj = build_conjugate_gradient(grad, kl_grad, var)
                nat_grad = tf.sqrt(
                    (2.0 * delta) /
                    (tf.reduce_sum(grad * conj) + EPSILON)) * conj
                conj_grads.append((nat_grad, var))
            # The rescaled directions are applied through the Adam optimizer.
            self.pi_train = optim.apply_gradients(conj_grads)

        # Summaries definition
        print(' [*] Building summaries...')
        # Reads the private `_scale` attribute of the policy object;
        # presumably the distribution's scale parameter -- TODO confirm.
        model_variance = tf.reduce_mean(self.policy._scale)
        self.sums = tf.summary.merge([
            tf.summary.scalar('max_rewards', tf.reduce_max(self.reward)),
            tf.summary.scalar('mean_advantage', tf.reduce_mean(
                self.advantage)),
            tf.summary.scalar('pi_loss', self.pi_loss),
            tf.summary.scalar('v_loss', self.v_loss),
            tf.summary.scalar('model_variance', model_variance)
        ],
                                     name='summaries')

        config = tf.ConfigProto()
        # config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)
        # Variables are initialized first; load() is then called on the session.
        self.sess.run(tf.global_variables_initializer())
        print(' [*] Model built finished')
        # load() returns a pair; only its second element is kept, as the counter.
        _, self.counter = load(self.sess, model_dir)
示例#30
0
 def lstm_cell(self, reuse=False):
     """Create an LSTM cell of width ``self.rnn_size`` with orthogonal init.

     ``reuse`` is forwarded unchanged to ``tf.nn.rnn_cell.LSTMCell``.
     """
     cell = tf.nn.rnn_cell.LSTMCell(
         self.rnn_size,
         initializer=tf.orthogonal_initializer(),
         reuse=reuse,
     )
     return cell
示例#31
0
def multi_hop_match(aware_repr, answer_repr, nb_hops, rnn_dim, attention_dim,
                    scope_name, ans_max_len, ans_lens, l2_reg):
    """Attend over the answer memory for ``nb_hops`` rounds with a GRU-style update.

    Shapes as noted by the original author:
      aware_repr:  [batch_size, feature_dim]
      answer_repr: [batch_size, seq_length, answer_dim]
    ``nb_hops`` and ``rnn_dim`` are ints; ``attention_dim`` is accepted but
    not read in this function.

    Returns:
      The state ``e`` produced by the last hop, reshaped to [-1, rnn_dim].
      The per-hop attention scores are collected locally but not returned.
    """
    with tf.variable_scope(scope_name):
        assert nb_hops > 0
        batch_size = tf.shape(answer_repr)[0]
        aware_dim = aware_repr.get_shape().as_list()[-1]
        answer_dim = answer_repr.get_shape().as_list()[-1]

        # init memory: append a constant column of ones to every answer step
        flat_ones = tf.ones([batch_size, ans_max_len])
        ones_column = tf.to_float(
            tf.reshape(flat_ones, [batch_size, ans_max_len, 1]))
        memories = tf.concat([answer_repr, ones_column], axis=-1)

        def make_weight(name, shape, initializer, dtype=None):
            # Each variable gets its own l2 regularizer built from l2_reg.
            return tf.get_variable(
                name=name,
                shape=shape,
                initializer=initializer,
                regularizer=tf.contrib.layers.l2_regularizer(l2_reg),
                dtype=dtype)

        def to_tensor_array(tensor, dtype):
            # Split `tensor` along its first axis into a dynamic TensorArray.
            array = tf.TensorArray(dtype,
                                   1,
                                   dynamic_size=True,
                                   infer_shape=False)
            return array.unstack(tensor)

        # Per-hop attention parameters.
        attn_weights = make_weight(
            'W_al', [nb_hops, 1, rnn_dim + answer_dim + aware_dim + 1],
            tf.contrib.layers.xavier_initializer(), tf.float32)
        attn_biases = make_weight('B_al', [nb_hops, 1, ans_max_len],
                                  tf.zeros_initializer(), tf.float32)
        # GRU-style input and recurrent matrices.
        w_reset = make_weight('W_r', [rnn_dim, answer_dim + 1],
                              tf.orthogonal_initializer(), tf.float32)
        w_update = make_weight('W_z', [rnn_dim, answer_dim + 1],
                               tf.orthogonal_initializer(), tf.float32)
        w_gated = make_weight('W_g', [rnn_dim, rnn_dim],
                              tf.orthogonal_initializer(), tf.float32)
        w_input = make_weight('W_x', [rnn_dim, answer_dim + 1],
                              tf.orthogonal_initializer(), tf.float32)
        u_reset = make_weight('U_r', [rnn_dim, rnn_dim],
                              tf.orthogonal_initializer())
        u_update = make_weight('U_z', [rnn_dim, rnn_dim],
                               tf.orthogonal_initializer())

        e = tf.zeros([batch_size, rnn_dim])
        scores_list = []
        tiled_aware = tf.tile(tf.expand_dims(aware_repr, 1),
                              [1, ans_max_len, 1])

        for hop in range(nb_hops):
            memory_rows = to_tensor_array(memories, tf.float32)
            state_rows = to_tensor_array(e, tf.float32)
            aware_rows = to_tensor_array(tiled_aware, tf.float32)
            length_rows = to_tensor_array(ans_lens, tf.int32)
            new_states = tf.TensorArray(size=batch_size, dtype=tf.float32)
            hop_scores = tf.TensorArray(size=batch_size, dtype=tf.float32)

            def body(idx, state_acc, score_acc):
                # One batch element: attention over its memory, then GRU step.
                mem = memory_rows.read(idx)
                prev = state_rows.read(idx)
                prev_tiled = tf.tile(tf.expand_dims(prev, 0),
                                     [ans_max_len, 1])
                aware = aware_rows.read(idx)
                hop_weight = attn_weights[hop]
                features = tf.transpose(
                    tf.concat([mem, prev_tiled, aware], 1), perm=[1, 0])
                logits = tf.matmul(hop_weight, features) + attn_biases[hop]
                length = math_ops.to_int32(length_rows.read(idx))
                # Softmax over the first `length` positions, zeros elsewhere.
                valid_part = tf.nn.softmax(
                    tf.slice(logits, [0, 0], [1, length]))
                padded_part = tf.zeros([1, ans_max_len - length])
                attention = tf.concat([valid_part, padded_part], 1)
                # score_temp = tf.nn.softmax(g)
                score_acc = score_acc.write(idx, attention)
                attended = tf.reshape(tf.matmul(attention, mem), [-1, 1])
                prev_col = tf.reshape(prev, [-1, 1])
                reset_gate = tf.nn.sigmoid(
                    tf.matmul(w_reset, attended) +
                    tf.matmul(u_reset, prev_col))
                update_gate = tf.nn.sigmoid(
                    tf.matmul(w_update, attended) +
                    tf.matmul(u_update, prev_col))
                candidate = tf.nn.tanh(
                    tf.matmul(w_input, attended) +
                    tf.matmul(w_gated, tf.multiply(reset_gate, prev_col)))
                blended = (tf.multiply(1 - update_gate, prev_col) +
                           tf.multiply(update_gate, candidate))
                state_acc = state_acc.write(idx, blended)
                return (idx + 1, state_acc, score_acc)

            def condition(idx, state_acc, score_acc):
                return idx < batch_size

            _, final_states, final_scores = tf.while_loop(
                cond=condition,
                body=body,
                loop_vars=(0, new_states, hop_scores))
            e = tf.reshape(final_states.stack(), [-1, rnn_dim])
            scores_list.append(
                tf.reshape(final_scores.stack(), [-1, ans_max_len]))
    return e
示例#32
0
 def add_lstm_cells(self):
     """Store a dropout-wrapped, orthogonally initialised LSTM cell on ``self.cell``.

     The cell has ``self.cell_size`` units and ``self.rnn_keep_prob`` is
     passed as the second positional argument of ``DropoutWrapper``.
     """
     base_cell = tf.nn.rnn_cell.LSTMCell(
         self.cell_size, initializer=tf.orthogonal_initializer())
     self.cell = tf.nn.rnn_cell.DropoutWrapper(base_cell, self.rnn_keep_prob)
示例#33
0
 def gru_cell():
     """Return a ``GRUCell`` of ``n_hidden`` units with an orthogonal kernel init."""
     orthogonal_init = tf.orthogonal_initializer()
     cell = GRUCell(n_hidden, kernel_initializer=orthogonal_init)
     return cell
示例#34
0
def initialize(sess=None):
  """Initialize data and model.

  Prepares the training and checkpoint directories, seeds the random number
  generators, loads (WMT) or generates (algorithmic tasks) the data for
  ``FLAGS.problem``, builds the NeuralGPU model(s) and then either restores
  the latest checkpoint or initializes fresh variables.

  Args:
    sess: an existing session to build the model for, or None. When None, a
      ``tf.train.Supervisor`` is created and used to prepare the session.

  Returns:
    A tuple ``(model, beam_model, min_length, max_length, checkpoint_dir,
    (global_train_set, dev_set, en_path, fr_path), sv, sess)``. ``beam_model``
    is None unless a separate beam model is built, and ``sv`` is None when a
    session was passed in.
  """
  global MAXLEN_F
  # Create training directory if it does not exist.
  if not tf.gfile.IsDirectory(FLAGS.train_dir):
    data.print_out("Creating training directory %s." % FLAGS.train_dir)
    tf.gfile.MkDir(FLAGS.train_dir)
  decode_suffix = "beam%dln%d" % (FLAGS.beam_size,
                                  int(100 * FLAGS.length_norm))
  if FLAGS.mode == 0:
    decode_suffix = ""
  if FLAGS.task >= 0:
    data.log_filename = os.path.join(FLAGS.train_dir,
                                     "log%d%s" % (FLAGS.task, decode_suffix))
  else:
    data.log_filename = os.path.join(FLAGS.train_dir, "neural_gpu/log")

  # Set random seed.
  if FLAGS.random_seed > 0:
    seed = FLAGS.random_seed + max(0, FLAGS.task)
    tf.set_random_seed(seed)
    random.seed(seed)
    np.random.seed(seed)

  # Check data sizes.
  assert data.bins
  max_length = min(FLAGS.max_length, data.bins[-1])
  while len(data.bins) > 1 and data.bins[-2] >= max_length + EXTRA_EVAL:
    data.bins = data.bins[:-1]
  if sess is None and FLAGS.task == 0 and FLAGS.num_replicas > 1:
    if max_length > 60:
      # Save memory on chief. Floor division keeps max_length an integer on
      # Python 3 as well; it is used below as a range bound.
      max_length = max_length // 2
  min_length = min(14, max_length - 3) if FLAGS.problem == "wmt" else 3
  for p in FLAGS.problem.split("-"):
    if p in ["progeval", "progsynth"]:
      min_length = max(26, min_length)
  assert max_length + 1 > min_length
  # max_length may have been halved above, so trim the bins once more.
  while len(data.bins) > 1 and data.bins[-2] >= max_length + EXTRA_EVAL:
    data.bins = data.bins[:-1]

  # Create checkpoint directory if it does not exist.
  if FLAGS.mode == 0 or FLAGS.task < 0:
    checkpoint_dir = os.path.join(FLAGS.train_dir, "neural_gpu%s"
                                  % ("" if FLAGS.task < 0 else str(FLAGS.task)))
  else:
    checkpoint_dir = FLAGS.train_dir
  if not tf.gfile.IsDirectory(checkpoint_dir):
    data.print_out("Creating checkpoint directory %s." % checkpoint_dir)
    tf.gfile.MkDir(checkpoint_dir)

  # Prepare data.
  if FLAGS.problem == "wmt":
    # Prepare WMT data.
    data.print_out("Preparing WMT data in %s" % FLAGS.data_dir)
    if FLAGS.simple_tokenizer:
      MAXLEN_F = 3.5
      (en_train, fr_train, en_dev, fr_dev,
       en_path, fr_path) = wmt.prepare_wmt_data(
           FLAGS.data_dir, FLAGS.vocab_size,
           tokenizer=wmt.space_tokenizer,
           normalize_digits=FLAGS.normalize_digits)
    else:
      (en_train, fr_train, en_dev, fr_dev,
       en_path, fr_path) = wmt.prepare_wmt_data(
           FLAGS.data_dir, FLAGS.vocab_size)

    # Read data into buckets and compute their sizes.
    fr_vocab, rev_fr_vocab = wmt.initialize_vocabulary(fr_path)
    data.vocab = fr_vocab
    data.rev_vocab = rev_fr_vocab
    data.print_out("Reading development and training data (limit: %d)."
                   % FLAGS.max_train_data_size)
    dev_set = {}
    dev_set["wmt"] = read_data(en_dev, fr_dev, data.bins)
    def data_read(size, print_out):
      read_data_into_global(en_train, fr_train, data.bins, size, print_out)
    # Read a small slice synchronously, then load larger slices in
    # background threads.
    data_read(50000, False)
    read_thread_small = threading.Thread(
        name="reading-data-small", target=lambda: data_read(900000, False))
    read_thread_small.start()
    read_thread_full = threading.Thread(
        name="reading-data-full",
        target=lambda: data_read(FLAGS.max_train_data_size, True))
    read_thread_full.start()
    data.print_out("Data reading set up.")
  else:
    # Prepare algorithmic data.
    en_path, fr_path = None, None
    tasks = FLAGS.problem.split("-")
    data_size = FLAGS.train_data_size
    for t in tasks:
      data.print_out("Generating data for %s." % t)
      if t in ["progeval", "progsynth"]:
        data.init_data(t, data.bins[-1], 20 * data_size, FLAGS.vocab_size)
        if len(program_utils.prog_vocab) > FLAGS.vocab_size - 2:
          raise ValueError("Increase vocab_size to %d for prog-tasks."
                           % (len(program_utils.prog_vocab) + 2))
        data.rev_vocab = program_utils.prog_vocab
        data.vocab = program_utils.prog_rev_vocab
      else:
        for l in xrange(max_length + EXTRA_EVAL - 1):
          data.init_data(t, l, data_size, FLAGS.vocab_size)
        data.init_data(t, data.bins[-2], data_size, FLAGS.vocab_size)
        data.init_data(t, data.bins[-1], data_size, FLAGS.vocab_size)
      if t not in global_train_set:
        global_train_set[t] = []
      global_train_set[t].append(data.train_set[t])
      calculate_buckets_scale(data.train_set[t], data.bins, t)
    dev_set = data.test_set

  # Grid-search parameters.
  lr = FLAGS.lr
  init_weight = FLAGS.init_weight
  max_grad_norm = FLAGS.max_grad_norm
  if sess is not None and FLAGS.task > -1:
    def job_id_factor(step):
      """If jobid / step mod 3 is 0, 1, 2: say 0, 1, -1."""
      # Integer division is required: with true division (Python 3) the
      # result would be fractional instead of one of {0, 1, -1}.
      return ((((FLAGS.task // step) % 3) + 1) % 3) - 1
    lr *= math.pow(2, job_id_factor(1))
    init_weight *= math.pow(1.5, job_id_factor(3))
    max_grad_norm *= math.pow(2, job_id_factor(9))

  # Print out parameters.
  curriculum = FLAGS.curriculum_seq
  msg1 = ("layers %d kw %d h %d kh %d batch %d noise %.2f"
          % (FLAGS.nconvs, FLAGS.kw, FLAGS.height, FLAGS.kh,
             FLAGS.batch_size, FLAGS.grad_noise_scale))
  msg2 = ("cut %.2f lr %.3f iw %.2f cr %.2f nm %d d%.4f gn %.2f %s"
          % (FLAGS.cutoff, lr, init_weight, curriculum, FLAGS.nmaps,
             FLAGS.dropout, max_grad_norm, msg1))
  data.print_out(msg2)

  # Create model and initialize it.
  tf.get_variable_scope().set_initializer(
      tf.orthogonal_initializer(gain=1.8 * init_weight))
  max_sampling_rate = FLAGS.max_sampling_rate if FLAGS.mode == 0 else 0.0
  o = FLAGS.vocab_size if FLAGS.max_target_vocab < 1 else FLAGS.max_target_vocab
  ngpu.CHOOSE_K = FLAGS.soft_mem_size
  do_beam_model = FLAGS.train_beam_freq > 0.0001 and FLAGS.beam_size > 1
  beam_size = FLAGS.beam_size if FLAGS.mode > 0 and not do_beam_model else 1
  beam_size = min(beam_size, FLAGS.beam_size)
  beam_model = None
  def make_ngpu(cur_beam_size, back):
    return ngpu.NeuralGPU(
        FLAGS.nmaps, FLAGS.vec_size, FLAGS.vocab_size, o,
        FLAGS.dropout, max_grad_norm, FLAGS.cutoff, FLAGS.nconvs,
        FLAGS.kw, FLAGS.kh, FLAGS.height, FLAGS.mem_size,
        lr / math.sqrt(FLAGS.num_replicas), min_length + 3, FLAGS.num_gpus,
        FLAGS.num_replicas, FLAGS.grad_noise_scale, max_sampling_rate,
        atrous=FLAGS.atrous, do_rnn=FLAGS.rnn_baseline,
        do_layer_norm=FLAGS.layer_norm, beam_size=cur_beam_size, backward=back)
  if sess is None:
    # No session given: place variables with the replica device setter.
    with tf.device(tf.train.replica_device_setter(FLAGS.ps_tasks)):
      model = make_ngpu(beam_size, True)
      if do_beam_model:
        # The beam model shares the variables of the main model.
        tf.get_variable_scope().reuse_variables()
        beam_model = make_ngpu(FLAGS.beam_size, False)
  else:
    model = make_ngpu(beam_size, True)
    if do_beam_model:
      tf.get_variable_scope().reuse_variables()
      beam_model = make_ngpu(FLAGS.beam_size, False)

  sv = None
  if sess is None:
    # The supervisor configuration has a few overridden options.
    sv = tf.train.Supervisor(logdir=checkpoint_dir,
                             is_chief=(FLAGS.task < 1),
                             saver=model.saver,
                             summary_op=None,
                             save_summaries_secs=60,
                             save_model_secs=15 * 60,
                             global_step=model.global_step)

    config = tf.ConfigProto(allow_soft_placement=True)
    sess = sv.PrepareSession(FLAGS.master, config=config)

  data.print_out("Created model. Checkpoint dir %s" % checkpoint_dir)

  # Load model from parameters if a checkpoint exists.
  ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
  if ckpt and tf.gfile.Exists(ckpt.model_checkpoint_path + ".index"):
    data.print_out("Reading model parameters from %s"
                   % ckpt.model_checkpoint_path)
    model.saver.restore(sess, ckpt.model_checkpoint_path)
  elif sv is None:
    sess.run(tf.global_variables_initializer())
    data.print_out("Initialized variables (no supervisor mode).")
  elif FLAGS.task < 1 and FLAGS.mem_size > 0:
    # sess.run(model.mem_norm_op)
    data.print_out("Created new model and normalized mem (on chief).")

  # Return the model and needed variables.
  return (model, beam_model, min_length, max_length, checkpoint_dir,
          (global_train_set, dev_set, en_path, fr_path), sv, sess)
示例#35
0
 def get_instance(args):
     """
     Build an orthogonal initializer configured from ``args``.

     The gain is looked up under the ``'gain'`` key (1.0 when absent) and
     converted to float; ``SEED`` from the enclosing scope is the seed.
     """
     raw_gain = args.get('gain', 1.0)
     return tf.orthogonal_initializer(float(raw_gain), seed=SEED)
示例#36
0
def main(model, T, n_epochs, n_batch, n_hidden, learning_rate, decay, nb_v,
         norm, capacity, n_layers, clip_threshold, keep_prob, lr_decay,
         max_n_epoch, grid_name, is_gates, n_hyper_hidden, layer_norm,
         slow_size, fast_size):
    """Train and evaluate a character-level recurrent model.

    Builds the graph for the selected ``model``, trains it on the batches
    produced by ``file_data('train', ...)``, writes a text log under
    ``./output/character/`` and saves a checkpoint after every epoch.
    Validation (followed by a test pass) runs every 500 training steps of an
    epoch, and a final test pass runs after training.

    Args:
        model: one of "HyperDRUM", "RUM", "FSRUM", "LSTM", "EUNN", "GRU".
        T: sequence length of the integer input/target placeholders.
        n_epochs: forwarded to ``file_data`` for the training split; also
            part of the log file name.
        n_batch: training batch size.
        n_hidden: width of the recurrent cell and of the output projection.
        learning_rate: base value assigned to the ``learning_rate`` variable.
        decay: accepted but not read in this function.
        nb_v: batch size of the validation and test splits.
        norm: forwarded as ``normalization`` to ``HyperDRUMCell``; appended
            to the log file name when not None.
        capacity: forwarded to ``EUNNCell``.
        n_layers: number of stacked cells.
        clip_threshold: accepted but not read in this function.
        keep_prob: currently only affects the log file name (see the
            NOTE(review) comments next to the dropout calls).
        lr_decay: when not None, the learning rate is re-assigned at the
            start of each epoch to
            ``learning_rate * lr_decay ** max(epoch + 1 - max_n_epoch, 0)``.
        max_n_epoch: epoch offset used in the decay formula above.
        grid_name: optional sub-directory inserted into the output path.
        is_gates: when truthy, single-layer GRU/DRUM runs dump the gate
            kernel and bias after every validation.
        n_hyper_hidden: hyper-network width used by "HyperDRUM".
        layer_norm: forwarded as ``use_recurrent_dropout`` to
            ``HyperDRUMCell``.
        slow_size: slow-state width used by "FSRUM".
        fast_size: fast-state width used by "FSRUM".

    Raises:
        ValueError: if ``model`` is not one of the supported names.
    """
    max_len_data = 1000000000
    epoch_train, vocab_to_idx = file_data('train', n_batch, max_len_data, T,
                                          n_epochs, None)
    n_input = len(vocab_to_idx)
    epoch_val, _ = file_data('valid', nb_v, max_len_data, T, 10000,
                             vocab_to_idx)
    epoch_test, _ = file_data('test', nb_v, max_len_data, T, 10000,
                              vocab_to_idx)
    n_output = n_input

    x = tf.placeholder("int64", [None, T])
    y = tf.placeholder("int64", [None, T])
    new_lr = tf.placeholder(tf.float32, shape=[], name="new_learning_rate")
    lr = tf.get_variable("learning_rate",
                         shape=[],
                         dtype=tf.float32,
                         trainable=False)
    update = tf.assign(lr, new_lr)

    def build_state(make_leaf):
        """Return the per-layer recurrent state structure for ``model``.

        ``make_leaf(width)`` creates one state entry of the given width; it
        is used both for the placeholders and for the zero-filled arrays fed
        into them, so the two structures always match.
        """

        def one_layer():
            if model == "LSTM":
                return LSTMStateTuple(make_leaf(n_hidden),
                                      make_leaf(n_hidden))
            if model == "HyperDRUM":
                return LSTMStateTuple(make_leaf(n_hyper_hidden),
                                      make_leaf(n_hidden + n_hyper_hidden))
            if model == "FSRUM":
                return ((make_leaf(fast_size), make_leaf(fast_size)),
                        make_leaf(slow_size))
            return make_leaf(n_hidden)

        return tuple([one_layer() for _ in range(n_layers)])

    def zero_state(batch):
        """Zero-filled initial state for ``batch`` sequences."""
        # np.float64 is what the removed ``np.float`` alias resolved to.
        return build_state(
            lambda width: np.zeros((batch, width), dtype=np.float64))

    i_s = build_state(lambda width: tf.placeholder("float", [None, width]))

    input_data = tf.one_hot(x, n_input, dtype=tf.float32)
    if keep_prob is not None:
        # NOTE(review): the result of this dropout is discarded, so no
        # dropout is applied to the input. Left unchanged because using it
        # would also enable dropout during validation/test (keep_prob is a
        # Python constant, not a placeholder).
        tf.nn.dropout(input_data, keep_prob)

    if model == "HyperDRUM":

        def hyperdrum_cell():
            return HyperDRUMCell(n_hidden,
                                 hyper_num_units=n_hyper_hidden,
                                 use_recurrent_dropout=layer_norm,
                                 normalization=norm)

        mcell = MultiRNNCell([hyperdrum_cell() for _ in range(n_layers)],
                             state_is_tuple=True)
    elif model == "RUM":

        def rum_cell():
            return RUMCell(n_hidden,
                           T_norm=1.0,
                           use_zoneout=True,
                           use_layer_norm=True)

        mcell = MultiRNNCell([rum_cell() for _ in range(n_layers)],
                             state_is_tuple=True)
    elif model == "FSRUM":

        def rum_cell():
            return RUMCell(slow_size,
                           T_norm=1.0,
                           use_zoneout=True,
                           use_layer_norm=True)

        def ln_lstm_cell():
            return LN_LSTMCell(fast_size,
                               use_zoneout=True,
                               is_training=True,
                               zoneout_keep_h=True,
                               zoneout_keep_c=True)

        # NOTE(review): fs_rum_cell is not defined in this function (its
        # definition is commented out below), so this branch depends on a
        # definition elsewhere in the file -- confirm one exists.
        # def fs_rum_cell():
        # 	return FSRNNCell([ln_lstm_cell(), ln_lstm_cell()], rum_cell(), 0.65, training = True)
        mcell = MultiRNNCell([fs_rum_cell() for _ in range(n_layers)],
                             state_is_tuple=True)
    elif model == "LSTM":

        def lstm_cell():
            return LSTMCell(n_hidden, initializer=tf.orthogonal_initializer())

        mcell = MultiRNNCell([lstm_cell() for _ in range(n_layers)],
                             state_is_tuple=True)
    elif model == "EUNN":

        def eunn_cell(i):
            return EUNNCell(n_hidden, capacity=capacity, comp=False, name=i)

        mcell = MultiRNNCell([eunn_cell(str(i)) for i in range(n_layers)],
                             state_is_tuple=True)
    elif model == "GRU":

        def gru_cell():
            return GRUCell(n_hidden,
                           kernel_initializer=tf.orthogonal_initializer())

        mcell = MultiRNNCell([gru_cell() for _ in range(n_layers)],
                             state_is_tuple=True)
    else:
        # Previously this fell through and failed later on an unbound mcell.
        raise ValueError("Unsupported model: %r" % (model,))

    hidden_out, states = tf.nn.dynamic_rnn(mcell,
                                           input_data,
                                           dtype=tf.float32,
                                           initial_state=i_s)

    # Output projection applied independently at every time step.
    V_init_val = np.sqrt(6.) / np.sqrt(n_output + n_input)
    V_weights = tf.get_variable(
        "V_weights",
        shape=[n_hidden, n_output],
        dtype=tf.float32,
        initializer=tf.orthogonal_initializer(gain=V_init_val))
    V_bias = tf.get_variable("V_bias",
                             shape=[n_output],
                             dtype=tf.float32,
                             initializer=tf.constant_initializer(0.01))
    hidden_out_list = tf.unstack(hidden_out, axis=1)
    temp_out = tf.stack(
        [tf.matmul(step_out, V_weights) for step_out in hidden_out_list])
    output_data = tf.nn.bias_add(tf.transpose(temp_out, [1, 0, 2]), V_bias)
    if keep_prob is not None:
        # NOTE(review): result discarded here as well; see the note above.
        tf.nn.dropout(output_data, keep_prob)

    cost = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(logits=output_data,
                                                       labels=y))
    correct_pred = tf.equal(tf.argmax(output_data, 2), y)
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

    optimizer = tf.train.AdamOptimizer(learning_rate=lr)

    train_op = optimizer.minimize(cost)

    init = tf.global_variables_initializer()
    for variable in tf.global_variables():
        print(variable.name)

    # Output locations: a text log plus a directory for research artifacts.
    tmp_filename = "./output/character/"
    if grid_name is not None:
        tmp_filename += grid_name + "/"
    tmp_filename += "T=" + str(T) + "/"
    run_tag = str(n_layers) + model + "_N=" + str(n_hidden) + \
        "_B=" + str(n_batch) + "_nb_v=" + str(nb_v) + \
        "_numEpochs=" + str(n_epochs) + "_lr=" + str(learning_rate)
    filename = tmp_filename + run_tag
    if norm is not None:
        filename += "_norm=" + str(norm)
    if keep_prob is not None:
        filename += "_keepProb=" + str(keep_prob)
    filename = filename + ".txt"
    research_filename = tmp_filename + "researchModels" + "/" + run_tag

    def ensure_dir(path):
        """Create ``path`` if missing, tolerating a concurrent creation."""
        if not os.path.exists(path):
            try:
                os.makedirs(path)
            except OSError as exc:
                if exc.errno != errno.EEXIST:
                    raise

    ensure_dir(os.path.dirname(filename))
    ensure_dir(os.path.dirname(research_filename))
    ensure_dir(os.path.dirname(research_filename + "/modelCheckpoint/"))

    validation_losses = []

    def first_epoch_losses(epochs, banner):
        """Evaluate on the first epoch of ``epochs``; return batch losses."""
        batch_losses = []
        for batches in epochs:
            print(banner)
            state = zero_state(nb_v)
            for batch_x, batch_y in batches:
                feed = {x: batch_x, y: batch_y, i_s: state}
                batch_loss, state = sess.run([cost, states], feed_dict=feed)
                batch_losses.append(batch_loss)
            break  # only the first epoch is evaluated
        return batch_losses

    def do_test():
        """Return the mean loss over the first test epoch."""
        test_losses = first_epoch_losses(epoch_test, "Running test...")
        print("test:", )
        mean_loss = sum(test_losses) / len(test_losses)
        print("test Loss= " + "{:.6f}".format(mean_loss))
        return mean_loss

    def do_validation(loss, curr_epoch):
        """Run validation and test, then append one line to the log file."""
        curr_epoch = int(curr_epoch)
        val_losses = first_epoch_losses(epoch_val, "Running validation...")
        print("Validations:", )
        validation_losses.append(sum(val_losses) / len(val_losses))
        print("Validation Loss= " + "{:.6f}".format(validation_losses[-1]))
        test_loss = do_test()
        lr_value = sess.run(lr)
        f.write(
            "Step: %d\t TrLoss: %f\t TestLoss: %f\t ValLoss: %f\t Epoch: %d\t Learning rate: %f\n"
            % (t, loss, test_loss, validation_losses[-1], curr_epoch,
               lr_value))
        f.flush()

    # The log file is closed even if training raises.
    with open(filename, 'w') as f:
        f.write("########\n\n")
        f.write("## \tModel: %s with N=%d" % (model, n_hidden))
        f.write("\n\n")
        f.write("########\n\n")

        saver = tf.train.Saver()
        with tf.Session(
                config=tf.ConfigProto(log_device_placement=False,
                                      allow_soft_placement=False)) as sess:
            print("Session Created")

            sess.run(init)
            if lr_decay is None:
                sess.run(update, feed_dict={new_lr: learning_rate})

            training_state = zero_state(n_batch)
            i = 0  # epoch counter
            t = 0  # global step counter
            val_cnt = 0
            for epoch in epoch_train:
                print("Epoch: ", i)
                if lr_decay is not None:
                    sess.run(update,
                             feed_dict={
                                 new_lr:
                                 learning_rate *
                                 (lr_decay**max(i + 1 - max_n_epoch, 0.0))
                             })

                for step, (batch_x, batch_y) in enumerate(epoch):
                    myfeed_dict = {
                        x: batch_x,
                        y: batch_y,
                        i_s: training_state
                    }
                    # The final state is fed back in as the next initial
                    # state.
                    _, acc, loss, training_state = sess.run(
                        [train_op, accuracy, cost, states],
                        feed_dict=myfeed_dict)
                    lr_value = sess.run(lr)
                    print("Iter " + str(t) + ", Minibatch Loss= " +
                          "{:.6f}".format(loss) + ", Training Accuracy= " +
                          "{:.5f}".format(acc) + ", Epoch " + str(i) +
                          ", Learning rate= " + str(lr_value))
                    t += 1
                    # Validate on every 500th step of the epoch. The
                    # original test `step % 499 == 500` could never be true.
                    if step % 500 == 499:
                        do_validation(loss, i)
                        if is_gates and (model == "GRU" or
                                         model == "DRUM") and (n_layers == 1):
                            if model == "GRU": tmp = "gru"
                            if model == "DRUM": tmp = "drum"
                            gates_prefix = ("rnn/multi_rnn_cell/cell_0/" +
                                            tmp + "_cell/gates/")
                            kernel = [
                                v for v in tf.global_variables()
                                if v.name == gates_prefix + "kernel:0"
                            ][0]
                            bias = [
                                v for v in tf.global_variables()
                                if v.name == gates_prefix + "bias:0"
                            ][0]
                            k, b = sess.run([kernel, bias])
                            np.save(
                                research_filename + "/kernel_" + str(val_cnt),
                                k)
                            np.save(
                                research_filename + "/bias_" + str(val_cnt),
                                b)
                            val_cnt += 1
                i += 1
                saver.save(sess, research_filename + "/modelCheckpoint/model")
            print("Optimization Finished!")

            # do_test() returns a scalar loss, so it is written directly.
            test_loss = do_test()
            f.write("Test result: %d (step) \t%f (loss)\n" % (t, test_loss))
def train_net(batch_size=100,
              t_steps=100,
              l_dim=8*[240],
              act=tf.nn.tanh,
              alpha=0.1,
              beta0=0.,
              beta1=1.,
              beta2=0.,
              noise_str=0.5,
              learning_rate=0.01,
              learning_rate_inv=0.01,
              err_alg=1,
              mode='autoencoder',
              dataset='mnist',
              preprocess=False,
              return_sess=False):
  """
    Build a fully connected network with per-layer local losses and train it.

    The graph is rebuilt from scratch on every call (default graph reset,
    fixed seeds). Each layer l gets feedforward variables W[l], b[l] and,
    for l >= 2 when err_alg is 0, 1 or 2, feedback variables V[l], c[l].
    Training runs for t_steps minibatches; summaries are written under
    /tmp/targ-prop/<run> every 25 steps and train/test loss and accuracy are
    printed every 200 steps.

    Args:
      batch_size: batch size
      t_steps: number of training steps
      l_dim: list of network architecture / dimension of 'hidden' layers, not including input and output layer.
        (The default list is never mutated: a new list is built from it.)
      act: activation applied by every feedforward layer, except that the
        last layer uses softmax in 'classification' mode.
      alpha: in (0,1], scaling for top layer target;  x_tar[-1] = x[-1] - alpha*(dL/dx[-1])
      beta0: regularization constant (only read in the err_alg==2 inverse loss)
      beta1: regularization constant (only read in the err_alg==2 inverse loss)
      beta2: regularization constant (only read in the err_alg==2 inverse loss)
      noise_str: value of standard dev of noise injected into neurons, but only for the L_inv loss functions, and for t_step=0 (decays through training)
      learning_rate: learning rate of the RMSProp optimizers for W and b
      learning_rate_inv: learning rate of the RMSProp optimizers for V and c
      err_alg: error propagation method. 0 for difference target prop. 1 for regularized target prop. 2 for reg target prop with learnable inverses. 3 for backprop.
      mode: 'autoencoder' or 'classification'
      dataset: 'mnist' or 'cifar'
      preprocess: bool. PCA+whiten the data? Good for cifar but whatevs for mnist
      return_sess: should we return the tf session?
    Returns:
      sess: the tf session if return_sess is True; otherwise the session is
        closed and None is returned.
  """

  # Params from conti_dtp.py -- unclear if this is one hyperparam search or the optimal one
  # alpha, L learning rate, L_inv learning rate, noise_inj
  # 0.327736332653, 0.0148893490317, 0.00501149118237, 0.359829566008

  ### DATA ###
  # NOTE(review): any other `dataset` value leaves `data`/`data_test` unassigned,
  # so the first use below raises a NameError.
  if dataset == 'cifar':
    data = ds.cifar10_data()
    data_test = ds.cifar10_data_test()
  elif dataset == 'mnist':
    data = ds.mnist_data()
    data_test = ds.mnist_data_test()

  if preprocess:
    # PCA is fitted on the training inputs only and then applied to the test inputs.
    from sklearn.decomposition import PCA
    pca = PCA(n_components=1000, whiten=True)
    data.inputs = pca.fit_transform(data.inputs)
    data_test.inputs = pca.transform(data_test.inputs)

  if mode == 'autoencoder':
    # autoencoderify
    data.outputs = data.inputs
    data_test.outputs = data_test.inputs

  m_dim = data.inputs.shape[1] # input dimension
  p_dim = data.outputs.shape[1] # output dimension

  # Builds a new list, so the (mutable) default argument is left untouched.
  l_dim = [m_dim] + l_dim + [p_dim] # layer dimensions
  layers = len(l_dim)-1

  ### MODEL ###
  tf.reset_default_graph()
  tf.set_random_seed(1234)
  np.random.seed(1234)

  # placeholders
  x_in = tf.placeholder(tf.float32, shape=[None, m_dim], name='x_in') # Input
  y = tf.placeholder(tf.float32, shape=[None, p_dim], name='y') # Output
  epoch = tf.placeholder(tf.float32, shape=None, name='epoch') # training iteration

  # in dtp code, 0.5/(1 + epoch / 100)
  noise_inj = noise_str/(1.+epoch/100.) # std dev of noise in L_inv loss

  # initialize lists
  # Every list has layers+1 slots, so index l is the layer number, index 0 is
  # the input, and index -1 is the same slot as index `layers`.
  x = (layers+1)*[None] # activations
  W = (layers+1)*[None] # feedforward matrix
  b = (layers+1)*[None] # feedforward bias

  x_ = (layers+1)*[None] # targets
  V = (layers+1)*[None] # feedback matrix
  c = (layers+1)*[None] # feedback bias

  L = (layers+1)*[None] # local layer loss for training W and b
  L_inv = (layers+1)*[None] # local inverse loss for training V and c
  L_inv0 = (layers+1)*[None] # (testing)
  L_inv1 = (layers+1)*[None]
  L_inv2 = (layers+1)*[None]
  eps = (layers+1)*[None] # noise in L_inv term
  eps0 = (layers+1)*[None] # (testing)
  eps1 = (layers+1)*[None]

  vscope = (layers+1)*[None] # variable scopes

  train_op_L = (layers+1)*[None] # training op
  train_op_inv = (layers+1)*[None] # training op

  # init with numpy arrays
  # NOTE(review): these numpy values are overwritten in the "Variable creation"
  # section below, where W[l] and V[l] are rebound to tf variables using
  # tf.orthogonal_initializer(); the `initializer=W[l]` / `initializer=V[l]`
  # alternatives are commented out, so this numpy init currently has no effect
  # on the trained variables.
  from scipy import linalg
  for l in range(1, layers+1):
    low = -np.sqrt(6.0/(l_dim[l-1] + l_dim[l]))
    high = np.sqrt(6.0/(l_dim[l-1] + l_dim[l]))
    W[l] = np.random.uniform(low=low, high=high, size=(l_dim[l-1], l_dim[l])).astype('float32')
    if l_dim[l-1] >= l_dim[l]:
      W[l] = 1.0*linalg.orth(W[l])

  # transpose for autoencoder
  # NOTE(review): `layers/2` relies on Python 2 integer division (this function
  # also uses print statements); under true division range() would get a float.
  if mode == 'autoencoder':
    for l in range(layers/2+1, layers+1):
      W[l] = W[layers+1-l].T

  for l in range(layers, 1, -1):
    if err_alg==0 or err_alg==1:
      #V[l] = np.linalg.pinv(W[l])
      low = -np.sqrt(6.0/(l_dim[l-1] + l_dim[l]))
      high = np.sqrt(6.0/(l_dim[l-1] + l_dim[l]))
      V[l] = np.random.uniform(low=low, high=high, size=(l_dim[l], l_dim[l-1])).astype('float32')
      if l_dim[l] >= l_dim[l-1]:
        V[l] = 1.0*linalg.orth(V[l])
    if err_alg==2:
      pinv = np.linalg.pinv(W[l])
      V[l] = np.concatenate((pinv, np.eye(l_dim[l-1]) - np.dot(W[l], pinv)), axis=0).astype('float32')

  # Variable creation
  # xavier:
  # tf.contrib.layers.variance_scaling_initializer(factor=1.0, mode='FAN_AVG', uniform=True)
  # orth:
  # tf.orthogonal_initializer(0.5)

  # feedforward variables
  # The scope object of each layer is stored in vscope[l] so that f/g below can
  # re-enter it with reuse=True.
  for l in range(1, layers+1):
    with tf.variable_scope('vars_Layer'+str(l)) as vscope[l]:
      b[l] = tf.get_variable( 'b', shape=[1, l_dim[l]], initializer=tf.constant_initializer(0.0))
      W[l] = tf.get_variable( 'W', shape=[l_dim[l-1], l_dim[l]], initializer=tf.orthogonal_initializer())
      #W[l] = tf.get_variable( 'W', initializer=W[l])

  # feedback variables
  # Created for layers `layers` down to 2 only; V[1] and c[1] stay None.
  # For err_alg==3 neither branch runs, so no feedback variables exist.
  for l in range(layers, 1, -1):
    with tf.variable_scope(vscope[l]):
      if err_alg==0 or err_alg==1:
        c[l] = tf.get_variable( 'c', shape=[1, l_dim[l-1]], initializer=tf.constant_initializer(0.0))
        V[l] = tf.get_variable( 'V', shape=[l_dim[l], l_dim[l-1]], initializer=tf.orthogonal_initializer())
        #V[l] = tf.get_variable( 'V', initializer=V[l])
      if err_alg==2:
        c[l] = tf.get_variable( 'c', shape=[1, l_dim[l-1]],  initializer=tf.constant_initializer(0.0))
        V[l] = tf.get_variable( 'V', shape=[l_dim[l]+l_dim[l-1], l_dim[l-1]], initializer=tf.orthogonal_initializer())
        #V[l] = tf.get_variable( 'V', initializer=V[l])

  # feedforward functions
  # f(layer, x) = act(x W + b), using the variables of `layer`.
  def f(layer, x_in, act=tf.nn.tanh):
    with tf.variable_scope(vscope[layer], reuse=True):
      # note: could also just use W[l] and b[l]
      W_ = tf.get_variable('W')
      b_ = tf.get_variable('b')
    return act(tf.add(tf.matmul(x_in, W_), b_), name='x')

  # Feedback functions
  # g(layer, x_target) = act(x_target V + c), using the variables of `layer`.
  def g(layer, x_target, act=tf.nn.tanh):
    with tf.variable_scope(vscope[layer], reuse=True):
      V_ = tf.get_variable('V')
      c_ = tf.get_variable('c')
    return act(tf.add(tf.matmul(x_target, V_), c_), name='x_')

  # Difference correction: x0 + g(x1_target) - g(x1_activation).
  # NOTE(review): not called anywhere in this function; it fetches V_/c_ but
  # then uses the enclosing V[layer]/c[layer] lists instead.
  # NOTE(review): tf.sub is the pre-1.0 TensorFlow name (later tf.subtract) --
  # confirm the targeted TensorFlow version.
  def g_dtp(layer, x1_target, x1_activation, x0_activation, act=tf.nn.tanh):
    with tf.variable_scope(vscope[layer], reuse=True):
      V_ = tf.get_variable('V')
      c_ = tf.get_variable('c')
    return tf.add(x0_activation,
                  tf.sub(act(tf.add(tf.matmul(x1_target,     V[layer], name='x3_'), c[layer], name='x2_'), name='x1_'),
                         act(tf.add(tf.matmul(x1_activation, V[layer], name='x3_'), c[layer], name='x2_'), name='x1_')), name='x_target')

  # Target for layer-1 computed outside the graph through tf.py_func, using
  # ops.relu().f_inv, then subtracting b[layer], then ops.linear().f_inv with
  # W[layer].
  # NOTE(review): V_ and c_ are fetched but unused, and the inverse used is
  # that of `ops.relu()` regardless of the `act` passed to train_net -- confirm
  # this is intended.
  def g_rinv(layer, x1_target, x0_activation):
    with tf.variable_scope(vscope[layer], reuse=True):
      V_ = tf.get_variable('V')
      c_ = tf.get_variable('c')
    relu_inv = tf.py_func(ops.relu().f_inv, [x1_target, x0_activation], [tf.float32], name='x3_')[0]
    add_inv = tf.sub(relu_inv, b[layer], name='x2_')
    return tf.py_func(ops.linear().f_inv, [add_inv,  x0_activation, W[layer]], [tf.float32], name='x1_')[0]

  # TESTING
  # def g_full(layer, input1, input2, act=tf.nn.tanh):
  #   """ generalized g. g(x_[layer], x[layer-1]) -> x_[layer-1] """
  #   with tf.name_scope(scope[l]):
  #     V[layer] = tf.get_variable( 'V' )
  #     c[layer] = tf.get_variable( 'c' )
  #     return act(tf.matmul(tf.concat( 1, [input1, input2] ), V[layer]) + c[layer], name='g_full')

  # def g_full2(layer, input1, input2, input3, act=tf.nn.tanh):
  #   """ generalized g. g(x_[layer], x[layer-1]) -> x_[layer-1] """
  #   with tf.name_scope('Layer'):
  #     V[layer] = tf.get_variable( 'V' )
  #     c[layer] = tf.get_variable( 'c' )
  #   return act(tf.matmul(tf.concat( 1, [input1, input2, input3] ), V[layer]) + c[layer], name='g_full')
  # /TESTING

  # forward propagation
  x[0] = x_in
  for l in range(1, layers+1):
    with tf.name_scope('layer'+str(l)+'_ff'):
      if l==layers and mode=='classification':
        # last layer
        x[layers] = f(layers, x[layers-1], tf.nn.softmax)
      else:
        # other layers
        x[l] = f(l, x[l-1], act)

  # top layer loss / top layer target
  # L[-1] = tf.nn.softmax_cross_entropy_with_logits(x[-1], y)
  # Both modes currently use mean squared error as the global loss; the
  # cross-entropy variant for classification is commented out.
  with tf.name_scope('top_layer'):
    if mode == 'classification':
      #L[-1] = tf.reduce_mean(-tf.reduce_sum(y*tf.log(x[-1] + 1e-10), reduction_indices=[1]), name='global_loss') # add 1e-10 so you don't get nan'd
      L[-1] = tf.reduce_mean((x[-1] - y)**2, name='global_loss')
    elif mode == 'autoencoder':
      L[-1] = tf.reduce_mean((x[-1] - y)**2, name='global_loss')
    x_[-1] = tf.sub(x[-1], alpha*(x[-1] - y), name='x_target_top') 

  # feedback propagation
  # Targets are propagated from the top layer down to layer 1.
  # NOTE(review): only err_alg 0 and 1 assign x_[l-1]; for err_alg==2 the lower
  # targets stay None although the loss section below reads them.
  for l in range(layers, 1, -1):
    with tf.name_scope('layer'+str(l)+'_fb'):
      if err_alg==0:
        x_[l-1] = tf.add(x[l-1] - g(l, x[l], act), g(l, x_[l], act), name='x_target')
      if err_alg==1:
        x_[l-1] = g_rinv(l, x_[l], x[l-1])

  # noise terms for loss functions
  if err_alg==0 or err_alg==2:
    for l in range(1, layers+1):
      with tf.name_scope('layer'+str(l)+'_eps'):
        eps[l]  = tf.random_normal(tf.shape(x[l]), mean=0, stddev=noise_inj, name='eps'+str(l-1))
        #eps0[l] = noise_inj*tf.random_normal(tf.shape(x[l]), mean=0, stddev=1., name='eps0'+str(l-1)) # uh, tf.shape(x[l-1]) right?
        #eps1[l] = noise_inj*tf.random_normal(tf.shape(x[l]), mean=0, stddev=1., name='eps1'+str(l-1)) # uh, tf.shape(x[l-1]) right?

  # loss functions
  # Local losses are set for layers 1..layers-1; L[layers] is the same slot as
  # L[-1], i.e. the global loss assigned above.
  for l in range(1, layers): # FOR NOW; LAYERS+1, BUT SHOULD BE LAYERS
    with tf.name_scope('layer'+str(l)+'_loss'):
      if err_alg!=3:
        L[l] = tf.reduce_mean((x[l] - tf.stop_gradient(x_[l]))**2, name='Loss') # note: stop_gradients not necessary
  for l in range(2, layers+1):
    with tf.name_scope('layer'+str(l)+'_loss_inv'):
      if err_alg==0:
        L_inv[l] = tf.reduce_mean((g(l, tf.stop_gradient(f(l, x[l-1]+eps[l-1], act)), act) - tf.stop_gradient(x[l-1]+eps[l-1]))**2, name='L_inv')
      if err_alg==1:
        pass
      if err_alg==2:
        # STILL TESTING
        # NOTE(review): this branch calls g_full, which is only defined in the
        # commented-out TESTING section above, and adds eps0/eps1 entries that
        # are never assigned (their assignments are commented out).
        # L_inv0 - g as left inverse of f; regardless of what x_0 is, g should send f(x) to x. just use, for now, the activation x+eps
        L_inv0[l] = tf.reduce_mean((g_full(l, f(l, x[l-1]+eps0[l-1], act), x[l-1], act) - (x[l-1]+eps0[l-1]))**2, name='L_inv0')
        # L_inv1 - g as the right inverse of f; regardless of what x_0 is, f should send g(y) to y; make sure to use x_targ as y because that's what matters
        L_inv1[l] = tf.reduce_mean((f(l, g_full(l, x_[l]+eps1[l], x[l-1], act), act) - (x_[l]+eps1[l]))**2, name='L_inv1')
        # L_inv2 - g should send y close to x_0
        L_inv2[l] = tf.reduce_mean((g_full(l, x_[l], x[l-1], act) - x[l-1])**2, name='L_inv2')
        L_inv[l] = beta0*L_inv0[l] + beta1*L_inv1[l] + beta2*L_inv2[l]
        # L_inv[l] = tf.add(L_inv1[l], beta*L_inv2[l], name='L_inv')
        # L_inv[l] = tf.add(tf.reduce_mean(0.5*(f(l, g_full(l, x_[l]+eps[l], x[l], x[l-1])) - x_[l]-eps[l])**2), beta*tf.reduce_mean(0.5*(g_full(l, x_[l], x[l], x[l-1]) - x[l-1])**2), name='L_inv') # triple check -- where to put beta, where to put reduce_means? 

  # optimizers
  # Target-prop variants: one RMSProp op per layer, each restricted to that
  # layer's own W and b (the top layer minimises the global loss).
  if err_alg!=3:
    for l in range(1, layers+1):
      with tf.name_scope('layer'+str(l)+'_opts'):
        train_op_L[l] = tf.train.RMSPropOptimizer(learning_rate, name='Opt').minimize(L[l], var_list=[W[l], b[l]])
  # Inverse losses train only the feedback variables V and c of each layer.
  if err_alg==0 or err_alg==2:
    for l in range(2, layers+1):
      with tf.name_scope('layer'+str(l)+'_opts_inv'):
        train_op_inv[l] = tf.train.RMSPropOptimizer(learning_rate_inv, name='Opt_inv').minimize(L_inv[l], var_list=[V[l], c[l]])
  # Backprop: a single op minimising the global loss over every W and b.
  if err_alg==3:
    train_op_L[-1] = tf.train.RMSPropOptimizer(learning_rate, name='Opt').minimize(L[-1], var_list=[i for i in W+b if i is not None])

  if mode == 'classification':
    correct_prediction = tf.equal(tf.argmax(x[-1], 1), tf.argmax(y,1)) # note: normally, tf.nn.softmax(x[-1]), but we already softmax'd
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
  elif mode == 'autoencoder':
    # Placeholder value: no accuracy is computed in autoencoder mode.
    accuracy = tf.constant(0) # :(

  # clean up
  # Drop unused slots so the lists can be passed straight to sess.run;
  # train_op_inv ends up empty when err_alg is 1 or 3.
  train_op_L = [i for i in train_op_L if i is not None]
  train_op_inv = [i for i in train_op_inv if i is not None]

  # tensorboard
  with tf.name_scope('key_summaries'):
    tf.summary.scalar('accuracy', accuracy)
    tf.summary.scalar('global_loss', L[-1])
  with tf.name_scope('layer_losses'):
    for l in range(layers+1):
      if L[l] is not None:
        tf.summary.scalar('L'+str(l), L[l])
      if L_inv[l] is not None:
        tf.summary.scalar('L_inv'+str(l), L_inv[l])
  # eval(varlist) resolves the string to the local list of that name (W, V, b, c).
  with tf.name_scope('weights'):
    for varlist in ['W', 'V', 'b', 'c']:
      for iv, var in enumerate(eval(varlist)):
        if var is not None:
          tf.summary.histogram(varlist+str(iv), var)
  with tf.name_scope('grads'):
    for varlist in ['W', 'b']:
      for iv, var in enumerate(eval(varlist)):
        if var is not None and L[iv] is not None:
          tf.summary.histogram('grad'+varlist+str(iv), tf.gradients(L[iv], [var])[0]) # does this actually recompute gradients? if so, whatevs
    for varlist in ['V', 'c']:
      for iv, var in enumerate(eval(varlist)):
        if var is not None and L_inv[iv] is not None:
          tf.summary.histogram('grad'+varlist+str(iv), tf.gradients(L_inv[iv], [var])[0])

  merged_summary_op = tf.summary.merge_all()

  ### TRAIN ###
  sess = tf.Session()
  sess.run(tf.global_variables_initializer())

  # Each call logs into a new numbered sub-directory of /tmp/targ-prop/.
  make_dir('/tmp/targ-prop/')
  run = str(len(os.listdir('/tmp/targ-prop'))+1)
  print 'Run: '+run
  summary_writer = tf.summary.FileWriter('/tmp/targ-prop/'+str(run), sess.graph)

  for i in range(t_steps):
    # One step: update the feedback (inverse) variables first, then the
    # feedforward variables, both on the same minibatch.
    x_batch, y_batch = data.next_batch(batch_size)
    feed_dict = {x_in: x_batch, y: y_batch, epoch: i}    
    sess.run(train_op_inv, feed_dict=feed_dict)
    sess.run(train_op_L, feed_dict=feed_dict)

    if i % 25 == 0:
      loss_val, summary_str, acc_val = sess.run([L[-1], merged_summary_op, accuracy], feed_dict=feed_dict)
      summary_writer.add_summary(summary_str, i)

    if i % 200 == 0:
      # Evaluate on the whole test set. loss_val/acc_val printed here come from
      # the i % 25 branch above, which also runs whenever this one does.
      x_test, y_test = data_test.inputs, data_test.outputs
      feed_dict = {x_in: x_test, y: y_test, epoch: i}
      loss_val_test, acc_val_test = sess.run([L[-1], accuracy], feed_dict=feed_dict)
      print "iter:", "%04d" % (i), \
        "| TRAINING ", \
        "loss:", "{:.4f}".format(loss_val), \
        "accuracy:", "{:.4f}".format(acc_val), \
        "| TEST ", \
        "loss:", "{:.4f}".format(loss_val_test), \
        "accuracy:", "{:.4f}".format(acc_val_test)

  print "finished"

  if return_sess:
    return sess
  else:
    sess.close()
    return
示例#38
0
 def rnn_cell(self, reuse=False):
     """Create a GRU cell with ``self.rnn_size`` units.

     The cell's kernel uses an orthogonal initializer and ``reuse`` is
     forwarded to the cell constructor unchanged.
     """
     gru_cls = tf.nn.rnn_cell.GRUCell
     num_units = self.rnn_size
     orthogonal = tf.orthogonal_initializer()
     return gru_cls(num_units, kernel_initializer=orthogonal, reuse=reuse)
示例#39
0
    def __init__(self,seq_len=200,first_read=50,rnn_size=200):
        """Build the training and generation graphs of a character-level GRU model.

        The model consumes exactly one sequence of ``seq_len`` character
        indices (``self.input``), is trained to predict the next character at
        every step after the first ``first_read`` steps, and additionally
        unrolls 100 greedy generation steps that continue the input sequence.

        Args:
            seq_len: number of characters in the input sequence.
            first_read: number of leading steps whose outputs are excluded
                from the loss.
            rnn_size: number of units in the GRU cell.

        Attributes set:
            input: int32 placeholder of shape [1, seq_len].
            loss, optimizer: training loss tensor and Adam training op.
            predictions: list of 100 tensors holding the generated indices.
            sess: a session with all variables initialised.
        """

        self.seq_len = seq_len
        self.first_read = first_read

        #dictionary of possible characters
        self.chars = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z',\
                      '1','2','3','4','5','6','7','8','9','0','-','.',',','!','?','(',')','\'','"',' ']
        self.num_chars = len(self.chars)

        #dictionary mapping characters to indices
        self.char2idx = {char:i for (i,char) in enumerate(self.chars)}
        self.idx2char = {i:char for (i,char) in enumerate(self.chars)}

        # ---- training portion of language model ----

        # input sequence of character indices, fed as a batch of one sequence;
        # everything below (squeeze on axis 0, one_hot[0, ...]) relies on that
        self.input = tf.placeholder(tf.int32,[1,seq_len])

        #convert to one hot
        one_hot = tf.one_hot(self.input,self.num_chars)

        #rnn layer
        self.gru = GRUCell(rnn_size)
        # the single sequence in the batch always spans the full seq_len steps
        outputs, states = tf.nn.dynamic_rnn(self.gru, one_hot,sequence_length=[seq_len],dtype=tf.float32)
        outputs = tf.squeeze(outputs,[0])

        #ignore all outputs during first read steps
        # (the last output is dropped too: it has no following character to predict)
        outputs = outputs[first_read:-1]

        #softmax logit to predict next character (actual softmax is applied in cross entropy function)
        logits = tf.layers.dense(outputs,self.num_chars,None,True,tf.orthogonal_initializer(),name='dense')

        #target character at each step (after first read chars) is following character
        targets = one_hot[0,first_read+1:]

        #loss and train functions
        self.loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits,labels=targets))
        self.optimizer = tf.train.AdamOptimizer(0.0002,0.9,0.999).minimize(self.loss)

        # ---- generation portion of language model ----

        #use output and state from last word in training sequence
        state = tf.expand_dims(states[-1],0)
        output = one_hot[:,-1]

        #save predicted characters to list
        self.predictions = []

        #generate 100 new characters that come after input sequence
        for _ in range(100):

            #run GRU cell and softmax
            output,state = self.gru(output,state)
            # reuse=True shares the 'dense' layer weights with the training graph
            logits = tf.layers.dense(output,self.num_chars,None,True,tf.orthogonal_initializer(),name='dense',reuse=True)

            #get index of most probable character
            output = tf.argmax(tf.nn.softmax(logits),1)

            #save predicted character to list
            self.predictions.append(output)

            #one hot and cast to float for GRU API
            output = tf.cast(tf.one_hot(output,self.num_chars),tf.float32)

        #init op
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
示例#40
0
    def build_inference(self, reuse=False):
        """
        Build the one-step inference graph that produces the next recurrent states.

        The sentence is encoded by an LSTM (plus a second LSTM over
        `sentence_bw` when `options['bidirectional_lstm_sentence']` is set),
        the video LSTM is advanced one step on `video_feat`, attention weights
        over the encoded words are computed from the interactor and video
        hidden states, and the interactor LSTM is advanced one step on the
        attended word feature. A sigmoid over a linear layer yields the
        proposal scores.

        Args:
            reuse: forwarded to the 'sentence_encoding', 'interactor' and
                'word_attention' variable scopes.

        Returns:
            (inputs, outputs): two dicts of tensors. `inputs` maps
            'video_feat', 'sentence', 'sentence_mask', 'video_c_state',
            'video_h_state', 'interactor_c_state', 'interactor_h_state' (and
            'sentence_bw' in bidirectional mode) to placeholders; `outputs`
            maps 'proposal_score' and the four updated state names to the
            resulting tensors.
        """
        opts = self.options
        rnn_size = opts['rnn_size']
        max_len = opts['max_sentence_len']
        bidirectional = opts['bidirectional_lstm_sentence']
        # width of one encoded word: doubled once backward states are concatenated
        word_dim = (1 + int(bidirectional)) * rnn_size

        def new_lstm_cell():
            # every LSTM in this graph shares the same configuration
            return tf.contrib.rnn.LSTMCell(
                num_units=rnn_size,
                state_is_tuple=True,
                initializer=tf.orthogonal_initializer()
            )

        def state_placeholder():
            # one slot of an LSTM state (c or h) fed in by the caller
            return tf.placeholder(tf.float32, [None, rnn_size])

        inputs = {}
        sentence_shape = [None, max_len, opts['word_embed_size']]

        video_feat = tf.placeholder(tf.float32, [None, opts['video_feat_dim']], name='video_feat')
        sentence = tf.placeholder(tf.float32, sentence_shape)
        sentence_mask = tf.placeholder(tf.float32, [None, None])

        if bidirectional:
            sentence_bw = tf.placeholder(tf.float32, sentence_shape)
            inputs['sentence_bw'] = sentence_bw

        video_c_in = state_placeholder()
        video_h_in = state_placeholder()
        interactor_c_in = state_placeholder()
        interactor_h_in = state_placeholder()

        for key, tensor in (
                ('video_feat', video_feat),
                ('sentence', sentence),
                ('sentence_mask', sentence_mask),
                ('video_c_state', video_c_in),
                ('video_h_state', video_h_in),
                ('interactor_c_state', interactor_c_in),
                ('interactor_h_state', interactor_h_in)):
            inputs[key] = tensor

        video_state = tf.nn.rnn_cell.LSTMStateTuple(video_c_in, video_h_in)
        interactor_state = tf.nn.rnn_cell.LSTMStateTuple(interactor_c_in, interactor_h_in)

        batch_size = tf.shape(video_feat)[0]

        sentence_cell = new_lstm_cell()
        video_cell = new_lstm_cell()
        interactor_cell = new_lstm_cell()

        with tf.variable_scope('sentence_encoding', reuse=reuse):
            # number of valid words per sentence, taken from the mask
            sequence_length = tf.reduce_sum(sentence_mask, axis=-1)
            initial_state = sentence_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

            sentence_states, _ = tf.nn.dynamic_rnn(
                cell=sentence_cell,
                inputs=sentence,
                sequence_length=sequence_length,
                initial_state=initial_state,
                dtype=tf.float32
            )

            if bidirectional:
                sentence_cell_bw = new_lstm_cell()
                with tf.variable_scope('sentence_bw'):
                    states_bw, _ = tf.nn.dynamic_rnn(
                        cell=sentence_cell_bw,
                        inputs=sentence_bw,
                        sequence_length=sequence_length,
                        initial_state=initial_state,
                        dtype=tf.float32
                    )
                    # flip the valid part of each backward sequence before concatenating
                    int_lengths = tf.to_int32(sequence_length)
                    states_bw = tf.reverse_sequence(states_bw, seq_lengths=int_lengths, seq_axis=1)
                sentence_states = tf.concat([sentence_states, states_bw], axis=-1)

        with tf.variable_scope('interactor', reuse=reuse):
            flat_word_states = tf.reshape(sentence_states, [-1, word_dim])

            # advance the video LSTM by one step
            with tf.variable_scope('video_rnn'):
                _, next_video_state = video_cell(inputs=video_feat, state=video_state)

            next_video_c, next_video_h = next_video_state

            # attention over words, scored by a small two-layer network fed with
            # the fed-in interactor hidden state and the updated video hidden state
            with tf.variable_scope('word_attention', reuse=reuse):
                joint_h = tf.concat([interactor_h_in, next_video_h], axis=-1)
                tiled_h = tf.tile(joint_h, [1, max_len])
                tiled_h = tf.reshape(tiled_h, [-1, 2 * rnn_size])

                attention_input = tf.concat([tiled_h, flat_word_states], axis=-1)

                attention_hidden = tf.contrib.layers.fully_connected(
                    inputs=attention_input,
                    num_outputs=opts['attention_hidden_size'],
                    activation_fn=tf.nn.tanh,
                    weights_initializer=tf.contrib.layers.xavier_initializer()
                )
                attention_logits = tf.contrib.layers.fully_connected(
                    inputs=attention_hidden,
                    num_outputs=1,
                    activation_fn=None,
                    weights_initializer=tf.contrib.layers.xavier_initializer()
                )

            # one normalised weight per word position
            attention_logits = tf.reshape(attention_logits, [-1, max_len])
            attention_score = tf.nn.softmax(attention_logits, dim=-1)
            attention_score = tf.reshape(attention_score, [-1, 1, max_len])

            # weighted sum of the word states (batched matmul)
            attended_words = tf.matmul(attention_score, sentence_states)
            attended_words = tf.reshape(attended_words, [-1, word_dim])

            # advance the interactor LSTM by one step
            interactor_input = tf.concat([next_video_h, attended_words], axis=-1)

            with tf.variable_scope('interactor_rnn'):
                _, next_interactor_state = interactor_cell(inputs=interactor_input, state=interactor_state)
            next_interactor_c, next_interactor_h = next_interactor_state

            with tf.variable_scope('predict_proposal'):
                logit_output = tf.contrib.layers.fully_connected(
                    inputs=next_interactor_h,
                    num_outputs=opts['num_anchors'],
                    activation_fn=None
                )

                # score
                proposal_score = tf.sigmoid(logit_output, name='proposal_scores')

        outputs = {}
        outputs['proposal_score'] = proposal_score
        outputs['video_c_state'] = next_video_c
        outputs['video_h_state'] = next_video_h
        outputs['interactor_c_state'] = next_interactor_c
        outputs['interactor_h_state'] = next_interactor_h

        return inputs, outputs
示例#41
0
    def build_caption_greedy_inference(self, reuse=False):
        inputs = {}
        outputs = {}

        # proposal feature sequences (the localized proposals/events can be of different length, I set a 'max_proposal_len' to make it easy for GPU processing)
        proposal_feats = tf.placeholder(tf.float32, [
            None, self.options['max_proposal_len'],
            self.options['video_feat_dim']
        ])
        # combination of forward and backward hidden state, which encode event context information
        event_hidden_feats = tf.placeholder(
            tf.float32, [None, 2 * self.options['rnn_size']])

        inputs['event_hidden_feats'] = event_hidden_feats
        inputs['proposal_feats'] = proposal_feats

        # batch size for inference, depends on how many proposals are generated for a video
        eval_batch_size = tf.shape(proposal_feats)[0]

        # intialize the rnn cell for captioning
        rnn_cell_caption = tf.contrib.rnn.LSTMCell(
            num_units=self.options['rnn_size'],
            state_is_tuple=True,
            initializer=tf.orthogonal_initializer())

        def get_rnn_cell():
            return tf.contrib.rnn.LSTMCell(
                num_units=self.options['rnn_size'],
                state_is_tuple=True,
                initializer=tf.orthogonal_initializer())

        # multi-layer LSTM
        multi_rnn_cell_caption = tf.contrib.rnn.MultiRNNCell(
            [get_rnn_cell() for _ in range(self.options['num_rnn_layers'])],
            state_is_tuple=True)

        # start word
        word_id = tf.fill([eval_batch_size], self.options['vocab']['<START>'])
        word_id = tf.to_int64(word_id)
        word_ids = tf.expand_dims(word_id, axis=-1)

        # probability (confidence) for the predicted word
        word_confidences = tf.expand_dims(tf.fill([eval_batch_size], 1.),
                                          axis=-1)

        # initial state of caption generation
        initial_state = multi_rnn_cell_caption.zero_state(
            batch_size=eval_batch_size, dtype=tf.float32)
        state = initial_state

        with tf.variable_scope('caption_module', reuse=reuse) as caption_scope:

            # initialize memory cell and hidden output, note that the returned state is a tuple containing all states for each cell in MultiRNNCell
            state = multi_rnn_cell_caption.zero_state(
                batch_size=eval_batch_size, dtype=tf.float32)

            proposal_feats_reshape = tf.reshape(
                proposal_feats, [-1, self.options['video_feat_dim']],
                name='video_feat_reshape')

            ## the caption data should be prepared in equal length, namely, with length of 'caption_seq_len'
            ## use caption mask data to mask out loss from sequence after end of token (<END>)
            # only the first loop create variable, the other loops reuse them, need to give variable scope name to each variable, otherwise tensorflow will create a new one
            for i in range(self.options['caption_seq_len'] - 1):

                if i > 0:
                    caption_scope.reuse_variables()

                # word embedding
                word_embed = self.build_caption_embedding(word_id)

                # get attention, receive both hidden state information (previous generated words) and video feature
                # state[:, 1] return all hidden states for all cells in MultiRNNCell
                h_state = tf.concat([s[1] for s in state], axis=-1)
                h_state_tile = tf.tile(h_state,
                                       [1, self.options['max_proposal_len']])
                h_state_reshape = tf.reshape(h_state_tile, [
                    -1,
                    self.options['num_rnn_layers'] * self.options['rnn_size']
                ])

                # repeat to match each feature vector in the localized proposal
                event_hidden_feats_tile = tf.tile(
                    event_hidden_feats, [1, self.options['max_proposal_len']])
                event_hidden_feats_reshape = tf.reshape(
                    event_hidden_feats_tile,
                    [-1, 2 * self.options['rnn_size']])

                feat_state_concat = tf.concat([
                    proposal_feats_reshape, h_state_reshape,
                    event_hidden_feats_reshape
                ],
                                              axis=-1,
                                              name='feat_state_concat')
                #feat_state_concat = tf.concat([tf.reshape(tf.tile(word_embed, [1, self.options['max_proposal_len']]), [-1, self.options['word_embed_size']]), proposal_feats_reshape, h_state_reshape, event_hidden_feats_reshape], axis=-1, name='feat_state_concat')

                # use a two-layer network to model temporal soft attention over proposal feature sequence when predicting next word (dynamic)
                with tf.variable_scope('attention',
                                       reuse=reuse) as attention_scope:
                    attention_layer1 = tf.contrib.layers.fully_connected(
                        inputs=feat_state_concat,
                        num_outputs=self.options['attention_hidden_size'],
                        activation_fn=tf.nn.tanh,
                        weights_initializer=tf.contrib.layers.
                        xavier_initializer())
                    attention_layer2 = tf.contrib.layers.fully_connected(
                        inputs=attention_layer1,
                        num_outputs=1,
                        activation_fn=None,
                        weights_initializer=tf.contrib.layers.
                        xavier_initializer())

                # reshape to match
                attention_reshape = tf.reshape(
                    attention_layer2, [-1, self.options['max_proposal_len']],
                    name='attention_reshape')
                attention_score = tf.nn.softmax(attention_reshape,
                                                dim=-1,
                                                name='attention_score')
                attention = tf.reshape(
                    attention_score, [-1, 1, self.options['max_proposal_len']],
                    name='attention')

                # attended video feature
                attended_proposal_feat = tf.matmul(
                    attention, proposal_feats, name='attended_proposal_feat')
                attended_proposal_feat_reshape = tf.reshape(
                    attended_proposal_feat,
                    [-1, self.options['video_feat_dim']],
                    name='attended_proposal_feat_reshape')

                # whether to use proposal contexts to help generate the corresponding caption
                if self.options['no_context']:
                    proposal_feats_full = attended_proposal_feat_reshape
                else:
                    # whether to use gating function to combine the proposal contexts
                    if self.options['context_gating']:
                        # model a gate to weight each element of context and feature
                        attended_proposal_feat_reshape = tf.nn.tanh(
                            attended_proposal_feat_reshape)
                        with tf.variable_scope('context_gating', reuse=reuse):
                            '''
                            context_feats_transform = tf.contrib.layers.fully_connected(
                                inputs=event_hidden_feats,
                                num_outputs=self.options['video_feat_dim'],
                                activation_fn=None,
                                weights_initializer=tf.contrib.layers.xavier_initializer()
                            )
                            '''
                            context_feats_transform = event_hidden_feats

                            proposal_feats_transform = tf.contrib.layers.fully_connected(
                                inputs=attended_proposal_feat_reshape,
                                num_outputs=2 * self.options['rnn_size'],
                                activation_fn=tf.nn.tanh,
                                weights_initializer=tf.contrib.layers.
                                xavier_initializer())

                            gate = tf.contrib.layers.fully_connected(
                                inputs=tf.concat([
                                    word_embed, h_state,
                                    context_feats_transform,
                                    proposal_feats_transform
                                ],
                                                 axis=-1),
                                num_outputs=2 * self.options['rnn_size'],
                                activation_fn=tf.nn.sigmoid,
                                weights_initializer=tf.contrib.layers.
                                xavier_initializer())

                            gated_context_feats = tf.multiply(
                                context_feats_transform, gate)
                            gated_proposal_feats = tf.multiply(
                                proposal_feats_transform, 1. - gate)
                            proposal_feats_full = tf.concat(
                                [gated_context_feats, gated_proposal_feats],
                                axis=-1)

                    else:
                        proposal_feats_full = tf.concat([
                            event_hidden_feats, attended_proposal_feat_reshape
                        ],
                                                        axis=-1)

                # proposal feature embedded into word space
                proposal_feat_embed = self.build_video_feat_embedding(
                    proposal_feats_full)

                # get next state
                caption_output, state = multi_rnn_cell_caption(
                    tf.concat([proposal_feat_embed, word_embed], axis=-1),
                    state)

                # predict next word
                with tf.variable_scope('logits', reuse=reuse) as logits_scope:
                    logits = tf.contrib.layers.fully_connected(
                        inputs=caption_output,
                        num_outputs=self.options['vocab_size'],
                        activation_fn=None)

                softmax = tf.nn.softmax(logits, name='softmax')
                word_id = tf.argmax(softmax, axis=-1)
                word_confidence = tf.reduce_max(softmax, axis=-1)
                word_ids = tf.concat(
                    [word_ids, tf.expand_dims(word_id, axis=-1)], axis=-1)
                word_confidences = tf.concat([
                    word_confidences,
                    tf.expand_dims(word_confidence, axis=-1)
                ],
                                             axis=-1)

        #sentence_confidences = tf.reduce_sum(tf.log(tf.clip_by_value(word_confidences, 1e-20, 1.)), axis=-1)
        word_confidences = tf.log(tf.clip_by_value(word_confidences, 1e-20,
                                                   1.))

        outputs['word_ids'] = word_ids
        outputs['word_confidences'] = word_confidences

        return inputs, outputs
示例#42
0
    def build_train(self):
        """
        Assemble the training graph of the sentence-conditioned proposal model.

        A sentence LSTM (optionally with a second, backward pass) encodes the
        word sequence. For each of ``options['sample_len']`` video steps a
        video LSTM is advanced, a two-layer soft attention over the encoded
        words is evaluated, and an "interactor" LSTM consumes the video hidden
        state together with the attended word feature to emit one logit per
        anchor. The loss is a positively-weighted sigmoid cross entropy,
        optionally multiplied by ``anchor_mask`` and averaged with
        ``video_feat_mask``.

        Side effect: registers a ``'proposal_loss'`` scalar summary.

        Returns:
            (inputs, outputs): ``inputs`` maps names to the placeholders
            created here; ``outputs`` carries ``'loss'`` and ``'reg_loss'``
            (sum of ``tf.nn.l2_loss`` over all trainable variables).
        """
        opts = self.options
        num_anchors = opts['num_anchors']
        rnn_size = opts['rnn_size']
        bidirectional = opts['bidirectional_lstm_sentence']
        # width of one encoded word; doubled when the backward states are concatenated
        word_state_dim = (1 + int(bidirectional)) * rnn_size

        def _float_input(shape, name=None):
            # float32 placeholder factory
            return tf.placeholder(tf.float32, shape, name=name)

        def _new_lstm():
            # plain LSTM cell with orthogonal weight initialisation
            return tf.contrib.rnn.LSTMCell(
                num_units=rnn_size,
                state_is_tuple=True,
                initializer=tf.orthogonal_initializer())

        def _with_dropout(cell):
            # the fed `dropout` value is a drop rate, hence the `1.0 - dropout`
            return tf.contrib.rnn.DropoutWrapper(
                cell,
                input_keep_prob=1.0 - dropout,
                output_keep_prob=1.0 - dropout)

        # ---- placeholders (creation order kept: unnamed ones get numbered names) ----
        video_feat = _float_input([None, None, opts['video_feat_dim']], name='video_feat')
        video_feat_mask = _float_input([None, None])
        anchor_mask = _float_input([None, None, num_anchors])
        sentence = _float_input([None, None, opts['word_embed_size']])
        sentence_mask = _float_input([None, None])

        inputs = {}
        if bidirectional:
            sentence_bw = _float_input(
                [None, opts['max_sentence_len'], opts['word_embed_size']])
            inputs['sentence_bw'] = sentence_bw

        for key, tensor in (('video_feat', video_feat),
                            ('video_feat_mask', video_feat_mask),
                            ('anchor_mask', anchor_mask),
                            ('sentence', sentence),
                            ('sentence_mask', sentence_mask)):
            inputs[key] = tensor

        # densely annotated proposal labels
        proposal = tf.placeholder(tf.int32, [None, None, num_anchors], name='proposal')
        inputs['proposal'] = proposal

        # per-anchor weights for positive / negative labels (class imbalance)
        proposal_weight = _float_input([num_anchors, 2], name='proposal_weight')
        inputs['proposal_weight'] = proposal_weight

        # dropout rate shared by all wrapped cells
        dropout = _float_input(None)
        inputs['dropout'] = dropout

        # dynamic batch size (scalar tensor)
        batch_size = tf.shape(video_feat)[0]

        # ---- recurrent cells: create all three first, then wrap them ----
        sentence_cell = _new_lstm()
        video_cell = _new_lstm()
        interactor_cell = _new_lstm()

        sentence_cell = _with_dropout(sentence_cell)
        video_cell = _with_dropout(video_cell)
        interactor_cell = _with_dropout(interactor_cell)

        # ---- sentence encoder ----
        with tf.variable_scope('sentence_encoding'):
            sentence_lengths = tf.reduce_sum(sentence_mask, axis=-1)
            encoder_zero_state = sentence_cell.zero_state(
                batch_size=batch_size, dtype=tf.float32)

            sentence_states, sentence_final_state = tf.nn.dynamic_rnn(
                cell=sentence_cell,
                inputs=sentence,
                sequence_length=sentence_lengths,
                initial_state=encoder_zero_state,
                dtype=tf.float32)

            if bidirectional:
                # the backward cell is not dropout-wrapped
                backward_cell = _new_lstm()
                with tf.variable_scope('sentence_bw'):
                    backward_states, sentence_final_state_bw = tf.nn.dynamic_rnn(
                        cell=backward_cell,
                        inputs=sentence_bw,
                        sequence_length=sentence_lengths,
                        initial_state=encoder_zero_state,
                        dtype=tf.float32)
                    # flip back so position t lines up with the forward states
                    backward_states = tf.reverse_sequence(
                        backward_states,
                        seq_lengths=tf.to_int32(sentence_lengths),
                        seq_axis=1)
                sentence_states = tf.concat([sentence_states, backward_states], axis=-1)

        # grows by one slice along axis 1 per video step
        logit_outputs = tf.fill([batch_size, 0, num_anchors], 0.)

        # ---- video / sentence interaction, unrolled in Python ----
        with tf.variable_scope('interactor') as interactor_scope:
            interactor_state = interactor_cell.zero_state(
                batch_size=batch_size, dtype=tf.float32)
            video_state = video_cell.zero_state(
                batch_size=batch_size, dtype=tf.float32)
            flat_word_states = tf.reshape(sentence_states, [-1, word_state_dim])

            for step in range(opts['sample_len']):
                if step > 0:
                    # variables exist after the first step; share them from now on
                    interactor_scope.reuse_variables()

                # advance the video LSTM by one frame
                with tf.variable_scope('video_rnn'):
                    _, video_state = video_cell(
                        inputs=video_feat[:, step, :], state=video_state)

                # score every word against the current interactor/video hidden states
                with tf.variable_scope('word_attention'):
                    query = tf.concat([interactor_state[1], video_state[1]], axis=-1)
                    query = tf.tile(query, [1, opts['max_sentence_len']])
                    query = tf.reshape(query, [-1, 2 * rnn_size])

                    attention_input = tf.concat([query, flat_word_states], axis=-1)

                    attention_hidden = tf.contrib.layers.fully_connected(
                        inputs=attention_input,
                        num_outputs=opts['attention_hidden_size'],
                        activation_fn=tf.nn.tanh,
                        weights_initializer=tf.contrib.layers.xavier_initializer())
                    attention_logit = tf.contrib.layers.fully_connected(
                        inputs=attention_hidden,
                        num_outputs=1,
                        activation_fn=None,
                        weights_initializer=tf.contrib.layers.xavier_initializer())

                # normalise over the word axis and shape as a row vector per example
                word_scores = tf.reshape(attention_logit, [-1, opts['max_sentence_len']])
                word_scores = tf.nn.softmax(word_scores, axis=-1)
                word_scores = tf.reshape(word_scores, [-1, 1, opts['max_sentence_len']])

                # attention-weighted sum of the word states
                attended_words = tf.matmul(word_scores, sentence_states)
                attended_words = tf.reshape(attended_words, [-1, word_state_dim])

                # advance the interactor LSTM
                fused_input = tf.concat([video_state[1], attended_words], axis=-1)
                with tf.variable_scope('interactor_rnn'):
                    _, interactor_state = interactor_cell(
                        inputs=fused_input, state=interactor_state)

                # one logit per anchor for this step
                with tf.variable_scope('predict_proposal'):
                    step_logits = tf.contrib.layers.fully_connected(
                        inputs=interactor_state[1],
                        num_outputs=num_anchors,
                        activation_fn=None)
                    step_logits = tf.expand_dims(step_logits, axis=1)
                    logit_outputs = tf.concat([logit_outputs, step_logits], axis=1)

        logit_outputs = tf.reshape(logit_outputs, [-1, num_anchors])

        # ---- loss ----
        # column 0: positive-sample weights, column 1: negative-sample weights
        positive_weight = tf.reshape(proposal_weight[:, 0], [-1, num_anchors])
        negative_weight = tf.reshape(proposal_weight[:, 1], [-1, num_anchors])

        # replicate to one row per logit row; the negative weights are built as in
        # the original graph but never consumed (treated as always 1)
        positive_weight = tf.tile(positive_weight, [tf.shape(logit_outputs)[0], 1])
        negative_weight = tf.tile(negative_weight, [tf.shape(logit_outputs)[0], 1])

        proposal = tf.reshape(proposal, [-1, num_anchors])
        anchor_losses = tf.nn.weighted_cross_entropy_with_logits(
            targets=tf.to_float(proposal),
            logits=logit_outputs,
            pos_weight=positive_weight)

        if opts['anchor_mask']:
            anchor_losses = tf.reshape(anchor_mask, [-1, num_anchors]) * anchor_losses

        step_losses = tf.reduce_sum(anchor_losses, axis=-1)
        step_losses = tf.reshape(step_losses, [-1])

        # average over the steps selected by video_feat_mask
        video_feat_mask = tf.reshape(video_feat_mask, [-1])
        masked_total = tf.reduce_sum((video_feat_mask * step_losses))
        mask_total = tf.to_float(tf.reduce_sum(video_feat_mask))
        proposal_loss = masked_total / mask_total

        # scalar summary for TensorBoard
        tf.summary.scalar('proposal_loss', proposal_loss)

        reg_loss = tf.add_n([tf.nn.l2_loss(var) for var in tf.trainable_variables()])

        outputs = {}
        outputs['loss'] = proposal_loss
        outputs['reg_loss'] = reg_loss

        return inputs, outputs
示例#43
0
    def build_proposal_inference(self, reuse=False):
        """
        Build the inference graph of the two-directional proposal module.

        A forward and a backward video feature sequence are each encoded by
        their own LSTM; a linear layer on top of every encoder emits one
        logit per anchor and time step, which is passed through a sigmoid.

        Args:
            reuse: forwarded to every ``tf.variable_scope`` opened here.

        Returns:
            (inputs, outputs): ``inputs`` holds the placeholders
            ``'video_feat_fw'`` and ``'video_feat_bw'``; ``outputs`` holds
            ``'proposal_score_fw'``, ``'proposal_score_bw'`` and the flattened
            encoder states ``'rnn_outputs_fw'`` / ``'rnn_outputs_bw'``.
        """
        opts = self.options

        # the zero states below are built for a batch of one, so feed one video only
        batch_size = 1

        def _feature_input(name):
            # placeholder: dim1 batch, dim2 sequence length, dim3 feature dimension
            return tf.placeholder(
                tf.float32, [None, None, opts['video_feat_dim']], name=name)

        def _new_lstm():
            return tf.contrib.rnn.LSTMCell(
                num_units=opts['rnn_size'],
                state_is_tuple=True,
                initializer=tf.orthogonal_initializer())

        def _encode_and_score(features, cell, encoder_scope, flat_name,
                              predictor_scope):
            # run one encoder over `features`, flatten its states, score the anchors
            with tf.variable_scope(encoder_scope, reuse=reuse):
                num_steps = tf.shape(features)[1]
                lengths = tf.expand_dims(num_steps, axis=0)
                zero_state = cell.zero_state(
                    batch_size=batch_size, dtype=tf.float32)

                hidden_seq, _ = tf.nn.dynamic_rnn(
                    cell=cell,
                    inputs=features,
                    sequence_length=lengths,
                    initial_state=zero_state,
                    dtype=tf.float32)

            flat_states = tf.reshape(
                hidden_seq, [-1, opts['rnn_size']], name=flat_name)

            # fully connected layer: one score per anchor at every time step
            with tf.variable_scope(predictor_scope, reuse=reuse):
                anchor_logits = tf.contrib.layers.fully_connected(
                    inputs=flat_states,
                    num_outputs=opts['num_anchors'],
                    activation_fn=None)
            return flat_states, anchor_logits

        #******************** Define Proposal Module ******************#

        forward_feat = _feature_input('video_feat_fw')
        backward_feat = _feature_input('video_feat_bw')
        inputs = {
            'video_feat_fw': forward_feat,
            'video_feat_bw': backward_feat,
        }

        forward_cell = _new_lstm()
        backward_cell = _new_lstm()

        with tf.variable_scope('proposal_module', reuse=reuse):
            # forward pass first, then backward pass
            forward_states, forward_logits = _encode_and_score(
                forward_feat, forward_cell,
                'video_encoder_fw', 'rnn_outputs_fw_reshape',
                'predict_proposal_fw')
            backward_states, backward_logits = _encode_and_score(
                backward_feat, backward_cell,
                'video_encoder_bw', 'rnn_outputs_bw_reshape',
                'predict_proposal_bw')

        # squash logits into per-anchor scores
        forward_score = tf.sigmoid(forward_logits, name='proposal_score_fw')
        backward_score = tf.sigmoid(backward_logits, name='proposal_score_bw')

        outputs = {
            'proposal_score_fw': forward_score,
            'proposal_score_bw': backward_score,
            'rnn_outputs_fw': forward_states,
            'rnn_outputs_bw': backward_states,
        }

        return inputs, outputs
示例#44
0
文件: model.py 项目: haozijie/RAM
    def build_model(self):
        """Build the whole TF1 graph and store its handles on ``self``.

        Creates, in order: input placeholders and embedded inputs, the
        attention / GRU / softmax parameters, a BiLSTM over the sentence,
        a location-weighted memory, ``self.n_hop`` rounds of attention with a
        hand-written GRU-style update of the episode vector ``e``, the loss
        and Adam train op, prediction/accuracy tensors, and summary ops and
        writers. Nothing is returned.
        """
        with tf.name_scope('inputs'):
            self.sentences = tf.placeholder(tf.int32, [None, self.max_sentence_len])
            self.aspects = tf.placeholder(tf.int32, [None, self.max_aspect_len])
            self.sentence_lens = tf.placeholder(tf.int32, None)
            self.sentence_locs = tf.placeholder(tf.float32, [None, self.max_sentence_len])
            self.labels = tf.placeholder(tf.int32, [None, self.n_class])
            self.dropout_keep_prob = tf.placeholder(tf.float32)
            
            # embed the sentence tokens; dropout is applied to the sentence only
            inputs = tf.nn.embedding_lookup(self.word2vec, self.sentences)
            inputs = tf.cast(inputs, tf.float32)
            inputs = tf.nn.dropout(inputs, keep_prob=self.dropout_keep_prob)
            # embed the aspect tokens and average them over axis 1
            aspect_inputs = tf.nn.embedding_lookup(self.word2vec, self.aspects)
            aspect_inputs = tf.cast(aspect_inputs, tf.float32)
            aspect_inputs = tf.reduce_mean(aspect_inputs, 1)

        # NOTE(review): every variable below registers an L2 regularizer, but the
        # cost built in the 'loss' scope of this method never adds the collected
        # regularization losses — confirm whether that is intended.
        with tf.name_scope('weights'):
            weights = {
                # one attention row vector per hop
                'attention': tf.get_variable(
                    name='W_al',
                    shape=[self.n_hop, 1, self.n_hidden * 3 + self.embedding_dim + 1],
                    initializer=tf.contrib.layers.xavier_initializer(),
                    regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg)
                ),
                # gru_r / gru_z / gru_x act on the attended memory vector (2 * n_hidden + 1)
                'gru_r': tf.get_variable(
                    name='W_r',
                    shape=[self.n_hidden, self.n_hidden * 2 + 1],
                    initializer=tf.orthogonal_initializer(),
                    regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg)
                ),
                'gru_z': tf.get_variable(
                    name='W_z',
                    shape=[self.n_hidden, self.n_hidden * 2 + 1],
                    initializer=tf.orthogonal_initializer(),
                    regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg)
                ),
                # gru_g acts on the reset-gated previous episode vector
                'gru_g': tf.get_variable(
                    name='W_g',
                    shape=[self.n_hidden, self.n_hidden],
                    initializer=tf.orthogonal_initializer(),
                    regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg)
                ),
                'gru_x': tf.get_variable(
                    name='W_x',
                    shape=[self.n_hidden, self.n_hidden * 2 + 1],
                    initializer=tf.orthogonal_initializer(),
                    regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg)
                ),
                # final classifier
                'softmax': tf.get_variable(
                    name='W_l',
                    shape=[self.n_hidden, self.n_class],
                    initializer=tf.contrib.layers.xavier_initializer(),
                    regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg)
                ),
            }
        
        with tf.name_scope('biases'):
            biases = {
                # one bias per hop and sentence position
                'attention': tf.get_variable(
                    name='B_al',
                    shape=[self.n_hop, 1, self.max_sentence_len],
                    initializer=tf.zeros_initializer(),
                    regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg)
                ),
                'softmax': tf.get_variable(
                    name='B_l',
                    shape=[self.n_class],
                    initializer=tf.zeros_initializer(),
                    regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg)
                ),
            }
            
        # recurrent matrices applied to the previous episode vector in the r / z gates
        with tf.name_scope('updates'):
            updates = {
                'gru_r': tf.get_variable(
                    name='U_r',
                    shape=[self.n_hidden, self.n_hidden],
                    initializer=tf.orthogonal_initializer(),
                    regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg)
                ),
                'gru_z': tf.get_variable(
                    name='U_z',
                    shape=[self.n_hidden, self.n_hidden],
                    initializer=tf.orthogonal_initializer(),
                    regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg)
                ),
            }

        with tf.name_scope('dynamic_rnn'):
            lstm_cell_fw = tf.contrib.rnn.LSTMCell(
                self.n_hidden,
                initializer=tf.orthogonal_initializer(),
            )
            lstm_cell_bw = tf.contrib.rnn.LSTMCell(
                self.n_hidden,
                initializer=tf.orthogonal_initializer(),
            )
            # static RNN wants a list of per-step tensors, hence the transpose
            # to time-major followed by unstack
            outputs, state, _ = tf.nn.static_bidirectional_rnn(
                lstm_cell_fw,
                lstm_cell_bw,
                tf.unstack(tf.transpose(inputs, perm=[1, 0, 2])),
                sequence_length=self.sentence_lens,
                dtype=tf.float32,
                scope='BiLSTM'
            )
            # join the per-step outputs along axis 1, then split that axis back
            # into (max_sentence_len, 2 * n_hidden)
            outputs = tf.reshape(tf.concat(outputs, 1), [-1, self.max_sentence_len, self.n_hidden * 2])
            batch_size = tf.shape(outputs)[0]

            # TensorArrays so the while_loop below can read one example at a time
            outputs_iter = tf.TensorArray(tf.float32, 1, dynamic_size=True, infer_shape=False)
            outputs_iter = outputs_iter.unstack(outputs)
            sentence_locs_iter = tf.TensorArray(tf.float32, 1, dynamic_size=True, infer_shape=False)
            sentence_locs_iter = sentence_locs_iter.unstack(self.sentence_locs)
            sentence_lens_iter = tf.TensorArray(tf.int32, 1, dynamic_size=True, infer_shape=False)
            sentence_lens_iter = sentence_lens_iter.unstack(self.sentence_lens)
            memory = tf.TensorArray(size=batch_size, dtype=tf.float32)
            def body(i, memory):
                # memory for example i: BiLSTM outputs scaled by (1 - location),
                # with the location itself appended as one extra column
                a = outputs_iter.read(i)
                b = sentence_locs_iter.read(i)
                # c (the sentence length) is read but not used in this body
                c = sentence_lens_iter.read(i)
                weight = 1 - b
                memory = memory.write(i, tf.concat([tf.multiply(a, tf.tile(tf.expand_dims(weight, -1), [1, self.n_hidden * 2])), tf.reshape(b, [-1, 1])], 1))
                return (i + 1, memory)
            def condition(i, memory):
                return i < batch_size
            _, memory_final = tf.while_loop(cond=condition, body=body, loop_vars=(0, memory))
            self.memories = tf.reshape(memory_final.stack(), [-1, self.max_sentence_len, self.n_hidden * 2 + 1])

            # episode vector, refined once per hop; starts at zero
            e = tf.zeros([batch_size, self.n_hidden])
            scores_list = []
            # repeat the aspect vector for every sentence position
            aspect_inputs = tf.tile(tf.expand_dims(aspect_inputs, 1), [1, self.max_sentence_len, 1])
            for h in range(self.n_hop):
                memories_iter = tf.TensorArray(tf.float32, 1, dynamic_size=True, infer_shape=False)
                memories_iter = memories_iter.unstack(self.memories)
                e_iter = tf.TensorArray(tf.float32, 1, dynamic_size=True, infer_shape=False)
                e_iter = e_iter.unstack(e)
                aspect_inputs_iter = tf.TensorArray(tf.float32, 1, dynamic_size=True, infer_shape=False)
                aspect_inputs_iter = aspect_inputs_iter.unstack(aspect_inputs)
                sentence_lens_iter = tf.TensorArray(tf.int32, 1, dynamic_size=True, infer_shape=False)
                sentence_lens_iter = sentence_lens_iter.unstack(self.sentence_lens)
                newe = tf.TensorArray(size=batch_size, dtype=tf.float32)
                score = tf.TensorArray(size=batch_size, dtype=tf.float32)
                # body reads the loop variable h; tf.while_loop is called below
                # inside this same iteration
                def body(i, newe, score):
                    a = memories_iter.read(i)
                    olde = e_iter.read(i)
                    # previous episode vector repeated for every sentence position
                    b = tf.tile(tf.expand_dims(olde, 0), [self.max_sentence_len, 1])
                    c = aspect_inputs_iter.read(i)
                    l = math_ops.to_int32(sentence_lens_iter.read(i))
                    # attention logits for hop h over [memory, episode, aspect]
                    g = tf.matmul(weights['attention'][h], tf.transpose(tf.concat([a, b, c], 1), perm=[1, 0])) + biases['attention'][h]
                    # softmax over the first l positions only; the rest get weight zero
                    score_temp = tf.concat([tf.nn.softmax(tf.slice(g, [0, 0], [1, l])), tf.zeros([1, self.max_sentence_len - l])], 1)
                    score = score.write(i, score_temp)
                    # attention-weighted memory, as a column vector
                    i_AL = tf.reshape(tf.matmul(score_temp, a), [-1, 1])
                    olde = tf.reshape(olde, [-1, 1])
                    # GRU-style update: reset gate r, update gate z, candidate e0
                    r = tf.nn.sigmoid(tf.matmul(weights['gru_r'], i_AL) + tf.matmul(updates['gru_r'], olde))
                    z = tf.nn.sigmoid(tf.matmul(weights['gru_z'], i_AL) + tf.matmul(updates['gru_z'], olde))
                    e0 = tf.nn.tanh(tf.matmul(weights['gru_x'], i_AL) + tf.matmul(weights['gru_g'], tf.multiply(r, olde)))
                    newe_temp = tf.multiply(1 - z, olde) + tf.multiply(z, e0)
                    newe = newe.write(i, newe_temp)
                    return (i + 1, newe, score)
                def condition(i, newe, score):
                    return i < batch_size
                _, newe_final, score_final = tf.while_loop(cond=condition, body=body, loop_vars=(0, newe, score))
                # stacked results become the episode vectors fed to the next hop
                e = tf.reshape(newe_final.stack(), [-1, self.n_hidden])
                batch_score = tf.reshape(score_final.stack(), [-1, self.max_sentence_len])
                scores_list.append(batch_score)
            # reorder the stacked scores from (hop, example, position) to (example, hop, position)
            self.scores = tf.transpose(tf.reshape(tf.stack(scores_list), [self.n_hop, -1, self.max_sentence_len]), [1, 0, 2])
            # class logits from the final episode vector
            self.predict = tf.matmul(e, weights['softmax']) + biases['softmax']

        with tf.name_scope('loss'):
            self.cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits = self.predict, labels = self.labels))
            self.global_step = tf.Variable(0, name="tr_global_step", trainable=False)
            self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.cost, global_step=self.global_step)

        with tf.name_scope('predict'):
            self.predict_label = tf.argmax(self.predict, 1)
            self.correct_pred = tf.equal(self.predict_label, tf.argmax(self.labels, 1))
            # number of correct predictions in the batch (a count, not a ratio)
            self.accuracy = tf.reduce_sum(tf.cast(self.correct_pred, tf.int32))
            
        summary_loss = tf.summary.scalar('loss', self.cost)
        summary_acc = tf.summary.scalar('acc', self.accuracy)
        # train and test merge the same two summaries; they differ only in the writer used
        self.train_summary_op = tf.summary.merge([summary_loss, summary_acc])
        self.test_summary_op = tf.summary.merge([summary_loss, summary_acc])
        # log directory name encodes timestamp, learning rate, batch size and l2 strength
        _dir = 'logs/' + str(self.timestamp) + '_r' + str(self.learning_rate) + '_b' + str(self.batch_size) + '_l' + str(self.l2_reg)
        self.train_summary_writer = tf.summary.FileWriter(_dir + '/train', self.sess.graph)
        self.test_summary_writer = tf.summary.FileWriter(_dir + '/test', self.sess.graph)
示例#45
0
    def __init__(self,
                 embedding_matrix,
                 num_classes,
                 max_sents,
                 max_words,
                 rnn_type="gru",
                 rnn_units=50,
                 attention_size=200,
                 dropout_keep=1.0):
        '''
        hierarchical convolutional attention network for text classification
        
        parameters:
          - embedding_matrix: numpy array
            numpy array of word embeddings
            each row should represent a word embedding
            NOTE: the word index 0 is dropped, so the first row is ignored
          - num_classes: int
            number of output classes
          - max_sents: int
            maximum number of sentences per document
          - max_words: int
            maximum number of words per sentence
          - rnn_type: string (default: "gru")
            rnn cells to use, can be "gru" or "lstm"
          - rnn_units: int (default: 50)
            number of rnn units to use for embedding layers
          - attention_size: int (default: 200)
            number of dimensions to use for attention hidden layer
          - dropout_keep: float (default: 1.0)
            dropout keep rate RNNs
           
        methods:
          - train(,data,labels,validation_data,epochs=30,savebest=False,filepath=None)
            train network on given data
          - predict(data)
            return the one-hot-encoded predicted labels for given data
          - score(data,labels)
            return the accuracy of predicted labels on given data
          - save(filepath)
            save the model weights to a file
          - load(filepath)
            load model weights from a file
        '''

        self.rnn_units = rnn_units
        if rnn_type == "gru":
            self.rnn_cell = GRUCell
        elif rnn_type == "lstm":
            self.rnn_cell = LSTMCell
        else:
            raise Exception("rnn_type parameter must be set to gru or lstm")
        self.dropout_keep = dropout_keep
        self.dropout = tf.placeholder(tf.float32)
        self.ms = max_sents
        self.mw = max_words

        #doc input and mask
        self.doc_input = tf.placeholder(tf.int32, shape=[max_sents, max_words])
        words_per_line = tf.reduce_sum(tf.sign(self.doc_input), 1)
        num_lines = tf.reduce_sum(tf.sign(words_per_line))
        max_words_ = tf.reduce_max(words_per_line)
        doc_input_reduced = self.doc_input[:num_lines, :max_words_]
        num_words = words_per_line[:num_lines]

        #word rnn layer
        word_embeds = tf.gather(
            tf.get_variable('embeddings',
                            initializer=embedding_matrix.astype(np.float32),
                            dtype=tf.float32), doc_input_reduced)
        with tf.variable_scope('words'):
            [word_outputs_fw,word_outputs_bw],_ = \
                    tf.nn.bidirectional_dynamic_rnn(
                    tf.contrib.rnn.DropoutWrapper(self.rnn_cell(self.rnn_units),state_keep_prob=self.dropout),
                    tf.contrib.rnn.DropoutWrapper(self.rnn_cell(self.rnn_units),state_keep_prob=self.dropout),
                    word_embeds,sequence_length=num_words,dtype=tf.float32)
        word_outputs = tf.concat((word_outputs_fw, word_outputs_bw), 2)

        #word attention
        seq_mask = tf.reshape(tf.sequence_mask(num_words, max_words_), [-1])
        word_u = tf.layers.dense(
            tf.reshape(word_outputs, [-1, self.rnn_units * 2]),
            attention_size,
            tf.nn.tanh,
            kernel_initializer=tf.contrib.layers.xavier_initializer())
        word_exps = tf.layers.dense(
            word_u,
            1,
            tf.exp,
            False,
            kernel_initializer=tf.contrib.layers.xavier_initializer())
        word_exps = tf.where(seq_mask, word_exps,
                             tf.ones_like(word_exps) * 0.000000001)
        word_alpha = tf.reshape(word_exps, [-1, max_words_, 1])
        word_alpha /= tf.reshape(tf.reduce_sum(word_alpha, 1), [-1, 1, 1])
        sent_embeds = tf.reduce_sum(word_outputs * word_alpha, 1)
        sent_embeds = tf.expand_dims(sent_embeds, 0)

        #sentence rnn layer
        with tf.variable_scope('sentence'):
            [sent_outputs_fw,sent_outputs_bw],_ = \
                    tf.nn.bidirectional_dynamic_rnn(
                    tf.contrib.rnn.DropoutWrapper(self.rnn_cell(self.rnn_units),state_keep_prob=self.dropout),
                    tf.contrib.rnn.DropoutWrapper(self.rnn_cell(self.rnn_units),state_keep_prob=self.dropout),
                    sent_embeds,sequence_length=tf.expand_dims(num_lines,0),dtype=tf.float32)
        sent_outputs = tf.concat(
            (tf.squeeze(sent_outputs_fw, [0]), tf.squeeze(
                sent_outputs_bw, [0])), 1)

        #sentence attention
        sent_u = tf.layers.dense(
            sent_outputs,
            attention_size,
            tf.nn.tanh,
            kernel_initializer=tf.contrib.layers.xavier_initializer())
        sent_exp = tf.layers.dense(
            sent_u,
            1,
            tf.exp,
            False,
            kernel_initializer=tf.contrib.layers.xavier_initializer())
        sent_atten = sent_exp / tf.reduce_sum(sent_exp)
        doc_embed = tf.transpose(
            tf.matmul(tf.transpose(sent_outputs), sent_atten))

        #classification functions
        logits = tf.layers.dense(
            doc_embed,
            num_classes,
            kernel_initializer=tf.orthogonal_initializer())
        self.prediction = tf.nn.softmax(logits)

        #loss, accuracy, and training functions
        self.labels = tf.placeholder(tf.float32, shape=[num_classes])
        labels_ = tf.expand_dims(self.labels, 0)
        self.loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits=logits,
                                                    labels=labels_))
        self.optimizer = tf.train.AdamOptimizer(0.00002, 0.9,
                                                0.99).minimize(self.loss)

        #init op
        self.saver = tf.train.Saver()
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())
示例#46
0
 def testInvalidShape(self):
   # The orthogonal initializer must refuse a rank-1 shape with ValueError.
   orthogonal_init = tf.orthogonal_initializer()
   rank_one_shape = [5]
   with self.test_session(graph=tf.Graph(), use_gpu=True):
     with self.assertRaises(ValueError):
       orthogonal_init(shape=rank_one_shape)
示例#47
0
    def __init__(self, data, training=False):
        """Build the graph that scores candidate answers for a question.

        Question, subtitle and answer token ids are looked up in a
        non-trainable embedding matrix, passed through the ``mask_dense`` and
        ``conv_encode`` helpers (defined elsewhere in the file), the subtitle
        encodings are pooled with a temporal attention, and the answer
        encodings are scored against the combined question/subtitle vector
        with a matrix product.

        Args:
            data: input object. This constructor reads ``data.ques``,
                ``data.subt`` and ``data.ans`` (token ids) and ``data.ql``,
                ``data.sl`` and ``data.al`` (sequence lengths).
            training: accepted for interface compatibility. It is not read
                here; the only code that referenced it (dropout on the
                embedded inputs) is disabled.

        The shape annotations in the comments below are the original
        author's notes; nothing in this method enforces them.
        """
        self.data = data
        self.initializer = tf.orthogonal_initializer()
        q_mask = make_mask(self.data.ql, 25)  # (1, L_q, E)
        s_mask = make_mask(self.data.sl, 29)  # (N, L_s, E)
        a_mask = make_mask(self.data.al, 34)  # (5, L_a, E)

        # Only the dynamic subtitle count N (first dim of s_mask) is used
        # below, to tile the question encoding and to pick the attention
        # threshold.
        subt_shape = tf.shape(s_mask)

        with tf.variable_scope('Embedding'):
            # Embedding matrix is loaded from disk and kept frozen.
            self.embedding = tf.get_variable('embedding_matrix',
                                             initializer=np.load(
                                                 _mp.embedding_file),
                                             trainable=False)

            self.ques = tf.nn.embedding_lookup(self.embedding,
                                               self.data.ques)  # (1, L_q, E)
            self.ans = tf.nn.embedding_lookup(self.embedding,
                                              self.data.ans)  # (5, L_a, E)
            self.subt = tf.nn.embedding_lookup(self.embedding,
                                               self.data.subt)  # (N, L_s, E)
            # Dropout on the embedded inputs (the only consumer of
            # `training`) is currently disabled.

        with tf.variable_scope('Embedding_Linear'):
            # The first call passes reuse=False; the later calls rely on
            # mask_dense's default `reuse` value (defined elsewhere).
            # (1, L_q, E_t)
            self.ques_embedding = unit_norm(
                mask_dense(self.ques, q_mask, reuse=False))
            # (5, L_a, E_t)
            self.ans_embedding = unit_norm(mask_dense(self.ans, a_mask))
            # (N, L_s, E_t)
            self.subt_embedding = unit_norm(mask_dense(self.subt, s_mask))

        with tf.variable_scope('Language_Encode'):
            mask = tf.expand_dims(tf.sequence_mask(self.data.ql, 25), axis=-1)
            # (1, E_t)
            self.ques_enc = unit_norm(conv_encode(self.ques_embedding, mask,
                                                  'ques'),
                                      dim=1)
            mask = tf.expand_dims(tf.sequence_mask(self.data.al, 34), axis=-1)
            # (5, E_t)
            self.ans_enc = unit_norm(conv_encode(self.ans_embedding, mask,
                                                 'ans'),
                                     dim=1)
            mask = tf.expand_dims(tf.sequence_mask(self.data.sl, 29), axis=-1)
            # (N, E_t)
            self.subt_enc = unit_norm(conv_encode(self.subt_embedding, mask,
                                                  'subt'),
                                      dim=1)

        with tf.variable_scope('Temporal_Attention'):
            # Pair every subtitle encoding with the (tiled) question encoding.
            # (N, 2 * E_t)
            self.temp_attn = tf.concat(
                [self.subt_enc,
                 tf.tile(self.ques_enc, [subt_shape[0], 1])],
                axis=-1)
            # (1, N, 2 * E_t)
            self.temp_attn = unit_norm(tf.expand_dims(self.temp_attn, axis=0))
            # One attention logit per subtitle from a width-5 convolution.
            # (1, N, 1)
            self.temp_attn = tf.layers.conv1d(self.temp_attn,
                                              1,
                                              5,
                                              padding='same',
                                              activation=tf.nn.relu)
            # Softmax over the subtitle axis, then drop the leading axis.
            # (N, 1)
            self.temp_attn = tf.squeeze(tf.nn.softmax(self.temp_attn, axis=1),
                                        axis=0)

            # Threshold: the element selected by nth_element at index N / 2
            # (third argument True).
            # NOTE(review): presumably "reverse", i.e. the (N/2)-th largest
            # weight -- confirm against the `nn` module imported by the file.
            nth = nn.nth_element(tf.transpose(self.temp_attn),
                                 tf.cast(subt_shape[0] / 2, tf.int32), True)
            # (N, 1) -- keep only the weights at or above the threshold.
            attn_mask = tf.greater_equal(self.temp_attn, nth)
            masked_attn = self.temp_attn * tf.cast(attn_mask, tf.float32)
            # Weight the subtitle encodings by the surviving attention.
            # The previous code assigned the masked attention itself to
            # self.subt_enc, which threw the encodings away and reduced
            # `summarize` below to a single scalar.
            self.subt_enc = self.subt_enc * masked_attn
        # Attention-weighted sum over subtitles; (1, E_t) given the
        # annotations above.
        self.summarize = unit_norm(tf.reduce_sum(self.subt_enc,
                                                 axis=0,
                                                 keepdims=True),
                                   dim=1)

        # The question and the subtitle summary are combined by a plain sum.
        # (A learned sigmoid gate `gamma` between the two terms was sketched
        # in commented-out code and is not used.)
        self.ans_vec = unit_norm(self.summarize + self.ques_enc,
                                 dim=1)  # same shape as ques_enc

        # Score for each candidate answer.
        self.output = tf.matmul(self.ans_vec, self.ans_enc,
                                transpose_b=True)  # (1, 5)
示例#48
0
    def _build_net(self):
        """Build the actor (policy) network and all of its training ops.

        Everything is created inside the ``"Actor" + self.suffix`` variable
        scope. The method reads ``self.suffix``, ``self.n_features``,
        ``self.n_actions``, ``self.lr`` and ``self.clip_eps``, calls
        ``self.get_fisher_product_op`` and uses the helpers
        ``get_flat_params``, ``get_flat_gradients`` and
        ``assign_network_params_op``, none of which are defined in this
        method.

        Nothing is returned; placeholders, losses, train ops and the
        flattened-gradient ops are stored as attributes on ``self``.
        """

        with tf.variable_scope("Actor" + self.suffix):

            # Placeholders fed at run time.
            with tf.name_scope('inputs' + self.suffix):
                self.tf_obs = tf.placeholder(tf.float32,
                                             [None, self.n_features],
                                             name='observation' + self.suffix)
                self.tf_acts = tf.placeholder(tf.int32, [
                    None,
                ],
                                              name='actions_num' + self.suffix)
                self.tf_vt = tf.placeholder(tf.float32, [
                    None,
                ],
                                            name='actions_value' + self.suffix)
                self.tf_safe = tf.placeholder(tf.float32, [
                    None,
                ],
                                              name='safety_value' +
                                              self.suffix)
                self.entropy_weight = tf.placeholder(
                    tf.float32,
                    shape=(),
                    name='entropy_weight_clustering' + self.suffix)

                ##### PPO change #####
                # NOTE(review): self.ppo_ratio is reassigned to a computed
                # tensor in the 'ppoloss' scope below, so this placeholder is
                # no longer reachable through the attribute afterwards.
                self.ppo_ratio = tf.placeholder(tf.float32, [
                    None,
                ],
                                                name='ppo_ratio' + self.suffix)
                ##### PPO change #####

            # Hidden layer: 128 tanh units.
            layer = tf.layers.dense(
                inputs=self.tf_obs,
                units=128,
                activation=tf.nn.tanh,
                # kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.3),
                kernel_initializer=tf.orthogonal_initializer(
                    gain=np.sqrt(2.)),  # ppo default initialization
                bias_initializer=tf.constant_initializer(0.1),
                name='fc1' + self.suffix)

            # Output layer: one unnormalised score (logit) per action.
            all_act = tf.layers.dense(
                inputs=layer,
                units=self.n_actions,
                activation=None,
                # kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.3),
                kernel_initializer=tf.orthogonal_initializer(
                    gain=np.sqrt(2.)),  # ppo default initialization
                bias_initializer=tf.constant_initializer(0.1),
                name='fc2' + self.suffix)

            # Snapshot of the trainable variables collected under the
            # 'Actor' + suffix scope filter, plus their static shapes.
            self.trainable_variables = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope='Actor' + self.suffix)
            self.trainable_variables_shapes = [
                var.get_shape().as_list() for var in self.trainable_variables
            ]

            # sampling: action probabilities, clipped away from exact zero
            self.all_act_prob = tf.nn.softmax(all_act,
                                              name='act_prob' + self.suffix)
            self.all_act_prob = tf.clip_by_value(self.all_act_prob, 1e-20, 1.0)

            with tf.name_scope('loss' + self.suffix):
                # -log pi(a|s) of the action actually taken (one-hot select).
                neg_log_prob = tf.reduce_sum(
                    -tf.log(tf.clip_by_value(self.all_act_prob, 1e-30, 1.0)) *
                    tf.one_hot(indices=self.tf_acts, depth=self.n_actions),
                    axis=1)
                loss = tf.reduce_mean(neg_log_prob * self.tf_vt)
                # sum(p * log p) is the negative entropy, so this term adds
                # entropy_weight * (-entropy) to the loss.
                loss += self.entropy_weight * tf.reduce_mean(
                    tf.reduce_sum(
                        tf.log(tf.clip_by_value(self.all_act_prob, 1e-30,
                                                1.0)) * self.all_act_prob,
                        axis=1))
                # The same weighted negative-entropy term, kept separately
                # so it can be reported in self.metrics.
                self.entro = self.entropy_weight * tf.reduce_mean(
                    tf.reduce_sum(
                        tf.log(tf.clip_by_value(self.all_act_prob, 1e-30,
                                                1.0)) * self.all_act_prob,
                        axis=1))
                self.loss = loss
            with tf.name_scope('train' + self.suffix):
                self.train_op = tf.train.AdamOptimizer(self.lr).minimize(loss)

            # safety loss
            """
            * -1?
            """
            # log pi(a|s) of the taken action (not negated, unlike
            # neg_log_prob above).
            self.chosen_action_log_probs = tf.reduce_sum(
                tf.log(tf.clip_by_value(self.all_act_prob, 1e-30, 1.0)) *
                tf.one_hot(indices=self.tf_acts, depth=self.n_actions),
                axis=1)
            ##### PPO CHANGE #####
            # Fed with the taken actions' log-probabilities under the old
            # policy; used for the PPO ratio below.
            self.ppo_old_chosen_action_log_probs = tf.placeholder(
                tf.float32, [None])
            ##### PPO CHANGE #####
            # The placeholder is wrapped in stop_gradient, so the attribute
            # holds the stop_gradient output rather than the placeholder.
            self.old_chosen_action_log_probs = tf.stop_gradient(
                tf.placeholder(tf.float32, [None]))
            # self.each_safety_loss = tf.exp(self.chosen_action_log_probs - self.old_chosen_action_log_probs) * self.tf_safe
            # Difference of new and old action probabilities, weighted by
            # the per-sample safety value.
            self.each_safety_loss = (
                tf.exp(self.chosen_action_log_probs) -
                tf.exp(self.old_chosen_action_log_probs)) * self.tf_safe
            self.average_safety_loss = tf.reduce_mean(
                self.each_safety_loss)  #/ self.n_episodes tf.reduce_sum
            # self.average_safety_loss +=self.entro

            # KL D: old action distribution, again behind stop_gradient
            self.old_all_act_prob = tf.stop_gradient(
                tf.placeholder(tf.float32, [None, self.n_actions]))

            def kl(x, y):
                # KL(Categorical(x) || Categorical(y)). Entries with
                # magnitude below EPS are replaced by EPS, and EPS is added
                # once more when the distributions are built.
                EPS = 1e-10
                x = tf.where(tf.abs(x) < EPS, EPS * tf.ones_like(x), x)
                y = tf.where(tf.abs(y) < EPS, EPS * tf.ones_like(y), y)
                X = tf.distributions.Categorical(probs=x + EPS)
                Y = tf.distributions.Categorical(probs=y + EPS)
                return tf.distributions.kl_divergence(X,
                                                      Y,
                                                      allow_nan_stats=False)

            self.each_kl_divergence = kl(
                self.all_act_prob, self.old_all_act_prob
            )  # tf.reduce_sum(kl(self.all_act_prob, self.old_all_act_prob), axis=1)
            self.average_kl_divergence = tf.reduce_mean(
                self.each_kl_divergence)
            # self.kl_gradients = tf.gradients(self.average_kl_divergence, self.trainable_variables)  # useless

            # NOTE(review): `desired_kl` is not a parameter or local of this
            # method; it must come from an enclosing or module scope, or this
            # line raises NameError -- confirm where it is defined.
            self.desired_kl = desired_kl
            # self.metrics = [self.loss, self.average_kl_divergence, self.average_safety_loss, self.entro] # Luping
            # NOTE(review): self.loss is listed twice; the commented-out
            # variant above had average_kl_divergence in the second slot.
            self.metrics = [
                self.loss, self.loss, self.average_safety_loss, self.entro
            ]  # Luping

            # Flat: op built by the external helper get_flat_params
            self.flat_params_op = get_flat_params(self.trainable_variables)
            """not use tensorflow default function, here we calculate the gradient by self:
            (1) loss: g
            (2) kl: directional_gradients (math, fisher)
            (3) safe: b 
            """
            ##### PPO change #####
            #### PPO Suyi's Change ####
            with tf.name_scope('ppoloss' + self.suffix):
                # Probability ratio new/old, computed from log-probabilities.
                # This replaces the 'ppo_ratio' placeholder assigned above.
                self.ppo_ratio = tf.exp(self.chosen_action_log_probs -
                                        self.ppo_old_chosen_action_log_probs)
                # self.ppo_ratio = tf.Print(self.ppo_ratio, [self.ppo_ratio], "self.ppo_ratio: ")

                # Clipped surrogate: elementwise minimum of the unclipped
                # and clipped ratio, each times tf_vt; negated so it can be
                # minimised.
                surr = self.ppo_ratio * self.tf_vt
                self.ppoloss = -tf.reduce_mean(
                    tf.minimum(
                        surr,
                        tf.clip_by_value(self.ppo_ratio, 1. - self.clip_eps,
                                         1. + self.clip_eps) * self.tf_vt))

                # Same weighted negative-entropy term as in self.loss.
                self.ppoloss += self.entropy_weight * tf.reduce_mean(
                    tf.reduce_sum(
                        tf.log(tf.clip_by_value(self.all_act_prob, 1e-30,
                                                1.0)) * self.all_act_prob,
                        axis=1))
                # self.ppoloss += 0.01 * tf.reduce_mean(tf.reduce_sum(tf.log(tf.clip_by_value(self.all_act_prob, 1e-30, 1.0)) * self.all_act_prob, axis=1))

            with tf.variable_scope('ppotrain'):
                # self.atrain_op = tf.train.AdamOptimizer(self.lr).minimize(self.ppoloss)
                # A second Adam optimizer, separate from the one behind
                # self.train_op.
                self.atrain_op = tf.train.AdamOptimizer(self.lr).minimize(
                    self.ppoloss)
            #### PPO Suyi's Change ####

            self.ppoloss_flat_gradients_op = get_flat_gradients(
                self.ppoloss, self.trainable_variables)
            ##### PPO change #####

            # Ops from get_flat_gradients for each objective with respect to
            # the trainable variables (helper defined outside this method).
            self.loss_flat_gradients_op = get_flat_gradients(
                self.loss, self.trainable_variables)
            self.kl_flat_gradients_op = get_flat_gradients(
                self.average_kl_divergence, self.trainable_variables)
            self.constraint_flat_gradients_op = get_flat_gradients(
                self.average_safety_loss, self.trainable_variables)

            # 1-D placeholder created before the Fisher-product op is built.
            # NOTE(review): presumably the vector that op multiplies --
            # confirm in get_fisher_product_op.
            self.vec = tf.placeholder(tf.float32, [None])
            self.fisher_product_op = self.get_fisher_product_op()

            # 1-D placeholder passed, with the variables and their shapes,
            # to the helper that builds the parameter-assignment op.
            self.new_params = tf.placeholder(tf.float32, [None])
            self.params_assign_op = assign_network_params_op(
                self.new_params, self.trainable_variables,
                self.trainable_variables_shapes)
示例#49
0
 def GRU(self, rnn_size=None, reuse=None):
     """Return a GRU cell whose kernel uses orthogonal initialisation.

     When `rnn_size` is None the width falls back to `args.hidden_size`;
     `reuse` is forwarded to the cell unchanged.
     """
     if rnn_size is None:
         rnn_size = args.hidden_size
     cell = tf.nn.rnn_cell.GRUCell(
         rnn_size,
         kernel_initializer=tf.orthogonal_initializer(),
         reuse=reuse)
     return cell
示例#50
0
    def __init__(self, embedding_matrix, num_classes, max_sents, max_words,
                 attention_heads=8, attention_size=512, dropout_keep=0.9,
                 activation=tf.nn.elu):
        '''
        Build the hierarchical convolutional attention network graph for
        text classification, then create a session and initialise all
        variables.

        parameters:
          - embedding_matrix: numpy array
            word embeddings, one row per word
            NOTE (original author): word index 0 is dropped, so the first
            row is ignored
          - num_classes: int
            number of output classes
          - max_sents: int
            maximum number of sentences per document
          - max_words: int
            maximum number of words per sentence
          - attention_heads: int (default: 8)
            number of heads the attention channels are split into
          - attention_size: int (default: 512)
            number of channels produced by every attention convolution
          - dropout_keep: float (default: 0.9)
            stored on the instance as self.dropout_keep; the graph itself
            takes its keep probability from the self.dropout placeholder
          - activation: tensorflow activation function (default: tf.nn.elu)
            activation of the convolutional query/key/value projections
            (the second value projection of each level uses tanh instead)

        methods (as listed by the original author; defined elsewhere in the
        class):
          - train(,data,labels,validation_data,epochs=30,savebest=False,filepath=None)
          - predict(data)
          - score(data,labels)
          - save(filepath)
          - load(filepath)
        '''

        # hyper-parameters and the dropout keep-probability placeholder
        self.attention_heads = attention_heads
        self.attention_size = attention_size
        self.embedding_size = embedding_matrix.shape[1]
        self.embeddings = embedding_matrix.astype(np.float32)
        self.ms = max_sents
        self.mw = max_words
        self.dropout_keep = dropout_keep
        self.dropout = tf.placeholder(tf.float32)

        # document input, trimmed to the non-empty lines and the longest line
        self.doc_input = tf.placeholder(tf.int32, shape=[max_sents, max_words])
        self.words_per_line = tf.reduce_sum(tf.sign(self.doc_input), 1)
        self.max_lines = tf.reduce_sum(tf.sign(self.words_per_line))
        self.max_words = tf.reduce_max(self.words_per_line)
        self.doc_input_reduced = self.doc_input[:self.max_lines,
                                                :self.max_words]
        self.num_words = self.words_per_line[:self.max_lines]

        # word embeddings plus learned word-position embeddings
        embedding_table = tf.get_variable('embeddings',
                                          initializer=self.embeddings,
                                          dtype=tf.float32)
        self.word_embeds = tf.gather(embedding_table, self.doc_input_reduced)
        positions = tf.expand_dims(tf.range(self.max_words), 0)
        word_pos_table = tf.get_variable(
            'word_pos',
            shape=(self.mw, self.embedding_size),
            dtype=tf.float32,
            initializer=tf.random_normal_initializer(0, 0.1))
        word_pos = tf.gather(word_pos_table, positions)
        self.word_embeds = tf.nn.dropout(self.word_embeds + word_pos,
                                         self.dropout)

        # for feature/parameter comparison
        print(self)
        print(f"attention heads: {attention_heads}")
        print(f"attention size: {attention_size}")
        print(f"self embedding size: {self.embedding_size}")
        print(f"self embeddings: {self.embeddings}")
        print(f"max sents (ms): {self.ms}")
        print(f"max words (mw): {self.mw}")
        print(f"dropout: {dropout_keep}")

        print(f"self doc_input: {self.doc_input}")
        print(f"self words_per_line: {self.words_per_line}")
        print(f"self max_lines {self.max_lines}")
        print(f"self max_words {self.max_words}")
        print(f"self doc_input_reduced: {self.doc_input_reduced}")
        print(f"self num_words: {self.num_words}")

        # masks to eliminate padding
        mask_base = tf.cast(
            tf.sequence_mask(self.num_words, self.max_words), tf.float32)
        mask = tf.tile(tf.expand_dims(mask_base, 2),
                       [1, 1, self.attention_size])
        mask2 = tf.tile(tf.expand_dims(mask_base, 2),
                        [self.attention_heads, 1, self.max_words])
        print(f"mask_base: {mask_base}")
        print(f"mask: {mask}")
        print(f"mask2: {mask2}")

        # ---- local building blocks -------------------------------------
        # The helpers only wrap expressions; ops and variables are created
        # in exactly the order in which the helpers are called below.
        heads = self.attention_heads

        def conv(inputs, act):
            # width-3 'same' convolution onto attention_size channels
            return tf.layers.conv1d(
                inputs, self.attention_size, 3, padding='same',
                activation=act,
                kernel_initializer=tf.orthogonal_initializer())

        def split_heads(tensor):
            # channels -> heads, stacked along the first axis
            return tf.concat(tf.split(tensor, heads, axis=2), axis=0)

        def merge_heads(tensor):
            # inverse of split_heads
            return tf.concat(tf.split(tensor, heads, axis=0), axis=2)

        def zero_padding(tensor):
            # zero every position that `mask` marks as padding
            return tf.where(tf.equal(mask, 0), tf.zeros_like(tensor), tensor)

        def scaled_scores(queries, keys):
            # dot-product scores divided by sqrt of the per-head key width
            raw = tf.matmul(queries, tf.transpose(keys, [0, 2, 1]))
            return raw / (keys.get_shape().as_list()[-1] ** 0.5)

        def suppress_zero_scores(scores):
            # scores that are exactly 0 are pushed to -1000 before softmax
            return tf.where(tf.equal(scores, 0),
                            tf.ones_like(scores) * -1000, scores)

        def attention_weights(scores):
            return tf.nn.dropout(tf.nn.softmax(scores), self.dropout)

        def self_attention(inputs, value_activation, masked):
            # One multi-head self-attention branch over `inputs`.
            q, k, v = [conv(inputs, act)
                       for act in (activation, activation, value_activation)]
            if masked:
                q, k, v = [zero_padding(t) for t in (q, k, v)]
            q, k, v = [split_heads(t) for t in (q, k, v)]

            scores = scaled_scores(q, k)
            if masked:
                scores = suppress_zero_scores(scores)
            weights = attention_weights(scores)
            if masked:
                weights = tf.where(tf.equal(mask2, 0),
                                   tf.zeros_like(weights), weights)

            attended = merge_heads(tf.matmul(weights, v))
            if masked:
                attended = zero_padding(attended)
            return attended

        def target_attention(query_name, values, masked):
            # Attention of a single learned query vector over `values`.
            query = tf.get_variable(query_name, (1, 1, self.attention_size),
                                    tf.float32, tf.orthogonal_initializer())
            keys = conv(values, activation)
            if masked:
                query = tf.tile(query, [self.max_lines, 1, 1])
                keys = zero_padding(keys)
            query, keys, vals = [split_heads(t)
                                 for t in (query, keys, values)]

            scores = scaled_scores(query, keys)
            if masked:
                scores = suppress_zero_scores(scores)
            return merge_heads(tf.matmul(attention_weights(scores), vals))

        # ---- word level ------------------------------------------------
        # two self-attention branches (the second with tanh values),
        # multiplied elementwise and layer-normalised
        word_branch_1 = self_attention(self.word_embeds, activation, True)
        word_branch_2 = self_attention(self.word_embeds, tf.nn.tanh, True)
        outputs = layer_norm(tf.multiply(word_branch_1, word_branch_2))

        # word target attention -> one embedding per sentence
        outputs = target_attention('word_Q', outputs, True)
        self.sent_embeds = tf.transpose(outputs, [1, 0, 2])

        # ---- sentence level --------------------------------------------
        # sentence positional embeddings
        positions = tf.expand_dims(tf.range(self.max_lines), 0)
        sent_pos_table = tf.get_variable(
            'sent_pos',
            shape=(self.ms, self.attention_size),
            dtype=tf.float32,
            initializer=tf.random_normal_initializer(0, 0.1))
        sent_pos = tf.gather(sent_pos_table, positions)
        self.sent_embeds = tf.nn.dropout(self.sent_embeds + sent_pos,
                                         self.dropout)

        # two unmasked self-attention branches, combined as at word level
        sent_branch_1 = self_attention(self.sent_embeds, activation, False)
        sent_branch_2 = self_attention(self.sent_embeds, tf.nn.tanh, False)
        outputs = layer_norm(tf.multiply(sent_branch_1, sent_branch_2))

        # sentence target attention -> document embedding
        outputs = target_attention('sent_Q', outputs, False)
        self.doc_embed = tf.nn.dropout(tf.squeeze(outputs, [0]), self.dropout)

        # ---- classification --------------------------------------------
        self.output = tf.layers.dense(
            self.doc_embed, num_classes,
            kernel_initializer=tf.orthogonal_initializer())
        self.prediction = tf.nn.softmax(self.output)

        # loss and training op (labels are a single one-hot vector)
        self.labels = tf.placeholder(tf.float32, shape=[num_classes])
        self.labels_rs = tf.expand_dims(self.labels, 0)
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
            logits=self.output, labels=self.labels_rs)
        self.loss = tf.reduce_mean(cross_entropy)
        adam = tf.train.AdamOptimizer(learning_rate=2e-5, beta1=0.9,
                                      beta2=0.99)
        self.optimizer = adam.minimize(self.loss)

        # init op, saver and session
        self.init_op = tf.global_variables_initializer()
        self.saver = tf.train.Saver()
        self.sess = tf.Session()
        self.sess.run(self.init_op)
def train_net(batch_size=100,
              t_steps=200,
              l_dim=[100, 50, 5, 50, 100],
              activation='tanh',
              gamma=0.001,
              alpha_t=0.1,
              noise_str=0.1,
              err_alg=0,
              learning_rate=0.003,
              learning_rate_inv=0.003,
              learning_rate_rinv=0.1,
              num_steps_rinv=2,
              top_loss='sigmoid_ce',
              mode='autoencoder',
              dataset='mnist',
              SGD=True,
              preprocess=False,
              tb_path='/tmp/targprop/'):
  """Train a feed-forward net with layer-local losses, targets and multipliers.

  Every layer l owns weights W[l] / biases b[l], a trainable target tx[l] and
  a Lagrange-multiplier variable p[l]. Each training step alternates updates
  of the targets and of the weights; the multipliers are updated every 5th
  step. Summaries are written to `tb_path`.

    Args:
      batch_size (int, > 0): the number of examples in each training batch.
        The target / multiplier variables are created with this fixed leading
        dimension.
      t_steps (int, > 0): the number of training steps (the loop runs
        t_steps + 1 iterations)
      l_dim (sequence of ints): the hidden layer dimensions; the input and
        output dimensions are taken from the data. Not mutated.
      activation ('tanh', 'linear', 'sigmoid', 'relu'): activation function of
        all layers except the last one, which is affine
      gamma (float, > 0): weight of the target loss term; also used as the
        gradient-step size for the multiplier update
      learning_rate (float, > 0): learning rate of the RMSProp optimizer used
        for the weight and target updates
      mode ('autoencoder', 'classification'):
        'autoencoder': outputs are set to inputs
        'classification': outputs are left as provided by the dataset
      dataset ('mnist', 'cifar', or a data object): which dataset to use. Any
        other value is used directly as both the train and the test data; it
        must provide `inputs`, `outputs` and `next_batch(batch_size)`.
      SGD (bool): stochastic gradient descent. Should be True. False trains on
        the first `batch_size` examples only, which can be useful for
        debugging and seeing if algorithms converge on a single batch.
      preprocess (bool): preprocess the data with PCA (1000 components) +
        whitening.
      tb_path (str): directory the summary FileWriter writes to
      alpha_t, noise_str, err_alg, learning_rate_inv, learning_rate_rinv,
      num_steps_rinv, top_loss: accepted for interface compatibility but not
        read by this implementation (the top loss is hard-coded to a halved
        mean squared error).
    Returns:
      output_dict (dict): currently always empty; the test-set evaluation that
        would fill it is not implemented here.
    Raises:
      ValueError: if `activation` is not one of the supported names.
  """

  # data
  if dataset == 'cifar':
    data = ds.cifar10_data()
    data_test = ds.cifar10_data_test()
  elif dataset == 'mnist':
    data = ds.mnist_data()
    data_test = ds.mnist_data_test()
  else:
    # set train and test the same. change later.
    data = dataset
    data_test = dataset

  if preprocess:
    from sklearn.decomposition import PCA
    pca = PCA(n_components=1000, whiten=True)
    data.inputs = pca.fit_transform(data.inputs)
    data_test.inputs = pca.transform(data_test.inputs)

  # autoencoderify
  if mode == 'autoencoder':
    data.outputs = data.inputs
    data_test.outputs = data_test.inputs

  # model parameters / architecture
  m_dim = data.inputs.shape[1] # input dimension
  p_dim = data.outputs.shape[1] # output dimension

  # list(...) so that tuples are accepted too; builds a new list, the
  # (mutable) default argument is never modified.
  l_dim = [m_dim] + list(l_dim) + [p_dim] # layer dimensions
  layers = len(l_dim)-1

  # operations from operations.py
  lin = ops.linear()
  add = ops.addition()

  # set activation function
  if activation == 'tanh':
    tf_act = tf.nn.tanh
    op_act = ops.tanh()
  elif activation == 'linear':
    tf_act = tf.identity
    op_act = ops.identity()
  elif activation == 'sigmoid':
    tf_act = tf.nn.sigmoid
    op_act = ops.sigmoid()
  elif activation == 'relu':
    tf_act = tf.nn.relu
    op_act = ops.relu()
  else:
    # Previously an unknown name fell through and failed later with an
    # unrelated NameError on tf_act / op_act.
    raise ValueError("Unsupported activation: %r" % (activation,))

  # put activations in lists
  acts    = (layers+1)*[None] # activation functions
  tf_acts = (layers+1)*[None] # activation functions
  for l in range(1, layers):
    acts[l]    = op_act
    tf_acts[l] = tf_act
  acts[-1]    = ops.identity() # last activation function is just identity, so we can offload the pre-loss nonlinearity to the 'loss' layer
  tf_acts[-1] = tf.identity

  def nonlin_layer(x_in, W_in, b_in):
    return tf_act(tf.matmul(x_in, W_in) + b_in)
  def affine_layer(x_in, W_in, b_in):
    return tf.matmul(x_in, W_in) + b_in

  # put op functions in lists: hidden layers are nonlinear, the top is affine
  f = (layers+1)*[None]
  for l in range(1, layers):
    f[l] = nonlin_layer
  f[-1] = affine_layer

  # initialize variable lists
  W = (layers+1)*[None] # forward weights
  b = (layers+1)*[None] # biases

  train_op_W = (layers+1)*[None]
  train_op_p = (layers+1)*[None]
  train_op_tx = (layers+1)*[None]

  summary_ops = (layers+1)*[None]

  # initialize activation lists
  x = (layers+1)*[None]   # forward activations
  tx = (layers+1)*[None]  # per-layer targets
  p = (layers+1)*[None]   # per-layer Lagrange multipliers

  loss = (layers+1)*[None]
  tloss = (layers+1)*[None]
  ploss = (layers+1)*[None]

  # create tensorflow graph with layer-local loss functions
  tf.reset_default_graph()

  # placeholders
  x[0] = tf.placeholder(tf.float32, shape=[None, l_dim[0]], name='input')
  tx[-1] = tf.placeholder(tf.float32, shape=[None, l_dim[-1]], name='output')

  # 0 layer stuff
  tx[0] = tf.get_variable('layer0_ffx_tar', shape=[batch_size, l_dim[0]], dtype=tf.float32, initializer=tf.constant_initializer(0.))

  loss[0] = 0.
  tloss[0] = 0.
  ploss[0] = 0.

  opt = tf.train.RMSPropOptimizer(learning_rate)

  for l in range(1, layers+1):
    with tf.name_scope('layer'+str(l)+'_ff') as scope:
      W[l] = tf.get_variable(scope+'W', shape=[l_dim[l-1], l_dim[l]], dtype=tf.float32, initializer=tf.orthogonal_initializer(0.95))
      b[l] = tf.get_variable(scope+'b', shape=[1, l_dim[l]], dtype=tf.float32, initializer=tf.constant_initializer(0.))
      x[l] = f[l](x[l-1], W[l], b[l])

      # NOTE(review): for l == layers this rebinds tx[-1], replacing the
      # 'output' placeholder created above with a variable; the feed_dict
      # below therefore feeds this variable. Kept as-is to preserve the
      # existing training behaviour -- confirm whether this is intended.
      tx[l] = tf.get_variable(scope+'x_tar', shape=[batch_size, l_dim[l]], dtype=tf.float32, initializer=tf.constant_initializer(0.))
      p[l] = tf.get_variable(scope+'p', shape=[batch_size, l_dim[l]], dtype=tf.float32, initializer=tf.constant_initializer(0.))

      # layer-local loss: halved mean squared error to the layer's target
      loss[l] = 0.5*tf.reduce_mean((x[l] - tx[l])**2.)
      if l == layers:
        correct_prediction = tf.equal(tf.argmax( x[l], 1 ), tf.argmax( tx[l], 1 ))
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
        accuracy_summary = tf.summary.scalar('accuracy', accuracy)

      summary_ops[l] = tf.summary.scalar('log_loss'+str(l), tf.log(loss[l]))

      # target loss term
      tloss[l] = 0.5*gamma*tf.nn.l2_loss(f[l](tx[l-1], W[l], b[l]) - tx[l])

      # Lagrange multiplier term
      ploss[l] = tf.reduce_sum(tf.multiply(p[l], f[l](tx[l-1], W[l], b[l]) - tx[l]))

      train_op_W[l] = opt.minimize(loss[l] + tloss[l] + ploss[l], var_list=[W[l], b[l]])
      # the multiplier update minimizes -ploss, i.e. ascends on ploss
      train_op_p[l] = tf.train.GradientDescentOptimizer(gamma).minimize(-ploss[l], var_list=[p[l]])

  # each target appears in its own layer's terms and in the next layer's
  for l in range(0, layers):
    train_op_tx[l] = opt.minimize(loss[l] + tloss[l] + tloss[l+1] + ploss[l] + ploss[l+1], var_list=[tx[l]])

  merged = tf.summary.merge_all()
  writer = tf.summary.FileWriter(tb_path)

  sess = tf.Session()
  try:
    sess.run(tf.global_variables_initializer())

    for t in range(t_steps+1):
      if SGD:
        x0, y = data.next_batch(batch_size)
      else:
        x0 = data.inputs[:batch_size]
        y = data.outputs[:batch_size]

      feed_dict = {x[0]: x0, tx[-1]: y}

      # alternate target and weight updates, bottom to top
      sess.run(train_op_tx[0], feed_dict=feed_dict)
      for l in range(1, layers):
        sess.run(train_op_tx[l], feed_dict=feed_dict)
        sess.run(train_op_W[l], feed_dict=feed_dict)
      sess.run(train_op_W[-1], feed_dict=feed_dict)

      # multipliers are updated less often than targets / weights
      if t % 5 == 0:
        for l in range(1, layers+1):
          sess.run(train_op_p[l], feed_dict=feed_dict)

      writer.add_summary(sess.run(merged, feed_dict=feed_dict), t)
      if t % 20 == 0:
        # single-argument print: same text under Python 2 and Python 3
        print('Iter:  %s Loss, accuracy:  %s'
              % (t, sess.run([loss[-1], accuracy], feed_dict=feed_dict)))
  finally:
    # release the session and flush/close the event file even on failure
    sess.close()
    writer.close()

  # TODO: evaluate on data_test and fill 'L_test' / 'accuracy_test' (and a
  # 'reconstruction' entry in autoencoder mode); not implemented yet.
  output_dict = {}

  return output_dict
示例#52
0
def self_cross_attention(
        config,
        context_embedded,
        context_len,
        candidate_embedded,
        candidate_len
        ):
    """Score candidates against a context with stacked self/cross attention.

    Builds `config['stack_num']` levels of self-attention over the context and
    the candidates, cross-attends every level in both directions, combines the
    stacked representations into matching features, aggregates them with a 3-D
    CNN and projects to one logit per candidate.

    Shapes below are taken from the original author's annotations.

    :param config: mapping; only `config['stack_num']` is read here
    :param context_embedded: shape = (batch_size, max_turn_num, max_turn_len, emb_size)
    :param context_len: shape = (batch_size, max_turn_num )
    :param candidate_embedded: shape = (batch_size, options_num, max_turn_len, emb_size)
    :param candidate_len: shape = (batch_size, options_num)
    :return: (probs, logits); logits are reshaped to [-1, self_F.shape[1]]
        and probs is their softmax
    """
    C_stack = [context_embedded]
    R_stack = [candidate_embedded]
    CR_stack = []
    RC_stack = []
    self_C = context_embedded
    self_R = candidate_embedded

    for i in range(config['stack_num']):
        with tf.variable_scope('self_stack_' + str(i), reuse=tf.AUTO_REUSE):
            # self_C.shape = (batch_size, max_turn_num, max_turn_len, emb_size)
            self_C = pab.self_block(Q=self_C, K=self_C, V=self_C, Q_lengths=context_len, K_lengths=context_len)
            # self_R.shape = (batch_size, options_num, max_turn_len, emb_size)
            self_R = pab.self_block(Q=self_R, K=self_R, V=self_R, Q_lengths=candidate_len, K_lengths=candidate_len)
        C_stack.append(self_C)
        R_stack.append(self_R)

        # Cross attention uses level i, i.e. the representations *before*
        # the self block that was just applied; the final level is handled
        # after the loop.
        # `reuse` must be passed by keyword: the second positional parameter
        # of tf.variable_scope is `default_name`, so a positional
        # tf.AUTO_REUSE was silently ignored.
        with tf.variable_scope('C_at_R_stack_' + str(i), reuse=tf.AUTO_REUSE):
            # cross_CR.shape = (batch_size, max_turn_num, options_num, max_turn_len, emb_size)
            cross_CR = pab.cross_block(Q=C_stack[i], K=R_stack[i], V=R_stack[i], Q_lengths=context_len, K_lengths=candidate_len)
        # The level index belongs in the scope name; passing str(i) as a
        # separate argument made every level share the scope 'R_at_C_stack_'.
        # NOTE: if cross_block creates variables, their names change with
        # this fix and older checkpoints will not match.
        with tf.variable_scope('R_at_C_stack_' + str(i), reuse=tf.AUTO_REUSE):
            # cross_RC.shape = (batch_size, options_num, max_turn_num, max_turn_len, emb_size)
            cross_RC = pab.cross_block(Q=R_stack[i], K=C_stack[i], V=C_stack[i], Q_lengths=candidate_len, K_lengths=context_len)
        CR_stack.append(cross_CR)
        RC_stack.append(cross_RC)

    # cross attention on the top-most self-attention level
    CR_stack.append(pab.cross_block(Q=C_stack[-1], K=R_stack[-1], V=R_stack[-1], Q_lengths=context_len, K_lengths=candidate_len))
    RC_stack.append(pab.cross_block(Q=R_stack[-1], K=C_stack[-1], V=C_stack[-1], Q_lengths=candidate_len, K_lengths=context_len))

    # NOTE(review): the 200.0 under the square root is presumably emb_size
    # (scaled dot product) -- confirm before changing the embedding width.
    # self_feature.shape = (batch_size, options_num, max_turn_num, max_turn_len, max_turn_len, stack_num)
    self_F = tf.einsum('bijks,bmnks->bimjns', tf.stack(R_stack, axis=-1), tf.stack(C_stack, axis=-1)) / tf.sqrt(200.0)
    # cross_feature.shape = (batch_size, options_num, max_turn_num, max_turn_len, max_turn_len, stack_num)
    cross_F = tf.einsum('bijkls,bjizls->bijkzs', tf.stack(RC_stack, axis=-1), tf.stack(CR_stack, axis=-1)) / tf.sqrt(200.0)

    # feature.shape = (batch_size * options_num, max_turn_num, max_turn_len, max_turn_len, stack_num)
    feature = tf.reshape(tf.concat([self_F, cross_F], axis=-1), shape=[-1, self_F.shape[2], self_F.shape[3], self_F.shape[4], self_F.shape[5]+cross_F.shape[5]])
    with tf.variable_scope('cnn_aggregation'):
        final_info = pop.CNN_3d(feature, 32, 16)
    with tf.variable_scope('linear'):
        W = tf.get_variable(
            name='weights',
            shape=[final_info.shape[-1], 1],
            initializer=tf.orthogonal_initializer())
        bias = tf.get_variable(
            name='bias',
            shape=[1],
            initializer=tf.zeros_initializer())

        # one logit per (example, candidate); regroup per example
        logits = tf.reshape(tf.matmul(final_info, W) + bias, [-1, self_F.shape[1]])

    probs = tf.nn.softmax(logits)

    return probs, logits
示例#53
0
 def testDuplicatedInitializer(self):
   """Expect `duplicated_initializer` to be falsy for an orthogonal init."""
   shape = (10, 10)
   is_duplicated = duplicated_initializer(
       self, tf.orthogonal_initializer(), 1, shape)
   self.assertFalse(is_duplicated)
示例#54
0
 def lstm_cell(self):
     """Return a new LSTMCell of `self.rnn_size` units, orthogonally initialized."""
     orthogonal = tf.orthogonal_initializer()
     new_cell = tf.nn.rnn_cell.LSTMCell(self.rnn_size, initializer=orthogonal)
     return new_cell
示例#55
0
    def __call__(self, inputs, state, scope=None):
        """Run one step of the forget-gate-free LSTM cell.

        Args:
            inputs: input tensor for this step; concatenated with `h` along
                axis 1.
            state: pair `(c, h)` of cell state and hidden state.
            scope: optional variable scope name; defaults to
                "simple_lstm_cell".

        Returns:
            `(new_h, (new_c, new_h))`: the step output and the next state.
        """
        with tf.variable_scope(scope or "simple_lstm_cell", reuse=self._reuse):
            c, h = state
            if not hasattr(self, '_wi'):
                # Variables are created on the first call only and cached on
                # the instance; later calls reuse the cached tensors.
                gate_in_dim = inputs.get_shape()[-1] + h.get_shape()[-1]
                w_shape = [gate_in_dim, self._num_units]
                b_shape = [self._num_units]
                self._wi = tf.get_variable('simple_lstm_cell_wi', dtype=tf.float32, shape=w_shape, initializer=tf.orthogonal_initializer())
                self._bi = tf.get_variable('simple_lstm_cell_bi', dtype=tf.float32, shape=b_shape, initializer=tf.constant_initializer(0.0))
                self._wo = tf.get_variable('simple_lstm_cell_wo', dtype=tf.float32, shape=w_shape, initializer=tf.orthogonal_initializer())
                self._bo = tf.get_variable('simple_lstm_cell_bo', dtype=tf.float32, shape=b_shape, initializer=tf.constant_initializer(0.0))
                self._wc = tf.get_variable('simple_lstm_cell_wc', dtype=tf.float32, shape=w_shape, initializer=tf.orthogonal_initializer())
                self._bc = tf.get_variable('simple_lstm_cell_bc', dtype=tf.float32, shape=b_shape, initializer=tf.constant_initializer(0.0))
            # All three projections read the same [inputs, h] concatenation;
            # build it once instead of once per gate.
            gate_input = tf.concat([inputs, h], 1)
            i = tf.nn.sigmoid(tf.matmul(gate_input, self._wi) + self._bi)
            o = tf.nn.sigmoid(tf.matmul(gate_input, self._wo) + self._bo)
            _c = self._activation(tf.matmul(gate_input, self._wc) + self._bc)
            # remove forget gate according to the paper
            new_c = c + i * _c
            new_h = o * self._activation(new_c)

            return new_h, (new_c, new_h)
        h = slim.stack(tf.divide(x, 4.0),
                       slim.fully_connected, [n_hidden] * n_layer,
                       activation_fn=tf.nn.relu)
        log_d = slim.fully_connected(h, 1, activation_fn=None)
    return log_d


# Start from an empty default graph before building the GAN.
tf.reset_default_graph()

# Real samples and latent noise, both drawn with the configured batch size.
data = sample_mog(params['batch_size'])

_latent_prior = ds.Normal(tf.zeros(params['z_dim']),
                          tf.ones(params['z_dim']))
noise = _latent_prior.sample(params['batch_size'])

# Generator and discriminator share an orthogonal weight initializer for every
# slim.fully_connected layer created inside this scope.
_fc_init = tf.orthogonal_initializer(gain=0.8)
with slim.arg_scope([slim.fully_connected], weights_initializer=_fc_init):
    samples = generator(noise, output_dim=params['x_dim'])
    real_score = discriminator(data)
    fake_score = discriminator(samples, reuse=True)

# Value function: D maximizes this, G minimizes this + a regularizer.
_real_ce = tf.nn.sigmoid_cross_entropy_with_logits(
    logits=real_score, labels=tf.ones_like(real_score))
_fake_ce = tf.nn.sigmoid_cross_entropy_with_logits(
    logits=fake_score, labels=tf.zeros_like(fake_score))
V = -tf.reduce_mean(_real_ce + _fake_ce)

# Trainable variables of each network, looked up by scope name.
_trainable = tf.GraphKeys.TRAINABLE_VARIABLES
gen_vars = tf.get_collection(_trainable, "generator")
disc_vars = tf.get_collection(_trainable, "discriminator")
示例#57
0
 def testInitializerDifferent(self):
   """Expect `identicaltest` to be falsy for seeds 1 and 2, per float dtype."""
   shape = (10, 10)
   for float_type in (tf.float32, tf.float64):
     first = tf.orthogonal_initializer(seed=1, dtype=float_type)
     second = tf.orthogonal_initializer(seed=2, dtype=float_type)
     differs_check = identicaltest(self, first, second, shape)
     self.assertFalse(differs_check)
示例#58
0
def smn_model(input_x,
              input_x_mask,
              input_y,
              input_y_mask,
              word_emb,
              keep_rate,
              conf,
              x_len=None,
              y_len=None):
    """Build a multi-turn matching graph and return the final GRU state.

    Each context utterance is matched against the response twice (on raw
    embeddings and on GRU outputs), the two match matrices go through a
    conv / max-pool / dense stack, and the per-utterance matching vectors are
    accumulated by a second GRU.

    Args:
        input_x: ids looked up in `word_emb` for the context turns; the
            result is unstacked along axis 1 into `conf["max_turn_num"]`
            utterances.
        input_x_mask: not referenced in this function.
        input_y: ids looked up in `word_emb` for the response.
        input_y_mask: not referenced in this function.
        word_emb: embedding matrix passed to `tf.nn.embedding_lookup`.
        keep_rate: not referenced in this function.
        conf: mapping; only `conf["max_turn_num"]` is read.
        x_len: per-turn lengths, unstacked along axis 1 and used as
            `sequence_length`. NOTE(review): it is passed straight to
            `tf.unstack`, so the `None` default is presumably not usable --
            confirm with callers.
        y_len: response lengths used as `sequence_length`.

    Returns:
        The final state returned by `tf.nn.dynamic_rnn` for `final_GRU`.
    """

    turns1_e = tf.nn.embedding_lookup(word_emb, input_x)
    response_e = tf.nn.embedding_lookup(word_emb, input_y)

    response_embeddings = response_e

    # hidden size shared by both GRUs and by A_matrix
    rnn_units = 200
    sentence_GRU = tf.nn.rnn_cell.GRUCell(
        rnn_units, kernel_initializer=tf.orthogonal_initializer())
    # split the context into one tensor (and one length vector) per turn
    all_utterance_embeddings = tf.unstack(turns1_e,
                                          num=conf["max_turn_num"],
                                          axis=1)
    all_utterance_len = tf.unstack(x_len, num=conf["max_turn_num"], axis=1)
    # bilinear weight applied between utterance and response GRU outputs
    A_matrix = tf.get_variable(
        'A_matrix_v',
        shape=(rnn_units, rnn_units),
        initializer=tf.contrib.layers.xavier_initializer(),
        dtype=tf.float32)
    final_GRU = tf.nn.rnn_cell.GRUCell(
        rnn_units, kernel_initializer=tf.orthogonal_initializer())
    # None on the first loop iteration (layers get created), True afterwards
    reuse = None

    response_GRU_embeddings, _ = tf.nn.dynamic_rnn(sentence_GRU,
                                                   response_embeddings,
                                                   sequence_length=y_len,
                                                   dtype=tf.float32,
                                                   scope='sentence_GRU')
    # swap the last two axes so the response can be the right-hand matmul operand
    response_embeddings = tf.transpose(response_embeddings, perm=[0, 2, 1])
    response_GRU_embeddings = tf.transpose(response_GRU_embeddings,
                                           perm=[0, 2, 1])
    matching_vectors = []
    for utterance_embeddings, utterance_len in zip(all_utterance_embeddings,
                                                   all_utterance_len):
        # match on raw embeddings
        matrix1 = tf.matmul(utterance_embeddings, response_embeddings)
        # same cell object and scope name as for the response above
        utterance_GRU_embeddings, _ = tf.nn.dynamic_rnn(
            sentence_GRU,
            utterance_embeddings,
            sequence_length=utterance_len,
            dtype=tf.float32,
            scope='sentence_GRU')
        # match on GRU outputs through A_matrix
        matrix2 = tf.einsum('aij,jk->aik', utterance_GRU_embeddings,
                            A_matrix)  # TODO:check this
        matrix2 = tf.matmul(matrix2, response_GRU_embeddings)
        # the two match matrices become two channels on a new last axis
        matrix = tf.stack([matrix1, matrix2], axis=3, name='matrix_stack')
        conv_layer = tf.layers.conv2d(
            matrix,
            filters=8,
            kernel_size=(3, 3),
            padding='VALID',
            kernel_initializer=tf.contrib.keras.initializers.he_normal(),
            activation=tf.nn.relu,
            reuse=reuse,
            name='conv')  # TODO: check other params
        pooling_layer = tf.layers.max_pooling2d(
            conv_layer, (3, 3),
            strides=(3, 3),
            padding='VALID',
            name='max_pooling')  # TODO: check other params
        matching_vector = tf.layers.dense(
            tf.contrib.layers.flatten(pooling_layer),
            50,
            kernel_initializer=tf.contrib.layers.xavier_initializer(),
            activation=tf.tanh,
            reuse=reuse,
            name='matching_v')  # TODO: check whether this is correct
        # after the first utterance the 'conv' / 'matching_v' layers are reused
        if not reuse:
            reuse = True
        matching_vectors.append(matching_vector)
    # stacking on axis 0 puts the turn index first, hence time_major=True
    _, last_hidden = tf.nn.dynamic_rnn(
        final_GRU,
        tf.stack(matching_vectors, axis=0, name='matching_stack'),
        dtype=tf.float32,
        time_major=True,
        scope='final_GRU')  # TODO: check time_major
    # Disabled classification head / loss / optimizer, kept for reference:
    #logits = tf.layers.dense(last_hidden, 2, kernel_initializer=tf.contrib.layers.xavier_initializer(), name='final_v')
    #self.y_pred = tf.nn.softmax(logits)
    #self.total_loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.y_true, logits=logits))
    #tf.summary.scalar('loss', self.total_loss)
    #optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
    #self.train_op = optimizer.minimize(self.total_loss)

    return last_hidden
示例#59
0
 def cell():
     """Return a new orthogonally initialized LSTMCell of `self.cell_size` units.

     `self` is not a parameter; it is resolved from the enclosing scope.
     """
     return tf.nn.rnn_cell.LSTMCell(
         self.cell_size,
         initializer=tf.orthogonal_initializer())
def fc_layer(inputs, units, activation_fn=tf.nn.relu, gain=1.0):
    """Apply a dense layer whose kernel is orthogonally initialized.

    Args:
        inputs: tensor fed to `tf.layers.dense`.
        units: number of output units.
        activation_fn: activation passed to the layer (ReLU by default).
        gain: gain handed to `tf.orthogonal_initializer`.

    Returns:
        The output tensor of the dense layer.
    """
    kernel_init = tf.orthogonal_initializer(gain)
    layer_out = tf.layers.dense(
        inputs=inputs,
        units=units,
        activation=activation_fn,
        kernel_initializer=kernel_init,
    )
    return layer_out