def __init__(self,
             input_size,
             state_size,
             batch_size,
             use_layer_norm=False,
             nematus_compat=False,
             dropout_input=None,
             dropout_state=None):
    self.state_to_gates = tf.Variable(
        numpy.concatenate([ortho_weight(state_size),
                           ortho_weight(state_size)], axis=1),
        name='state_to_gates')
    self.input_to_gates = tf.Variable(
        numpy.concatenate([norm_weight(input_size, state_size),
                           norm_weight(input_size, state_size)], axis=1),
        name='input_to_gates')
    self.gates_bias = tf.Variable(
        numpy.zeros((2 * state_size,)).astype('float32'),
        name='gates_bias')
    self.state_to_proposal = tf.Variable(ortho_weight(state_size),
                                         name='state_to_proposal')
    self.input_to_proposal = tf.Variable(norm_weight(input_size, state_size),
                                         name='input_to_proposal')
    self.proposal_bias = tf.Variable(
        numpy.zeros((state_size,)).astype('float32'),
        name='proposal_bias')
    self.nematus_compat = nematus_compat
    self.use_layer_norm = use_layer_norm
    if self.use_layer_norm:
        with tf.name_scope('gates_x_norm'):
            self.gates_x_norm = LayerNormLayer(2 * state_size)
        with tf.name_scope('gates_state_norm'):
            self.gates_state_norm = LayerNormLayer(2 * state_size)
        with tf.name_scope('proposal_x_norm'):
            self.proposal_x_norm = LayerNormLayer(state_size)
        with tf.name_scope('proposal_state_norm'):
            self.proposal_state_norm = LayerNormLayer(state_size)

    # Create dropout masks for input values (reused at every timestep).
    if dropout_input is None:
        self.dropout_mask_input_to_gates = None
        self.dropout_mask_input_to_proposal = None
    else:
        ones = tf.ones([batch_size, input_size])
        self.dropout_mask_input_to_gates = dropout_input(ones)
        self.dropout_mask_input_to_proposal = dropout_input(ones)

    # Create dropout masks for state values (reused at every timestep).
    if dropout_state is None:
        self.dropout_mask_state_to_gates = None
        self.dropout_mask_state_to_proposal = None
    else:
        ones = tf.ones([batch_size, state_size])
        self.dropout_mask_state_to_gates = dropout_state(ones)
        self.dropout_mask_state_to_proposal = dropout_state(ones)
def param_init_gru(options, params, prefix='gru', nin=None, dim=None):
    if nin is None:
        nin = options['dim_proj']
    if dim is None:
        dim = options['dim_proj']

    # embedding to gates transformation weights, biases
    W = numpy.concatenate([norm_weight(nin, dim),
                           norm_weight(nin, dim)], axis=1)
    params[pp(prefix, 'W')] = W
    params[pp(prefix, 'b')] = numpy.zeros((2 * dim,)).astype('float32')

    # recurrent transformation weights for gates
    U = numpy.concatenate([ortho_weight(dim),
                           ortho_weight(dim)], axis=1)
    params[pp(prefix, 'U')] = U

    # embedding to hidden state proposal weights, biases
    Wx = norm_weight(nin, dim)
    params[pp(prefix, 'Wx')] = Wx
    params[pp(prefix, 'bx')] = numpy.zeros((dim,)).astype('float32')

    # recurrent transformation weights for hidden state proposal
    Ux = ortho_weight(dim)
    params[pp(prefix, 'Ux')] = Ux

    return params
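# Illustrative sketch (not part of the original code): the GRU recurrence that
# the parameters initialised above (W, b, U, Wx, bx, Ux) feed, written in
# plain numpy following the usual dl4mt/Nematus convention. The function name,
# the sigmoid helper and the shapes are assumptions made for illustration; the
# real step function lives elsewhere in the codebase.
import numpy

def _sigmoid_sketch(x):
    return 1.0 / (1.0 + numpy.exp(-x))

def gru_step_sketch(x_t, h_prev, W, b, U, Wx, bx, Ux):
    """One GRU step; x_t is (batch, nin), h_prev is (batch, dim)."""
    dim = h_prev.shape[1]
    preact = numpy.dot(h_prev, U) + numpy.dot(x_t, W) + b    # (batch, 2 * dim)
    r = _sigmoid_sketch(preact[:, :dim])                     # reset gate
    u = _sigmoid_sketch(preact[:, dim:])                     # update gate
    preactx = numpy.dot(h_prev, Ux) * r + numpy.dot(x_t, Wx) + bx
    h_tilde = numpy.tanh(preactx)                            # proposal state
    return u * h_prev + (1.0 - u) * h_tilde                  # new hidden state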
def __init__(self,
             context,
             context_state_size,
             context_mask,
             state_size,
             hidden_size,
             use_layer_norm=False,
             dropout_context=None,
             dropout_state=None):
    init = initializers.norm_weight(state_size, hidden_size)
    self.state_to_hidden = tf.compat.v1.get_variable('state_to_hidden',
                                                     initializer=init)
    #TODO: Nematus uses ortho_weight here - important?
    init = initializers.norm_weight(context_state_size, hidden_size)
    self.context_to_hidden = tf.compat.v1.get_variable('context_to_hidden',
                                                       initializer=init)
    self.hidden_bias = tf.compat.v1.get_variable(
        'hidden_bias', [hidden_size], initializer=tf.zeros_initializer)
    init = initializers.norm_weight(hidden_size, 1)
    self.hidden_to_score = tf.compat.v1.get_variable('hidden_to_score',
                                                     initializer=init)
    self.use_layer_norm = use_layer_norm
    if self.use_layer_norm:
        with tf.compat.v1.variable_scope('hidden_context_norm'):
            self.hidden_context_norm = self.use_layer_norm(
                layer_size=hidden_size)
        with tf.compat.v1.variable_scope('hidden_state_norm'):
            self.hidden_state_norm = self.use_layer_norm(
                layer_size=hidden_size)
    self.context = context
    self.context_mask = context_mask

    batch_size = tf.shape(input=context)[1]

    # Create a dropout mask for context values (reused at every timestep).
    if dropout_context is None:
        self.dropout_mask_context_to_hidden = None
    else:
        ones = tf.ones([batch_size, context_state_size])
        self.dropout_mask_context_to_hidden = dropout_context(ones)

    # Create a dropout mask for state values (reused at every timestep).
    if dropout_state is None:
        self.dropout_mask_state_to_hidden = None
    else:
        ones = tf.ones([batch_size, state_size])
        self.dropout_mask_state_to_hidden = dropout_state(ones)

    # precompute these activations, they are the same at each step
    # Ideally the compiler would have figured out that too
    context = apply_dropout_mask(context,
                                 self.dropout_mask_context_to_hidden, True)
    self.hidden_from_context = matmul3d(context, self.context_to_hidden)
    self.hidden_from_context += self.hidden_bias
    if self.use_layer_norm:
        self.hidden_from_context = \
            self.hidden_context_norm.forward(self.hidden_from_context)
def __init__(self,
             context,
             context_state_size,
             context_mask,
             state_size,
             hidden_size,
             use_layer_norm=False,
             dropout_context=None,
             dropout_state=None):
    self.state_to_hidden = tf.Variable(
        norm_weight(state_size, hidden_size),
        name='state_to_hidden')
    self.context_to_hidden = tf.Variable( #TODO: Nematus uses ortho_weight here - important?
        norm_weight(context_state_size, hidden_size),
        name='context_to_hidden')
    self.hidden_bias = tf.Variable(
        numpy.zeros((hidden_size,)).astype('float32'),
        name='hidden_bias')
    self.hidden_to_score = tf.Variable(
        norm_weight(hidden_size, 1),
        name='hidden_to_score')
    self.use_layer_norm = use_layer_norm
    if self.use_layer_norm:
        with tf.name_scope('hidden_context_norm'):
            self.hidden_context_norm = LayerNormLayer(layer_size=hidden_size)
        with tf.name_scope('hidden_state_norm'):
            self.hidden_state_norm = LayerNormLayer(layer_size=hidden_size)
    self.context = context
    self.context_mask = context_mask

    batch_size = tf.shape(context)[1]

    # Create a dropout mask for context values (reused at every timestep).
    if dropout_context is None:
        self.dropout_mask_context_to_hidden = None
    else:
        ones = tf.ones([batch_size, context_state_size])
        self.dropout_mask_context_to_hidden = dropout_context(ones)

    # Create a dropout mask for state values (reused at every timestep).
    if dropout_state is None:
        self.dropout_mask_state_to_hidden = None
    else:
        ones = tf.ones([batch_size, state_size])
        self.dropout_mask_state_to_hidden = dropout_state(ones)

    # precompute these activations, they are the same at each step
    # Ideally the compiler would have figured out that too
    context = apply_dropout_mask(context,
                                 self.dropout_mask_context_to_hidden, True)
    self.hidden_from_context = matmul3d(context, self.context_to_hidden)
    self.hidden_from_context += self.hidden_bias
    if self.use_layer_norm:
        self.hidden_from_context = \
            self.hidden_context_norm.forward(self.hidden_from_context,
                                             input_is_3d=True)
def __init__(self,
             context,
             context_state_size,
             context_mask,
             state_size,
             hidden_size,
             use_layer_norm=False,
             dropout_context=None,
             dropout_state=None):
    init = initializers.norm_weight(state_size, hidden_size)
    self.state_to_hidden = tf.get_variable('state_to_hidden',
                                           initializer=init)
    #TODO: Nematus uses ortho_weight here - important?
    init = initializers.norm_weight(context_state_size, hidden_size)
    self.context_to_hidden = tf.get_variable('context_to_hidden',
                                             initializer=init)
    self.hidden_bias = tf.get_variable('hidden_bias', [hidden_size],
                                       initializer=tf.zeros_initializer)
    init = initializers.norm_weight(hidden_size, 1)
    self.hidden_to_score = tf.get_variable('hidden_to_score',
                                           initializer=init)
    self.use_layer_norm = use_layer_norm
    if self.use_layer_norm:
        with tf.variable_scope('hidden_context_norm'):
            self.hidden_context_norm = LayerNormLayer(layer_size=hidden_size)
        with tf.variable_scope('hidden_state_norm'):
            self.hidden_state_norm = LayerNormLayer(layer_size=hidden_size)
    self.context = context
    self.context_mask = context_mask

    batch_size = tf.shape(context)[1]

    # Create a dropout mask for context values (reused at every timestep).
    if dropout_context is None:
        self.dropout_mask_context_to_hidden = None
    else:
        ones = tf.ones([batch_size, context_state_size])
        self.dropout_mask_context_to_hidden = dropout_context(ones)

    # Create a dropout mask for state values (reused at every timestep).
    if dropout_state is None:
        self.dropout_mask_state_to_hidden = None
    else:
        ones = tf.ones([batch_size, state_size])
        self.dropout_mask_state_to_hidden = dropout_state(ones)

    # precompute these activations, they are the same at each step
    # Ideally the compiler would have figured out that too
    context = apply_dropout_mask(context,
                                 self.dropout_mask_context_to_hidden, True)
    self.hidden_from_context = matmul3d(context, self.context_to_hidden)
    self.hidden_from_context += self.hidden_bias
    if self.use_layer_norm:
        self.hidden_from_context = \
            self.hidden_context_norm.forward(self.hidden_from_context)
def __init__(self, vocabulary_sizes, dim_per_factor):
    assert len(vocabulary_sizes) == len(dim_per_factor)
    self.embedding_matrices = [
        tf.Variable(norm_weight(vocab_size, dim), name='embeddings')
        for vocab_size, dim in zip(vocabulary_sizes, dim_per_factor)]
def __init__(self, vocabulary_sizes, dim_per_factor):
    assert len(vocabulary_sizes) == len(dim_per_factor)
    self.embedding_matrices = []
    for i in range(len(vocabulary_sizes)):
        vocab_size, dim = vocabulary_sizes[i], dim_per_factor[i]
        var_name = 'embeddings' if i == 0 else 'embeddings_' + str(i)
        init = initializers.norm_weight(vocab_size, dim)
        matrix = tf.get_variable(var_name, initializer=init)
        self.embedding_matrices.append(matrix)
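# Illustrative sketch (not part of the original code): how per-factor embedding
# matrices such as the ones above are typically used -- each factor's token ids
# index that factor's matrix and the resulting vectors are concatenated along
# the last axis. Function name and shapes are assumptions for illustration.
import numpy

def lookup_factored_embeddings_sketch(factor_ids, embedding_matrices):
    """factor_ids: list of (seq_len, batch) int arrays, one per factor."""
    embedded = [matrix[ids]
                for ids, matrix in zip(factor_ids, embedding_matrices)]
    return numpy.concatenate(embedded, axis=-1)  # (seq_len, batch, sum of dims)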
def init_params(options):
    params = OrderedDict()

    # embedding
    params['Wemb'] = norm_weight(options['n_words_src'], options['dim_word'])
    params['Wemb_dec'] = norm_weight(options['n_words_tgt'],
                                     options['dim_word'])

    # encoder: bidirectional RNN
    params = param_init_gru(options, params,
                            prefix='encoder',
                            nin=options['dim_word'],
                            dim=options['dim'])
    params = param_init_gru(options, params,
                            prefix='encoder_r',
                            nin=options['dim_word'],
                            dim=options['dim'])
    ctxdim = 2 * options['dim']

    # init state, init cell
    params = param_init_fflayer(options, params, prefix='ff_state',
                                nin=ctxdim, nout=options['dim'])

    # decoder
    params = param_init_gru_cond(options, params,
                                 prefix='decoder',
                                 nin=options['dim_word'],
                                 dim=options['dim'],
                                 dimctx=ctxdim)

    # readout
    params = param_init_fflayer(options, params, prefix='ff_logit_lstm',
                                nin=options['dim'], nout=options['dim_word'],
                                ortho=False)
    params = param_init_fflayer(options, params, prefix='ff_logit_prev',
                                nin=options['dim_word'],
                                nout=options['dim_word'], ortho=False)
    params = param_init_fflayer(options, params, prefix='ff_logit_ctx',
                                nin=ctxdim, nout=options['dim_word'],
                                ortho=False)
    params = param_init_fflayer(options, params, prefix='ff_logit',
                                nin=options['dim_word'],
                                nout=options['n_words_tgt'])

    return params
def param_init_fflayer(options, params, prefix='ff', nin=None, nout=None,
                       ortho=True):
    if nin is None:
        nin = options['dim_proj']
    if nout is None:
        nout = options['dim_proj']
    params[pp(prefix, 'W')] = norm_weight(nin, nout, scale=0.01, ortho=ortho)
    params[pp(prefix, 'b')] = numpy.zeros((nout,)).astype('float32')

    return params
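# Illustrative sketch (not part of the original code): the feed-forward layer
# that the 'W' and 'b' parameters above parameterise is an affine transform
# followed by a non-linearity (tanh in the usual configuration). Plain-numpy
# version, names assumed, for illustration only.
import numpy

def fflayer_sketch(x, W, b, activation=numpy.tanh):
    """x: (batch, nin), W: (nin, nout), b: (nout,)."""
    return activation(numpy.dot(x, W) + b)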
def __init__(self,
             in_size,
             out_size,
             batch_size,
             non_linearity=tf.nn.tanh,
             W=None,
             use_layer_norm=False,
             dropout_input=None):
    if W is None:
        W = tf.Variable(norm_weight(in_size, out_size), name='W')
    self.W = W
    self.b = tf.Variable(numpy.zeros((out_size,)).astype('float32'),
                         name='b')
    self.non_linearity = non_linearity
    self.use_layer_norm = use_layer_norm
    if use_layer_norm:
        self.layer_norm = LayerNormLayer(layer_size=out_size)

    # Create a dropout mask for input values (reused at every timestep).
    if dropout_input is None:
        self.dropout_mask = None
    else:
        ones = tf.ones([batch_size, in_size])
        self.dropout_mask = dropout_input(ones)
def __init__(self,
             in_size,
             out_size,
             batch_size,
             non_linearity=tf.nn.tanh,
             W=None,
             use_layer_norm=False,
             dropout_input=None):
    if W is None:
        init = initializers.norm_weight(in_size, out_size)
        W = tf.get_variable('W', initializer=init)
    self.W = W
    self.b = tf.get_variable('b', [out_size],
                             initializer=tf.zeros_initializer)
    self.non_linearity = non_linearity
    self.use_layer_norm = use_layer_norm
    if use_layer_norm:
        self.layer_norm = LayerNormLayer(layer_size=out_size)

    # Create a dropout mask for input values (reused at every timestep).
    if dropout_input is None:
        self.dropout_mask = None
    else:
        ones = tf.ones([batch_size, in_size])
        self.dropout_mask = dropout_input(ones)
def __init__(self,
             input_size,
             state_size,
             batch_size,
             use_layer_norm=False,
             legacy_bias_type=LegacyBiasType.NEMATUS_COMPAT_FALSE,
             dropout_input=None,
             dropout_state=None):
    init = tf.concat([
        initializers.ortho_weight(state_size),
        initializers.ortho_weight(state_size)
    ], axis=1)
    self.state_to_gates = tf.get_variable('state_to_gates',
                                          initializer=init)
    if input_size > 0:
        init = tf.concat([
            initializers.norm_weight(input_size, state_size),
            initializers.norm_weight(input_size, state_size)
        ], axis=1)
        self.input_to_gates = tf.get_variable('input_to_gates',
                                              initializer=init)
    if (input_size == 0
            and legacy_bias_type == LegacyBiasType.NEMATUS_COMPAT_FALSE):
        self.gates_bias = None
    else:
        self.gates_bias = tf.get_variable('gates_bias', [2 * state_size],
                                          initializer=tf.zeros_initializer)

    init = initializers.ortho_weight(state_size)
    self.state_to_proposal = tf.get_variable('state_to_proposal',
                                             initializer=init)
    if input_size > 0:
        init = initializers.norm_weight(input_size, state_size)
        self.input_to_proposal = tf.get_variable('input_to_proposal',
                                                 initializer=init)
    if (input_size == 0
            and legacy_bias_type == LegacyBiasType.NEMATUS_COMPAT_FALSE):
        self.proposal_bias = None
    else:
        self.proposal_bias = tf.get_variable(
            'proposal_bias', [state_size],
            initializer=tf.zeros_initializer)

    self.legacy_bias_type = legacy_bias_type
    self.use_layer_norm = use_layer_norm

    self.gates_state_norm = None
    self.proposal_state_norm = None
    self.gates_x_norm = None
    self.proposal_x_norm = None
    if self.use_layer_norm:
        with tf.variable_scope('gates_state_norm'):
            self.gates_state_norm = LayerNormLayer(2 * state_size)
        with tf.variable_scope('proposal_state_norm'):
            self.proposal_state_norm = LayerNormLayer(state_size)
        if input_size > 0:
            with tf.variable_scope('gates_x_norm'):
                self.gates_x_norm = LayerNormLayer(2 * state_size)
            with tf.variable_scope('proposal_x_norm'):
                self.proposal_x_norm = LayerNormLayer(state_size)

    # Create dropout masks for input values (reused at every timestep).
    if dropout_input is None:
        self.dropout_mask_input_to_gates = None
        self.dropout_mask_input_to_proposal = None
    else:
        ones = tf.ones([batch_size, input_size])
        self.dropout_mask_input_to_gates = dropout_input(ones)
        self.dropout_mask_input_to_proposal = dropout_input(ones)

    # Create dropout masks for state values (reused at every timestep).
    if dropout_state is None:
        self.dropout_mask_state_to_gates = None
        self.dropout_mask_state_to_proposal = None
    else:
        ones = tf.ones([batch_size, state_size])
        self.dropout_mask_state_to_gates = dropout_state(ones)
        self.dropout_mask_state_to_proposal = dropout_state(ones)
def __init__(self, vocabulary_size, dim_per_factor):
    self.embedding_matrices = [
        tf.Variable(norm_weight(vocabulary_size, dim), name='embeddings')
        for dim in dim_per_factor]
def init_params(options):
    params = OrderedDict()

    # embedding
    for factor in range(options['factors']):
        params[embedding_name(factor)] = norm_weight(
            options['n_words_src'], options['dim_per_factor'][factor])
    params['Wemb_dec'] = norm_weight(options['n_words'], options['dim_word'])

    # encoder: bidirectional RNN
    params = get_layer_param(options['encoder'])(options, params,
                                                 prefix='encoder',
                                                 nin=options['dim_word'],
                                                 dim=options['dim'])
    params = get_layer_param(options['encoder'])(options, params,
                                                 prefix='encoder_r',
                                                 nin=options['dim_word'],
                                                 dim=options['dim'])
    ctxdim = 2 * options['dim']

    # init_state, init_cell
    params = get_layer_param('ff')(options, params, prefix='ff_state',
                                   nin=ctxdim, nout=options['dim'])

    # decoder
    params = get_layer_param(options['decoder'])(options, params,
                                                 prefix='decoder',
                                                 nin=options['dim_word'],
                                                 dim=options['dim'],
                                                 dimctx=ctxdim)

    # readout
    params = get_layer_param('ff')(options, params, prefix='ff_logit_lstm',
                                   nin=options['dim'],
                                   nout=options['dim_word'], ortho=False)
    params = get_layer_param('ff')(options, params, prefix='ff_logit_prev',
                                   nin=options['dim_word'],
                                   nout=options['dim_word'], ortho=False)
    params = get_layer_param('ff')(options, params, prefix='ff_logit_ctx',
                                   nin=ctxdim, nout=options['dim_word'],
                                   ortho=False)
    params = get_layer_param('ff')(options, params, prefix='ff_logit',
                                   nin=options['dim_word'],
                                   nout=options['n_words'])

    return params
def __init__(self, vocabulary_size, embedding_size):
    self.embeddings = tf.Variable(norm_weight(vocabulary_size,
                                              embedding_size),
                                  name='embeddings')
def __init__(self,
             input_size,
             state_size,
             batch_size,
             use_layer_norm=False,
             legacy_bias_type=LegacyBiasType.NEMATUS_COMPAT_FALSE,
             dropout_input=None,
             dropout_state=None):
    init = tf.concat([initializers.ortho_weight(state_size),
                      initializers.ortho_weight(state_size)], axis=1)
    self.state_to_gates = tf.get_variable('state_to_gates',
                                          initializer=init)
    if input_size > 0:
        init = tf.concat([initializers.norm_weight(input_size, state_size),
                          initializers.norm_weight(input_size, state_size)],
                         axis=1)
        self.input_to_gates = tf.get_variable('input_to_gates',
                                              initializer=init)
    if input_size > 0 or legacy_bias_type == LegacyBiasType.THEANO_A:
        self.gates_bias = tf.get_variable('gates_bias', [2*state_size],
                                          initializer=tf.zeros_initializer)
    else:
        self.gates_bias = None

    init = initializers.ortho_weight(state_size)
    self.state_to_proposal = tf.get_variable('state_to_proposal',
                                             initializer=init)
    if input_size > 0:
        init = initializers.norm_weight(input_size, state_size)
        self.input_to_proposal = tf.get_variable('input_to_proposal',
                                                 initializer=init)
    if input_size > 0 or legacy_bias_type == LegacyBiasType.THEANO_A:
        self.proposal_bias = tf.get_variable('proposal_bias', [state_size],
                                             initializer=tf.zeros_initializer)
    else:
        self.proposal_bias = None

    self.legacy_bias_type = legacy_bias_type
    self.use_layer_norm = use_layer_norm

    self.gates_state_norm = None
    self.proposal_state_norm = None
    self.gates_x_norm = None
    self.proposal_x_norm = None
    if self.use_layer_norm:
        with tf.variable_scope('gates_state_norm'):
            self.gates_state_norm = LayerNormLayer(2*state_size)
        with tf.variable_scope('proposal_state_norm'):
            self.proposal_state_norm = LayerNormLayer(state_size)
        if input_size > 0:
            with tf.variable_scope('gates_x_norm'):
                self.gates_x_norm = LayerNormLayer(2*state_size)
            with tf.variable_scope('proposal_x_norm'):
                self.proposal_x_norm = LayerNormLayer(state_size)

    # Create dropout masks for input values (reused at every timestep).
    if dropout_input is None:
        self.dropout_mask_input_to_gates = None
        self.dropout_mask_input_to_proposal = None
    else:
        ones = tf.ones([batch_size, input_size])
        self.dropout_mask_input_to_gates = dropout_input(ones)
        self.dropout_mask_input_to_proposal = dropout_input(ones)

    # Create dropout masks for state values (reused at every timestep).
    if dropout_state is None:
        self.dropout_mask_state_to_gates = None
        self.dropout_mask_state_to_proposal = None
    else:
        ones = tf.ones([batch_size, state_size])
        self.dropout_mask_state_to_gates = dropout_state(ones)
        self.dropout_mask_state_to_proposal = dropout_state(ones)
def param_init_gru_cond(options, params, prefix='gru_cond', nin=None,
                        dim=None, dimctx=None, nin_nonlin=None,
                        dim_nonlin=None):
    if nin is None:
        nin = options['dim']
    if dim is None:
        dim = options['dim']
    if dimctx is None:
        dimctx = options['dim']
    if nin_nonlin is None:
        nin_nonlin = nin
    if dim_nonlin is None:
        dim_nonlin = dim

    W = numpy.concatenate([norm_weight(nin, dim),
                           norm_weight(nin, dim)], axis=1)
    params[pp(prefix, 'W')] = W
    params[pp(prefix, 'b')] = numpy.zeros((2 * dim,)).astype('float32')
    U = numpy.concatenate([ortho_weight(dim_nonlin),
                           ortho_weight(dim_nonlin)], axis=1)
    params[pp(prefix, 'U')] = U

    Wx = norm_weight(nin_nonlin, dim_nonlin)
    params[pp(prefix, 'Wx')] = Wx
    Ux = ortho_weight(dim_nonlin)
    params[pp(prefix, 'Ux')] = Ux
    params[pp(prefix, 'bx')] = numpy.zeros((dim_nonlin,)).astype('float32')

    U_nl = numpy.concatenate([ortho_weight(dim_nonlin),
                              ortho_weight(dim_nonlin)], axis=1)
    params[pp(prefix, 'U_nl')] = U_nl
    params[pp(prefix, 'b_nl')] = numpy.zeros(
        (2 * dim_nonlin,)).astype('float32')

    Ux_nl = ortho_weight(dim_nonlin)
    params[pp(prefix, 'Ux_nl')] = Ux_nl
    params[pp(prefix, 'bx_nl')] = numpy.zeros((dim_nonlin,)).astype('float32')

    # context to the conditional GRU
    Wc = norm_weight(dimctx, dim * 2)
    params[pp(prefix, 'Wc')] = Wc
    Wcx = norm_weight(dimctx, dim)
    params[pp(prefix, 'Wcx')] = Wcx

    # attention: combined -> hidden
    W_comb_att = norm_weight(dim, dimctx)
    params[pp(prefix, 'W_comb_att')] = W_comb_att

    # attention: context -> hidden
    Wc_att = norm_weight(dimctx)
    params[pp(prefix, 'Wc_att')] = Wc_att

    # attention: hidden bias
    b_att = numpy.zeros((dimctx,)).astype('float32')
    params[pp(prefix, 'b_att')] = b_att

    # attention: hidden -> score weight and bias
    U_att = norm_weight(dimctx, 1)
    params[pp(prefix, 'U_att')] = U_att
    c_att = numpy.zeros((1,)).astype('float32')
    params[pp(prefix, 'c_tt')] = c_att

    return params
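# Illustrative sketch (not part of the original code): the attention computation
# that Wc_att, b_att, W_comb_att, U_att and c_tt above parameterise, in the
# usual dl4mt/Nematus formulation, for a single decoder step and with the
# source mask omitted. Plain numpy; the function name and shapes are
# assumptions made for illustration only.
import numpy

def attention_sketch(context, state, Wc_att, b_att, W_comb_att, U_att, c_tt):
    """context: (src_len, batch, dimctx), state: (batch, dim)."""
    pctx = numpy.dot(context, Wc_att) + b_att               # projected context
    pstate = numpy.dot(state, W_comb_att)                   # projected decoder state
    e = numpy.dot(numpy.tanh(pctx + pstate), U_att) + c_tt  # (src_len, batch, 1)
    e = e.squeeze(-1)                                       # (src_len, batch)
    alpha = numpy.exp(e - e.max(axis=0, keepdims=True))     # softmax over source
    alpha = alpha / alpha.sum(axis=0, keepdims=True)        # attention weights
    return (context * alpha[:, :, None]).sum(axis=0)        # weighted context vector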