def build(self, input_shape):
    if not isinstance(input_shape, list) or len(input_shape) != 2:
        raise ValueError('A `SelfMultiHeadAttention` layer should be called '
                         'on a list of 2 tensors')
    if len(input_shape[0]) != 3 or len(input_shape[1]) != 2:
        raise ValueError(
            'input: [N, T_k, d_model], key masks: [N, key_seqlen]')
    embedding_size = int(input_shape[0][-1])
    if self.num_units is None:
        self.num_units = embedding_size
    # Fused projection matrix for queries, keys and values.
    self.W = self.add_weight(name='Q_K_V',
                             shape=[embedding_size, self.num_units * 3],
                             dtype=tf.float32,
                             initializer=TruncatedNormal(seed=self.seed))
    self.W_output = self.add_weight(name='output_W',
                                    shape=[self.num_units, self.num_units],
                                    dtype=tf.float32,
                                    initializer=TruncatedNormal(seed=self.seed))
    self.layer_norm = LayerNormalization()
    self.attention = DotAttention(scale=self.scale)
    self.softmax_weight_sum = SoftmaxWeightedSum(dropout_rate=self.dropout_rate,
                                                 future_binding=self.future_binding,
                                                 seed=self.seed)
    self.dropout = Dropout(self.dropout_rate, seed=self.seed)
    self.seq_len_max = int(input_shape[0][1])
    # Be sure to call this somewhere!
    super(SelfMultiHeadAttention, self).build(input_shape)
def build(self, input_shape):
    # Create a trainable weight variable for this layer.
    if self.sess_max_count == 1:
        embed_size = input_shape[2].value
        seq_len_max = input_shape[1].value
    else:
        embed_size = input_shape[0][2].value
        seq_len_max = input_shape[0][1].value

    self.sess_bias_embedding = self.add_weight(
        'sess_bias_embedding',
        shape=(self.sess_max_count, 1, 1),
        initializer=TruncatedNormal(mean=0.0, stddev=0.0001, seed=self.seed))
    self.seq_bias_embedding = self.add_weight(
        'seq_bias_embedding',
        shape=(1, seq_len_max, 1),
        initializer=TruncatedNormal(mean=0.0, stddev=0.0001, seed=self.seed))
    self.item_bias_embedding = self.add_weight(
        'item_bias_embedding',
        shape=(1, 1, embed_size),
        initializer=TruncatedNormal(mean=0.0, stddev=0.0001, seed=self.seed))

    # Be sure to call this somewhere!
    super(BiasEncoding, self).build(input_shape)
def get_embedding(region_num, region_feature_dim_dict, base_feature_dim_dict,
                  bias_feature_dim_dict, init_std, seed, l2_reg_linear):
    region_embeddings = [[Embedding(feat.dimension, 1,
                                    embeddings_initializer=TruncatedNormal(stddev=init_std, seed=seed + j),
                                    embeddings_regularizer=l2(l2_reg_linear),
                                    name='region_emb_' + str(j) + '_' + str(i))
                          for i, feat in enumerate(region_feature_dim_dict['sparse'])]
                         for j in range(region_num)]
    base_embeddings = [[Embedding(feat.dimension, 1,
                                  embeddings_initializer=TruncatedNormal(stddev=init_std, seed=seed + j),
                                  embeddings_regularizer=l2(l2_reg_linear),
                                  name='base_emb_' + str(j) + '_' + str(i))
                        for i, feat in enumerate(base_feature_dim_dict['sparse'])]
                       for j in range(region_num)]
    bias_embedding = [Embedding(feat.dimension, 1,
                                embeddings_initializer=TruncatedNormal(stddev=init_std, seed=seed),
                                embeddings_regularizer=l2(l2_reg_linear),
                                name='embed_bias' + '_' + str(i))
                      for i, feat in enumerate(bias_feature_dim_dict['sparse'])]
    return region_embeddings, base_embeddings, bias_embedding
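For orientation, a minimal sketch of how this factory could be called. The `SparseFeat` namedtuple below is a hypothetical stand-in exposing only the `dimension` attribute the function reads; the real project defines its own feature classes.

from collections import namedtuple

# Hypothetical stand-in for the project's sparse-feature objects.
SparseFeat = namedtuple('SparseFeat', ['dimension'])
feature_dims = {'sparse': [SparseFeat(1000), SparseFeat(50)]}

# Assumes get_embedding (above) and its dependencies (Embedding,
# TruncatedNormal, l2) are already imported from Keras.
region_embs, base_embs, bias_embs = get_embedding(
    region_num=4,
    region_feature_dim_dict=feature_dims,
    base_feature_dim_dict=feature_dims,
    bias_feature_dim_dict=feature_dims,
    init_std=0.0001,
    seed=1024,
    l2_reg_linear=1e-5)

# region_embs and base_embs are region_num lists of per-feature Embedding
# layers; bias_embs is a single flat list.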
def build(self, input_shape):
    embedding_size = int(input_shape[0][-1])
    # Pack W_q, W_k and W_v into a single weight matrix.
    self.W = self.add_weight(name='Q_K_V',
                             shape=[embedding_size, self.num_units * 3],
                             dtype=tf.float32,
                             initializer=TruncatedNormal(seed=self.seed))
    self.W_output = self.add_weight(name='output_W',
                                    shape=[self.num_units, self.num_units],
                                    dtype=tf.float32,
                                    initializer=TruncatedNormal(seed=self.seed))
    self.layer_norm = LayerNormalization()
    self.dropout = Dropout(self.dropout_rate, seed=self.seed)
    self.seq_len_max = int(input_shape[0][1])
    super(MultiHeadAttention, self).build(input_shape)
def __init__(self,
             kind: str,
             n_units,
             n_layers=1,
             # It's not obvious how to compute fan_in/fan_out for these models,
             # so we recommend avoiding Glorot initialization for now.
             w_init=TruncatedNormal(stddev=0.05),
             recurrent_init=None,
             bidirectional=True,
             learn_initial_states: bool = False,
             lstm_bias=1,
             keep_recurrent: float = 1):
    if bidirectional is None or n_layers is None or n_units is None:
        raise ValueError()
    if kind not in ["GRU", "LSTM"]:
        raise ValueError()
    self._kind = kind
    self.keep_recurrent = keep_recurrent
    self.lstm_bias = lstm_bias
    self.n_units = n_units
    self.n_layers = n_layers
    self.bidirectional = bidirectional
    self.w_init = w_init
    self.recurrent_init = recurrent_init
    self.learn_initial_states = learn_initial_states
def get_initializer(initializer_params):
    if initializer_params["function"] == "truncated_normal":
        return TruncatedNormal(stddev=initializer_params["stddev"])
    elif initializer_params["function"] == "constant":
        return Constant(value=initializer_params["value"])
    else:
        return initializer_params["function"]
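A minimal usage sketch for the factory above. The configuration dictionaries are hypothetical; `TruncatedNormal` and `Constant` are assumed to come from Keras initializers, as in the snippet itself.

# Hypothetical configuration dicts; only the keys read by get_initializer
# ("function", "stddev", "value") matter here.
kernel_init = get_initializer({"function": "truncated_normal", "stddev": 0.02})
bias_init = get_initializer({"function": "constant", "value": 0.0})

# Anything else falls through unchanged, e.g. a Keras initializer identifier.
other_init = get_initializer({"function": "glorot_uniform"})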
def mvm(embeddings, factor_size):
    num_features = int(embeddings.shape.dims[1])
    bias = tf.get_variable("padding_bias", (num_features, factor_size),
                           initializer=TruncatedNormal(stddev=0.02))
    all_order = tf.add(embeddings, bias)
    out = all_order[:, 0, :]  # B x factor_size
    for i in range(1, num_features):
        out = tf.multiply(out, all_order[:, i, :])
    out = tf.reshape(out, shape=[-1, factor_size])
    return out
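For intuition, a tiny NumPy-only sketch of the interaction `mvm` computes (an element-wise product of the bias-shifted embedding of every feature). The sizes are made up and no TensorFlow state is involved; this mirrors the logic, it is not the function itself.

import numpy as np

# Made-up sizes: batch of 4, 3 features, factor size 8.
embeddings = np.random.randn(4, 3, 8)
bias = np.random.randn(3, 8) * 0.02          # plays the role of "padding_bias"
all_order = embeddings + bias

out = all_order[:, 0, :]
for i in range(1, all_order.shape[1]):
    out = out * all_order[:, i, :]           # element-wise product across features

print(out.shape)  # (4, 8)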
def __init__(self, n_units,
             n_layers=1,
             lstm_bias=1,
             w_init=TruncatedNormal(stddev=0.05),
             recurrent_init=None,
             bidirectional=True,
             learn_initial_states=False):
    super().__init__("LSTM", n_units, n_layers, w_init, recurrent_init,
                     bidirectional, learn_initial_states, lstm_bias)
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)

img_size = 28
img_size_flat = 784
img_shape = [28, 28]
img_shape_full = [28, 28, 1]
n_classes = 10
num_channels = 1

n_layers = 2
n_neurones = []
n_neurones.extend([484] * n_layers)
n_neurones.append(n_classes)

ini = TruncatedNormal(mean=0.0, stddev=0.1, seed=None)
optimizer = Adam(lr=1e-3)

model = Sequential()
model.add(InputLayer(input_shape=(img_size_flat,)))
for i in range(n_layers - 1):
    model.add(Dense(n_neurones[i],
                    kernel_initializer=ini,
                    bias_initializer=ini,
                    activation='relu'))
model.add(Reshape([22, 22, 1]))
model.add(Conv2D(kernel_size=5,
def main():
    parser = argparse.ArgumentParser("Train our ELMo model on SQuAD")
    parser.add_argument("output_dir")
    parser.add_argument("--dim", type=int, default=90)
    parser.add_argument("--l2", type=float, default=0)
    parser.add_argument("--mode", choices=["input", "output", "both", "none"],
                        default="both")
    parser.add_argument("--top_layer_only", action="store_true")
    args = parser.parse_args()

    out = args.output_dir + "-" + datetime.now().strftime("%m%d-%H%M%S")

    dim = args.dim
    recurrent_layer = CudnnGru(dim, w_init=TruncatedNormal(stddev=0.05))

    params = trainer.TrainParams(trainer.SerializableOptimizer("Adadelta", dict(learning_rate=1.0)),
                                 ema=0.999, max_checkpoints_to_keep=2,
                                 async_encoding=10, num_epochs=24,
                                 log_period=30, eval_period=1200, save_period=1200,
                                 best_weights=("dev", "b17/text-f1"),
                                 eval_samples=dict(dev=None, train=8000))

    lm_reduce = MapperSeq(
        ElmoLayer(args.l2, layer_norm=False, top_layer_only=args.top_layer_only),
        DropoutLayer(0.5),
    )

    model = AttentionWithElmo(
        encoder=DocumentAndQuestionEncoder(SingleSpanAnswerEncoder()),
        lm_model=SquadContextConcatSkip(),
        append_before_atten=(args.mode == "both" or args.mode == "output"),
        append_embed=(args.mode == "both" or args.mode == "input"),
        max_batch_size=128,
        word_embed=FixedWordEmbedder(vec_name="glove.840B.300d", word_vec_init_scale=0,
                                     learn_unk=False, cpu=True),
        char_embed=CharWordEmbedder(
            LearnedCharEmbedder(word_size_th=14, char_th=49, char_dim=20,
                                init_scale=0.05, force_cpu=True),
            MaxPool(Conv1d(100, 5, 0.8)),
            shared_parameters=True
        ),
        embed_mapper=SequenceMapperSeq(
            VariationalDropoutLayer(0.8),
            recurrent_layer,
            VariationalDropoutLayer(0.8),
        ),
        lm_reduce=None,
        lm_reduce_shared=lm_reduce,
        per_sentence=False,
        memory_builder=NullBiMapper(),
        attention=BiAttention(TriLinear(bias=True), True),
        match_encoder=SequenceMapperSeq(FullyConnected(dim * 2, activation="relu"),
                                        ResidualLayer(SequenceMapperSeq(
                                            VariationalDropoutLayer(0.8),
                                            recurrent_layer,
                                            VariationalDropoutLayer(0.8),
                                            StaticAttentionSelf(TriLinear(bias=True), ConcatWithProduct()),
                                            FullyConnected(dim * 2, activation="relu"),
                                        )),
                                        VariationalDropoutLayer(0.8)),
        predictor=BoundsPredictor(ChainBiMapper(
            first_layer=recurrent_layer,
            second_layer=recurrent_layer
        ))
    )

    batcher = ClusteredBatcher(45, ContextLenKey(), False, False)
    data = DocumentQaTrainingData(SquadCorpus(), None, batcher, batcher)

    with open(__file__, "r") as f:
        notes = f.read()
    notes = str(sorted(args.__dict__.items(), key=lambda x: x[0])) + "\n" + notes

    trainer.start_training(data, model, params,
                           [LossEvaluator(), SpanEvaluator(bound=[17], text_eval="squad")],
                           ModelDir(out), notes)