def __init__(self):
    super(Decoder, self).__init__()
    self.attention_network = Attention()
    # Decoder embedding, initialized from a normal distribution with the
    # configured truncated-normal std.
    self.embedding = nn.Embedding(
        config.vocab_size,
        config.emb_dim,
        weight_attr=paddle.ParamAttr(
            initializer=I.Normal(std=config.trunc_norm_init_std)))
    # Merge the previous context vector (2H) with the current input embedding.
    self.x_context = nn.Linear(config.hidden_dim * 2 + config.emb_dim,
                               config.emb_dim)
    self.lstm = nn.LSTM(
        config.emb_dim,
        config.hidden_dim,
        num_layers=1,
        direction='forward',
        weight_ih_attr=paddle.ParamAttr(
            initializer=I.Uniform(low=-config.rand_unif_init_mag,
                                  high=config.rand_unif_init_mag)),
        bias_ih_attr=paddle.ParamAttr(initializer=I.Constant(value=0.0)))
    if config.pointer_gen:
        # p_gen input is [context (2H); decoder h and c states (2H); x].
        self.p_gen_linear = nn.Linear(
            config.hidden_dim * 4 + config.emb_dim, 1)
    # Two-stage output projection: [lstm_out (H); context (2H)] -> H -> vocab.
    self.out1 = nn.Linear(config.hidden_dim * 3, config.hidden_dim)
    self.out2 = nn.Linear(
        config.hidden_dim,
        config.vocab_size,
        weight_attr=paddle.ParamAttr(
            initializer=I.Normal(std=config.trunc_norm_init_std)))
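# A minimal sketch (not from the source) of how a decoder step might use
# p_gen_linear in a pointer-generator model. The names `c_t` (attention
# context, 2H), `s_t_hat` (concatenated LSTM h and c states, 2H) and `x`
# (emb_dim) are assumptions matching the layer's 4H + emb_dim input size;
# assumes `import paddle.nn.functional as F`.
def _pointer_gen_step_sketch(self, c_t, s_t_hat, x):
    p_gen_input = paddle.concat([c_t, s_t_hat, x], axis=1)
    # Probability of generating from the vocabulary vs. copying from source.
    p_gen = F.sigmoid(self.p_gen_linear(p_gen_input))
    return p_gen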
def __init__(self, input_dim, num_layers):
    super(Highway, self).__init__()
    self._num_layers = num_layers
    self._highway_layers = []
    for i in range(num_layers):
        # Carry gate: the -2.0 bias makes the gate small at first, so each
        # layer initially passes its input through mostly unchanged.
        paramAttr = paddle.ParamAttr(
            initializer=I.Normal(mean=0.0, std=1.0 / np.sqrt(input_dim)))
        paramAttr_b = paddle.ParamAttr(initializer=I.Constant(value=-2.0))
        carry_linear = nn.Linear(input_dim,
                                 input_dim,
                                 weight_attr=paramAttr,
                                 bias_attr=paramAttr_b)
        self.add_sublayer('carry_linear_{}'.format(i), carry_linear)
        paramAttr = paddle.ParamAttr(
            initializer=I.Normal(mean=0.0, std=1.0 / np.sqrt(input_dim)))
        transform_linear = nn.Linear(input_dim,
                                     input_dim,
                                     weight_attr=paramAttr)
        self.add_sublayer('transform_linear_{}'.format(i), transform_linear)
        self._highway_layers.append([carry_linear, transform_linear])
    self._relu = nn.ReLU()
    self._sigmoid = nn.Sigmoid()
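# A minimal sketch (assumed, not from the source) of the matching Highway
# forward pass: each layer gates between a ReLU-transformed input and the
# raw input, using the carry_linear/transform_linear pairs built above.
def _highway_forward_sketch(self, x):
    for carry_linear, transform_linear in self._highway_layers:
        gate = self._sigmoid(carry_linear(x))
        transformed = self._relu(transform_linear(x))
        # gate starts near sigmoid(-2.0) ~= 0.12, so x mostly carries through.
        x = gate * transformed + (1.0 - gate) * x
    return x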
def __init__(self):
    super(ReduceState, self).__init__()
    # Project the concatenated bidirectional encoder states (2H) down to the
    # single-direction decoder size (H), separately for h and c.
    self.reduce_h = nn.Linear(
        config.hidden_dim * 2,
        config.hidden_dim,
        weight_attr=paddle.ParamAttr(
            initializer=I.Normal(std=config.trunc_norm_init_std)))
    self.reduce_c = nn.Linear(
        config.hidden_dim * 2,
        config.hidden_dim,
        weight_attr=paddle.ParamAttr(
            initializer=I.Normal(std=config.trunc_norm_init_std)))
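# A minimal sketch (assumed shapes and names, not from the source) of how
# ReduceState might be applied: flatten the forward/backward final LSTM
# states into (batch, 2H), project with ReLU, and restore a layer axis.
# Assumes `import paddle.nn.functional as F`.
def _reduce_state_sketch(self, hidden):
    h, c = hidden  # each assumed (num_directions, batch, hidden_dim)
    h_in = paddle.transpose(h, [1, 0, 2]).reshape([-1, config.hidden_dim * 2])
    c_in = paddle.transpose(c, [1, 0, 2]).reshape([-1, config.hidden_dim * 2])
    hidden_reduced_h = F.relu(self.reduce_h(h_in))
    hidden_reduced_c = F.relu(self.reduce_c(c_in))
    # Leading axis of 1 for the single-layer unidirectional decoder LSTM.
    return (paddle.unsqueeze(hidden_reduced_h, 0),
            paddle.unsqueeze(hidden_reduced_c, 0))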
def __init__(self):
    super(Encoder, self).__init__()
    # Initialized embeddings
    self.embedding = nn.Embedding(
        config.vocab_size,
        config.emb_dim,
        weight_attr=paddle.ParamAttr(
            initializer=I.Normal(std=config.trunc_norm_init_std)))
    # Initialized lstm weights
    self.lstm = nn.LSTM(
        config.emb_dim,
        config.hidden_dim,
        num_layers=1,
        direction='bidirect',
        weight_ih_attr=paddle.ParamAttr(
            initializer=I.Uniform(low=-config.rand_unif_init_mag,
                                  high=config.rand_unif_init_mag)),
        bias_ih_attr=paddle.ParamAttr(initializer=I.Constant(value=0.0)))
    # Initialized linear weights
    self.W_h = nn.Linear(config.hidden_dim * 2,
                         config.hidden_dim * 2,
                         bias_attr=False)
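# A minimal sketch (assumed, not from the source) of the Encoder forward:
# embed the tokens, run the bidirectional LSTM, then precompute attention
# features by applying W_h to the flattened encoder outputs.
def _encoder_forward_sketch(self, input_ids, seq_lens):
    embedded = self.embedding(input_ids)
    encoder_outputs, hidden = self.lstm(embedded, sequence_length=seq_lens)
    # (batch, seq_len, 2H) -> (batch * seq_len, 2H) for attention scoring.
    encoder_feature = encoder_outputs.reshape([-1, config.hidden_dim * 2])
    encoder_feature = self.W_h(encoder_feature)
    return encoder_outputs, encoder_feature, hidden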
def _init_weights(self, m):
    if isinstance(m, nn.Linear):
        # Truncated-normal weights; zero bias when present.
        trunc_normal_(m.weight)
        if m.bias is not None:
            zeros_(m.bias)
    elif isinstance(m, nn.LayerNorm):
        zeros_(m.bias)
        ones_(m.weight)
    elif isinstance(m, nn.Conv2D):
        # Kaiming-style fan-out normal initialization for convolutions.
        fan_out = m._kernel_size[0] * m._kernel_size[1] * m._out_channels
        fan_out //= m._groups
        paddle_init.Normal(0, math.sqrt(2.0 / fan_out))(m.weight)
        if m.bias is not None:
            zeros_(m.bias)
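# Typical usage (an assumption, not shown in the source): apply the
# initializer recursively to every sublayer at the end of the model's
# __init__, via paddle.nn.Layer.apply:
#
#     self.apply(self._init_weights)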
def __init__(self, char_vocab_size, char_embed_dim, projection_dim,
             num_highways, cnn_filters, max_characters_per_token):
    super(ELMoCharacterEncoderLayer, self).__init__()
    self._use_highway = (num_highways > 0)
    self._n_filters = sum(f[1] for f in cnn_filters)
    self._use_proj = (self._n_filters != projection_dim)

    paramAttr = paddle.ParamAttr(initializer=I.Uniform(low=-1.0, high=1.0))
    self._char_embedding_layer = nn.Embedding(
        num_embeddings=char_vocab_size,
        embedding_dim=char_embed_dim,
        weight_attr=paramAttr)
    # Zero out the embedding of the padding character (index 0).
    self._char_embedding_layer.weight[0, :] = 0

    # One (conv, max-pool) pair per filter width; pooling collapses the
    # character axis so each filter bank yields a fixed-size feature.
    self._convolution_layers = []
    for i, (width, num) in enumerate(cnn_filters):
        paramAttr = paddle.ParamAttr(
            initializer=I.Uniform(low=-0.05, high=0.05))
        conv2d = nn.Conv2D(in_channels=char_embed_dim,
                           out_channels=num,
                           kernel_size=(1, width),
                           padding='Valid',
                           data_format='NHWC',
                           weight_attr=paramAttr)
        max_pool = nn.MaxPool2D(
            kernel_size=(1, max_characters_per_token - width + 1),
            stride=(1, 1),
            padding='Valid',
            data_format='NHWC')
        self.add_sublayer('cnn_layer_{}'.format(i), conv2d)
        self.add_sublayer('maxpool_layer_{}'.format(i), max_pool)
        self._convolution_layers.append([width, conv2d, max_pool])
    self._relu = nn.ReLU()

    if self._use_highway:
        self._highway_layer = Highway(self._n_filters, num_highways)
    if self._use_proj:
        # Project the concatenated filter outputs down to projection_dim.
        paramAttr = paddle.ParamAttr(initializer=I.Normal(
            mean=0.0, std=1.0 / np.sqrt(self._n_filters)))
        self._linear_layer = nn.Linear(self._n_filters,
                                       projection_dim,
                                       weight_attr=paramAttr)
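# A minimal sketch (assumed shapes, not from the source) of how the layers
# built above are consumed: embed the character ids, run each conv + pool
# pair over the character axis, concatenate the per-width features, then
# apply the optional highway and projection stages.
def _char_cnn_forward_sketch(self, char_ids):
    # char_ids assumed (batch, num_tokens, max_characters_per_token)
    x = self._char_embedding_layer(char_ids)  # NHWC: chars as W, embed as C
    pooled = []
    for width, conv2d, max_pool in self._convolution_layers:
        y = self._relu(conv2d(x))   # (B, T, max_chars - width + 1, num)
        pooled.append(max_pool(y))  # (B, T, 1, num)
    token_embedding = paddle.squeeze(paddle.concat(pooled, axis=-1), axis=2)
    if self._use_highway:
        token_embedding = self._highway_layer(token_embedding)
    if self._use_proj:
        token_embedding = self._linear_layer(token_embedding)
    return token_embedding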
def test_normal_initializer_default_value(self):
    """Test the normal initializer with default values."""
    paddle.enable_static()
    program = framework.Program()
    block = program.global_block()
    for _ in range(2):
        block.create_parameter(dtype="float32",
                               shape=[5, 10],
                               lod_level=0,
                               name="param",
                               initializer=initializer.Normal())
    self.assertEqual(len(block.ops), 1)
    init_op = block.ops[0]
    self.assertEqual(init_op.type, 'gaussian_random')
    self.assertAlmostEqual(init_op.attr('mean'), 0.0, delta=DELTA)
    self.assertAlmostEqual(init_op.attr('std'), 1.0, delta=DELTA)
    self.assertEqual(init_op.attr('seed'), 0)
    paddle.disable_static()
def __init__(self,
             batch_size=None,
             char_embed_dim=16,
             projection_dim=512,
             vocab_size=None,
             cnn_filters=[[1, 32], [2, 32], [3, 64], [4, 128], [5, 256],
                          [6, 512], [7, 1024]],
             char_vocab_size=262,
             max_characters_per_token=50,
             num_highways=2,
             num_layers=2,
             dropout=0.1,
             task='pre-train'):
    super(ELMo, self).__init__()
    if task == 'pre-train':
        if vocab_size is None or batch_size is None:
            raise ValueError(
                'vocab_size and batch_size should be set when task="pre-train"'
            )
    elif task == 'fine-tune':
        if batch_size is None:
            batch_size = 128
    else:
        raise ValueError('task should be "pre-train" or "fine-tune"')
    self._projection_dim = projection_dim
    self._task = task
    # Character-CNN token encoder feeding the bidirectional language model.
    self._token_embding_layer = ELMoCharacterEncoderLayer(
        char_vocab_size, char_embed_dim, projection_dim, num_highways,
        cnn_filters, max_characters_per_token)
    self._elmobilm = ELMoBiLM(batch_size, projection_dim, projection_dim,
                              num_layers, dropout, task)
    if task == 'pre-train':
        # Softmax projection to the word vocabulary, pre-training only.
        paramAttr = paddle.ParamAttr(initializer=I.Normal(
            mean=0.0, std=1.0 / np.sqrt(projection_dim)))
        self._linear_layer = nn.Linear(projection_dim,
                                       vocab_size,
                                       weight_attr=paramAttr)
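# Example construction (illustrative argument values, not from the source):
# per the checks above, pre-training requires both vocab_size and batch_size,
# while fine-tuning falls back to a default batch_size of 128.
#
#     elmo = ELMo(batch_size=32, vocab_size=50000, task='pre-train')
#     elmo_ft = ELMo(task='fine-tune')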
def test_normal_initializer(self, dtype="float32"): """Test normal initializer with supplied attributes """ paddle.enable_static() program = framework.Program() block = program.global_block() for _ in range(2): block.create_parameter(dtype=dtype, shape=[5, 10], lod_level=0, name="param", initializer=initializer.Normal(2.3, 1.9)) num_ops = 2 if dtype in ["float16", "uint16"] else 1 self.assertEqual(len(block.ops), num_ops) init_op = block.ops[0] self.assertEqual(init_op.type, 'gaussian_random') self.assertAlmostEqual(init_op.attr('mean'), 2.3, delta=DELTA) self.assertAlmostEqual(init_op.attr('std'), 1.9, delta=DELTA) paddle.disable_static() return block
def _get_default_param_initializer(channels):
    # Kaiming-style (fan-out) normal initializer; `self._kernel_size` is
    # taken from the enclosing layer's scope, so this is a closure defined
    # inside a layer method rather than a free function.
    filter_elem_num = np.prod(self._kernel_size) * channels
    std = (2.0 / filter_elem_num)**0.5
    return I.Normal(0.0, std)
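# A minimal usage sketch (an assumption, not from the source) of how such a
# helper is typically consumed: wrap the returned initializer in a ParamAttr
# when building the convolution weights.
#
#     weight_attr = paddle.ParamAttr(
#         initializer=_get_default_param_initializer(in_channels))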