def __init__(self, width: int, input_size: int, hidden_size: int,
             n_layers: int, n_highway: int,
             use_position: bool = False, dropout: float = 0.0):
    super(LBLHighwayBiLmV2, self).__init__()
    self.use_position = use_position
    self.n_layers = n_layers
    self.n_highway = n_highway
    self.dropout = torch.nn.Dropout(p=dropout)
    self.width = width
    self.input_size = input_size
    self.hidden_size = hidden_size

    forward_scores, backward_scores = [], []
    forward_blocks, backward_blocks = [], []
    for _ in range(n_layers):
        forward_scores.append(torch.nn.Parameter(torch.randn(width + 1)))
        backward_scores.append(torch.nn.Parameter(torch.randn(width + 1)))
        forward_blocks.append(Highway(hidden_size, num_layers=n_highway))
        backward_blocks.append(Highway(hidden_size, num_layers=n_highway))

    self.forward_weights = torch.nn.ParameterList(forward_scores)
    self.backward_weights = torch.nn.ParameterList(backward_scores)
    self.forward_blocks = torch.nn.ModuleList(forward_blocks)
    self.backward_blocks = torch.nn.ModuleList(backward_blocks)

    if self.use_position:
        self.position = PositionalEncoding(hidden_size)

def define_embedding(self, H, char_arr, rel_arr, def_arr):
    """Define the embedding for the different methods."""
    if H.joint_emb is not None:
        self._jdrop = nn.Dropout(H.joint_dropout if self.use_dropout else 0)

    if H.char_emb or H.cnnsoftmax:
        self.char_arr = torch.LongTensor(char_arr).cuda()
        self.rel_arr, self.def_arr = None, None

        self._char_emb = nn.Embedding(262, H.char_emsize).cuda()
        self._char_network = nn.ModuleList()
        self._char_network.append(nn.Conv1d(H.char_emsize, 32, 1, stride=(1,)).cuda())
        self._char_network.append(nn.Conv1d(H.char_emsize, 32, 2, stride=(1,)).cuda())
        self._char_network.append(nn.Conv1d(H.char_emsize, 64, 3, stride=(2,)).cuda())
        self._char_network.append(nn.Conv1d(H.char_emsize, 128, 4, stride=(3,)).cuda())
        self._char_network.append(nn.Conv1d(H.char_emsize, 256, 5, stride=(4,)).cuda())
        self._char_network.append(nn.Conv1d(H.char_emsize, 512, 6, stride=(5,)).cuda())
        self._conv_activation = getattr(torch.nn.functional, H.char_activation)
        if not H.char_nohighways:
            self._char_highways = Highway(1024, H.hdepth, activation=self._conv_activation)
        self._char_linear = nn.Linear(1024, H.emsize, bias=False)

        nforms = 1
        if rel_arr:
            self.rel_arr = self.coverage_filter(torch.LongTensor(rel_arr).cuda())
            nforms += 1
        if def_arr:
            self.def_arr = self.coverage_filter(torch.LongTensor(def_arr).cuda())
            nforms += 1
        self.rel_exist = self.rel_arr is not None
        self.def_exist = self.def_arr is not None
        self.nforms = H.nforms = nforms

        if H.defenc == "lstm":
            if def_arr:
                defsize = self.def_arr.shape[1]
                def_h = torch.zeros(H.hdepth, defsize, H.emsize).cuda()
                self.def_hid = (def_h, def_h)
            if rel_arr:
                relsize = self.rel_arr.shape[1]
                rel_h = torch.zeros(H.hdepth, relsize, H.emsize).cuda()
                self.rel_hid = (rel_h, rel_h)
            self._definition_network = torch.nn.LSTM(H.emsize, H.emsize, num_layers=H.hdepth)
        elif H.defenc == "highway":
            self._definition_network = Highway(H.emsize, H.hdepth, activation=self._conv_activation)

        if H.combine == "concat":
            self._comb_lin = nn.Linear(H.emsize * H.nforms, H.emsize, bias=True)

    if H.cnnsoftmax or H.char_emb:
        if H.cnnsoftmax:
            self._lookup = nn.Embedding(H.ntoken, H.emsize)
            if H.cnncorr:
                self._cnnsoftmax_correction = nn.Linear(H.cnncorr, H.ntoken, bias=False)
                self._cnnsoftmax_M = nn.Linear(H.cnncorr, H.emsize, bias=False)
    else:
        self._lookup = nn.Embedding(H.ntoken, H.emsize)

def __init__(self, width: int, input_size: int, hidden_size: int,
             n_heads: int, n_layers: int, n_highway: int,
             use_position: bool = False, use_relative_position: bool = False,
             dropout: float = 0.0):
    super(SelfAttentiveLBLBiLMV3, self).__init__()
    self.use_position = use_position
    self.use_relative_position_weights = use_relative_position
    self.n_layers = n_layers
    self.n_highway = n_highway
    self.n_heads = n_heads
    self.input_size = input_size
    self.width = width
    self.hidden_size = hidden_size

    forward_attns, backward_attns = [], []
    forward_blocks, backward_blocks = [], []
    for _ in range(n_layers):
        if self.use_relative_position_weights:
            forward_attn = MultiHeadedAttentionWithRelativePositionEmbeddings(
                n_heads, hidden_size, width=width + 1, left_to_right=True, dropout=dropout)
            backward_attn = MultiHeadedAttentionWithRelativePositionEmbeddings(
                n_heads, hidden_size, width=width + 1, left_to_right=False, dropout=dropout)
        else:
            forward_attn = MultiHeadedAttention(n_heads, hidden_size, dropout)
            backward_attn = MultiHeadedAttention(n_heads, hidden_size, dropout)
        forward_attns.append(forward_attn)
        backward_attns.append(backward_attn)
        forward_blocks.append(Highway(hidden_size, n_highway))
        backward_blocks.append(Highway(hidden_size, n_highway))

    self.forward_attns = torch.nn.ModuleList(forward_attns)
    self.backward_attns = torch.nn.ModuleList(backward_attns)
    self.forward_blocks = torch.nn.ModuleList(forward_blocks)
    self.backward_blocks = torch.nn.ModuleList(backward_blocks)

    if self.use_position:
        self.position = PositionalEncoding(hidden_size)

def __init__(self, width: int, input_size: int, hidden_size: int,
             n_layers: int, n_highway: int,
             use_position: bool = False, dropout: float = 0.0):
    super(Bengio03HighwayBiLmV2, self).__init__()
    self.use_position = use_position
    self.n_layers = n_layers
    self.n_highway = n_highway
    self.dropout = torch.nn.Dropout(p=dropout)
    self.activation = torch.nn.ReLU()
    self.width = width
    self.input_size = input_size
    self.context_input_size = input_size * (width + 1)
    self.hidden_size = hidden_size

    self.forward_paddings = torch.nn.ModuleList([
        torch.nn.ConstantPad2d((0, 0, length, 0), 0) for length in range(width + 1)
    ])
    self.backward_paddings = torch.nn.ModuleList([
        torch.nn.ConstantPad2d((0, 0, 0, length), 0) for length in range(width + 1)
    ])

    forward_blocks = []
    backward_blocks = []
    for layer_index in range(self.n_layers):
        forward_layer = torch.nn.ModuleList([
            torch.nn.Linear(input_size, hidden_size, bias=False) for _ in range(width + 1)
        ])
        backward_layer = torch.nn.ModuleList([
            torch.nn.Linear(input_size, hidden_size, bias=False) for _ in range(width + 1)
        ])
        self.add_module('forward_layer_{}'.format(layer_index), forward_layer)
        self.add_module('backward_layer_{}'.format(layer_index), backward_layer)

        forward_blocks.append(Highway(hidden_size, num_layers=n_highway))
        backward_blocks.append(Highway(hidden_size, num_layers=n_highway))

    self.forward_blocks = torch.nn.ModuleList(forward_blocks)
    self.backward_blocks = torch.nn.ModuleList(backward_blocks)

    if self.use_position:
        self.position = PositionalEncoding(hidden_size)

def __init__(self, width: int, input_size: int, hidden_size: int,
             n_layers: int, n_highway: int,
             use_position: bool = False, dropout: float = 0.0):
    super(Bengio03HighwayBiLm, self).__init__()
    self.use_position = use_position
    self.n_layers = n_layers
    self.n_highway = n_highway
    self.dropout = torch.nn.Dropout(p=dropout)
    self.activation = torch.nn.ReLU()
    self.width = width
    self.input_size = input_size
    self.context_input_size = input_size * (width + 1)
    self.hidden_size = hidden_size

    forward_paddings, backward_paddings = [], []
    forward_blocks, backward_blocks = [], []
    forward_projects, backward_projects = [], []
    for i in range(n_layers):
        forward_paddings.append(torch.nn.Parameter(torch.randn(width, hidden_size)))
        backward_paddings.append(torch.nn.Parameter(torch.randn(width, hidden_size)))
        forward_blocks.append(Highway(hidden_size, num_layers=n_highway))
        backward_blocks.append(Highway(hidden_size, num_layers=n_highway))
        forward_projects.append(torch.nn.Linear(self.context_input_size, hidden_size))
        backward_projects.append(torch.nn.Linear(self.context_input_size, hidden_size))

    self.forward_projects = torch.nn.ModuleList(forward_projects)
    self.backward_projects = torch.nn.ModuleList(backward_projects)
    self.forward_paddings = torch.nn.ParameterList(forward_paddings)
    self.backward_paddings = torch.nn.ParameterList(backward_paddings)
    self.forward_blocks = torch.nn.ModuleList(forward_blocks)
    self.backward_blocks = torch.nn.ModuleList(backward_blocks)

    if self.use_position:
        self.position = PositionalEncoding(hidden_size)

    self.reset_parameters()

def _load_highway(self):
    # pylint: disable=protected-access
    # the highway layers have the same dimensionality as the number of cnn filters
    cnn_options = self._options['char_cnn']
    filters = cnn_options['filters']
    n_filters = sum(f[1] for f in filters)
    n_highway = cnn_options['n_highway']

    # create the layers, and load the weights
    self._highways = Highway(n_filters, n_highway, activation=torch.nn.functional.relu)
    for k in range(n_highway):
        # The AllenNLP highway is one matrix multiplication with concatenation of
        # transform and carry weights.
        with h5py.File(cached_path(self._weight_file), 'r') as fin:
            # The weights are transposed due to multiplication order assumptions in tf
            # vs pytorch (tf.matmul(X, W) vs pytorch.matmul(W, X))
            w_transform = numpy.transpose(
                fin['CNN_high_{}'.format(k)]['W_transform'][...])
            # -1.0 since AllenNLP is g * x + (1 - g) * f(x) but tf is (1 - g) * x + g * f(x)
            w_carry = -1.0 * numpy.transpose(
                fin['CNN_high_{}'.format(k)]['W_carry'][...])
            weight = numpy.concatenate([w_transform, w_carry], axis=0)
            self._highways._layers[k].weight.data.copy_(torch.FloatTensor(weight))
            self._highways._layers[k].weight.requires_grad = False

            b_transform = fin['CNN_high_{}'.format(k)]['b_transform'][...]
            b_carry = -1.0 * fin['CNN_high_{}'.format(k)]['b_carry'][...]
            bias = numpy.concatenate([b_transform, b_carry], axis=0)
            self._highways._layers[k].bias.data.copy_(torch.FloatTensor(bias))
            self._highways._layers[k].bias.requires_grad = False

def __init__(self, output_dim: int,
             word_embedder: Embeddings,
             char_embedder: Embeddings,
             filters: List[Tuple[int, int]],
             n_highway: int,
             activation: str):
    super(ConvTokenEmbedder, self).__init__(output_dim, word_embedder, char_embedder)

    self.emb_dim = 0
    if word_embedder is not None:
        self.emb_dim += word_embedder.n_d

    if char_embedder is not None:
        self.convolutions = []
        char_embed_dim = char_embedder.n_d
        for i, (width, num) in enumerate(filters):
            conv = torch.nn.Conv1d(in_channels=char_embed_dim,
                                   out_channels=num,
                                   kernel_size=width,
                                   bias=True)
            self.convolutions.append(conv)
        self.convolutions = torch.nn.ModuleList(self.convolutions)

        self.n_filters = sum(f[1] for f in filters)
        self.n_highway = n_highway
        self.highways = Highway(self.n_filters, self.n_highway,
                                activation=Activation.by_name("relu")())
        self.emb_dim += self.n_filters

    self.activation = Activation.by_name(activation)()
    self.projection = torch.nn.Linear(self.emb_dim, self.output_dim, bias=True)

class CNNClassifier(nn.Module):
    """CNN text classifier over a sequence of word embeddings."""

    def __init__(self, num_class, input_dim, kernel_nums, kernel_sizes: list,
                 max_kernel_size=50, dropout_rate=0.5):
        super().__init__()
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1, out_channels=num, kernel_size=(width, input_dim))
            for (num, width) in zip(kernel_nums, kernel_sizes)
        ])
        # self.bias = [nn.Parameter(torch.zeros())]
        self.highway_layer = Highway(input_dim=sum(kernel_nums), num_layers=1)
        self.dropout_layer = nn.Dropout(dropout_rate)
        self.feedforward_layer = nn.Linear(sum(kernel_nums), num_class)
        self.max_kernel_size = max_kernel_size

    def forward(self, x):
        # x : [batch size, seq len, input dim]
        if x.size(1) < self.max_kernel_size:
            pd = [0, 0, 0, self.max_kernel_size - x.size(1)]
            # [batch size, max seq len, input dim]
            x = f.pad(x, pd, 'constant', 0)
        # x : [batch size, 1, max seq len, input dim]
        x = x.unsqueeze(1)
        # each element: [batch size, kernel num, max seq len - width + 1]
        x = [torch.relu(conv(x).squeeze(-1)) for conv in self.convs]
        # x = [torch.max_pool1d(x_, x_.size(-1)).squeeze(-1) for x_ in x]
        x = [torch.avg_pool1d(x_, x_.size(-1)).squeeze(-1) for x_ in x]
        # [batch size, sum(kernel_nums)]
        x = torch.cat(x, dim=-1)
        x = self.highway_layer(x)
        # [batch size, num_class]
        logit = torch.log_softmax(
            self.feedforward_layer(self.dropout_layer(x)), -1)
        return logit

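# Usage sketch for CNNClassifier (illustrative, not part of the original source).
# It assumes the module-level imports used by the class are present
# (torch, torch.nn as nn, torch.nn.functional as f) and that a Highway
# implementation with the signature Highway(input_dim, num_layers) is importable;
# all dimensions below are arbitrary placeholders.
if __name__ == "__main__":
    model = CNNClassifier(
        num_class=5,                   # number of target labels
        input_dim=300,                 # word-embedding dimension
        kernel_nums=[100, 100, 100],   # feature maps per kernel width
        kernel_sizes=[3, 4, 5],        # kernel widths over the sequence axis
    )
    # [batch size, seq len, input dim]; sequences shorter than max_kernel_size
    # are zero-padded inside forward()
    dummy = torch.randn(8, 20, 300)
    log_probs = model(dummy)
    print(log_probs.shape)  # torch.Size([8, 5]), log-softmax scores per class
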
def __init__(self, output_dim: int,
             embeddings: Embeddings,
             filters: List[Tuple[int, int]],
             n_highway: int,
             activation: str,
             use_cuda: bool,
             input_field_name: str = None):
    super(ConvTokenEmbedder, self).__init__(input_field_name)
    self.embeddings = embeddings
    self.output_dim = output_dim
    self.use_cuda = use_cuda
    self.filters = filters

    convolutions = []
    for i, (width, num) in enumerate(filters):
        conv = torch.nn.Conv1d(in_channels=embeddings.n_d,
                               out_channels=num,
                               kernel_size=width,
                               bias=True)
        convolutions.append(conv)
    self.convolutions = torch.nn.ModuleList(convolutions)

    self.n_filters = sum(f[1] for f in filters)
    self.n_highway = n_highway
    self.highways = Highway(self.n_filters, self.n_highway,
                            activation=torch.nn.functional.relu)

    self.activation = Activation.by_name(activation)()
    self.projection = torch.nn.Linear(self.n_filters, output_dim, bias=True)
    self.reset_parameters()

def __init__(
    self,
    embedding_dim: int,
    filters: Sequence[Sequence[int]],
    num_highway: int,
    projection_dim: int,
    activation: str = "relu",
    projection_location: str = "after_highway",
    do_layer_norm: bool = False,
) -> None:
    super().__init__()

    if projection_location not in _VALID_PROJECTION_LOCATIONS:
        raise ConfigurationError(f"unknown projection location: {projection_location}")

    self.input_dim = embedding_dim
    self.output_dim = projection_dim
    self._projection_location = projection_location

    if activation == "tanh":
        self._activation = torch.nn.functional.tanh
    elif activation == "relu":
        self._activation = torch.nn.functional.relu
    else:
        raise ConfigurationError(f"unknown activation {activation}")

    # Create the convolutions
    self._convolutions: List[torch.nn.Module] = []
    for i, (width, num) in enumerate(filters):
        conv = torch.nn.Conv1d(in_channels=embedding_dim,
                               out_channels=num,
                               kernel_size=width,
                               bias=True)
        conv.weight.data.uniform_(-0.05, 0.05)
        conv.bias.data.fill_(0.0)
        self.add_module(f"char_conv_{i}", conv)  # needs to match the old ELMo name
        self._convolutions.append(conv)

    # Create the highway layers
    num_filters = sum(num for _, num in filters)
    if projection_location == "after_cnn":
        highway_dim = projection_dim
    else:
        # highway_dim is the number of cnn filters
        highway_dim = num_filters
    self._highways = Highway(highway_dim, num_highway, activation=torch.nn.functional.relu)
    for highway_layer in self._highways._layers:
        # highway is a linear layer for each highway layer
        # with fused W and b weights
        highway_layer.weight.data.normal_(mean=0.0, std=np.sqrt(1.0 / highway_dim))
        highway_layer.bias[:highway_dim].data.fill_(0.0)
        highway_layer.bias[highway_dim:].data.fill_(2.0)

    # Projection layer: always num_filters -> projection_dim
    self._projection = torch.nn.Linear(num_filters, projection_dim, bias=True)
    self._projection.weight.data.normal_(mean=0.0, std=np.sqrt(1.0 / num_filters))
    self._projection.bias.data.fill_(0.0)

    # And add a layer norm
    if do_layer_norm:
        self._layer_norm: Callable = LayerNorm(self.output_dim)
    else:
        self._layer_norm = lambda tensor: tensor

def __init__(self, width: int, input_size: int, hidden_size: int,
             n_heads: int, n_layers: int, n_highway: int,
             use_position: bool = False, use_relative_position: bool = False,
             dropout: float = 0.0):
    super(SelfAttentiveLBLBiLM, self).__init__()
    self.use_position = use_position
    self.use_relative_position_weights = use_relative_position
    self.n_layers = n_layers
    self.n_highway = n_highway
    self.n_heads = n_heads
    self.input_size = input_size
    self.width = width
    self.hidden_size = hidden_size

    forward_attns, backward_attns = [], []
    forward_paddings, backward_paddings = [], []
    forward_blocks, backward_blocks = [], []
    forward_weights, backward_weights = [], []
    for _ in range(n_layers):
        forward_attns.append(MultiHeadedAttention(n_heads, hidden_size, dropout))
        backward_attns.append(MultiHeadedAttention(n_heads, hidden_size, dropout))
        forward_paddings.append(
            torch.nn.Parameter(torch.randn(width, hidden_size) / np.sqrt(hidden_size)))
        backward_paddings.append(
            torch.nn.Parameter(torch.randn(width, hidden_size) / np.sqrt(hidden_size)))
        forward_blocks.append(Highway(hidden_size, n_highway))
        backward_blocks.append(Highway(hidden_size, n_highway))
        if self.use_relative_position_weights:
            forward_weights.append(torch.nn.Parameter(torch.randn(width + 1)))
            backward_weights.append(torch.nn.Parameter(torch.randn(width + 1)))

    self.forward_attns = torch.nn.ModuleList(forward_attns)
    self.backward_attns = torch.nn.ModuleList(backward_attns)
    self.forward_paddings = torch.nn.ParameterList(forward_paddings)
    self.backward_paddings = torch.nn.ParameterList(backward_paddings)
    self.forward_blocks = torch.nn.ModuleList(forward_blocks)
    self.backward_blocks = torch.nn.ModuleList(backward_blocks)
    if self.use_relative_position_weights:
        self.forward_weights = torch.nn.ParameterList(forward_weights)
        self.backward_weights = torch.nn.ParameterList(backward_weights)
    if self.use_position:
        self.position = PositionalEncoding(hidden_size)