def __init__(self, args, n_labels, default_lstm_init=False, **arg_dict):
    super(Network, self).__init__()

    # token-level feature extraction (embeddings, character features, etc.)
    self.feature_extractor = FeatureExtractionModule(
        args,
        default_lstm_init=default_lstm_init,
        **arg_dict)
    w_input_dim = self.feature_extractor.output_dim

    # sentence encoder
    self.bilstm = BiLSTM(
        w_input_dim,
        args.lstm_dim,
        num_layers=args.lstm_layers,
        dropout=args.lstm_dropout)
    self.bilstm_dropout = SharedDropout(p=args.lstm_dropout)

    # labelled biaffine scorer over pairs of encoder states
    self.label_weights = BatchedBiaffine(
        input_dim=args.lstm_dim * 2,
        proj_dim=args.label_proj_dim,
        n_labels=n_labels,
        activation="leaky_relu",
        dropout=args.mlp_dropout,
        output_bias=False)
    # unlabelled span scorer (a single score per pair of positions)
    self.span_weights = BatchedBiaffine(
        input_dim=args.lstm_dim * 2,
        proj_dim=args.span_proj_dim,
        n_labels=1,
        activation="leaky_relu",
        dropout=args.mlp_dropout,
        bias_y=True,  # should be False but maybe there is a bug...
        output_bias=False)
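# Sketch of how the modules above are presumably combined in the forward pass
# (illustration only, not part of the original source; the call signatures of
# FeatureExtractionModule and BatchedBiaffine are assumptions):
#
#   feats = self.feature_extractor(batch)             # (batch, n words, w_input_dim)
#   packed, _ = self.bilstm(pack_padded_sequence(feats, lengths,
#                                                batch_first=True,
#                                                enforce_sorted=False))
#   h, _ = pad_packed_sequence(packed, batch_first=True)
#   h = self.bilstm_dropout(h)                        # (batch, n words, lstm_dim * 2)
#   label_scores = self.label_weights(h, h)           # one score per (i, j, label)
#   span_scores = self.span_weights(h, h)             # one score per (i, j)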
def layer_forward(self, x, hx, cell, batch_sizes, reverse=False):
    # x: list of per-time-step tensors from a packed sequence;
    # batch_sizes[t] is the number of sequences still active at step t.
    hx_0 = hx_i = hx
    hx_n, output = [], []
    steps = reversed(range(len(x))) if reverse else range(len(x))
    if self.training:
        # one dropout mask, reused for the recurrent state at every time step
        hid_mask = SharedDropout.get_mask(hx_0[0], self.dropout)

    for t in steps:
        last_batch_size, batch_size = len(hx_i[0]), batch_sizes[t]
        if last_batch_size < batch_size:
            # more sequences become active (reverse direction):
            # re-attach the initial states of the newly active ones
            hx_i = [torch.cat((h, ih[last_batch_size:batch_size]))
                    for h, ih in zip(hx_i, hx_0)]
        else:
            # some sequences have ended: store their final states
            hx_n.append([h[batch_size:] for h in hx_i])
            hx_i = [h[:batch_size] for h in hx_i]
        hx_i = [h for h in cell(x[t], hx_i)]
        output.append(hx_i[0])
        if self.training:
            # dropout on the recurrent connection only
            hx_i[0] = hx_i[0] * hid_mask[:batch_size]
    if reverse:
        hx_n = hx_i
        output.reverse()
    else:
        hx_n.append(hx_i)
        hx_n = [torch.cat(h) for h in zip(*reversed(hx_n))]
    output = torch.cat(output)

    return output, hx_n
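# Example of the packed-sequence layout layer_forward operates on (illustration
# only, not part of the original source). For three sequences of lengths 4, 3
# and 1, pack_padded_sequence yields batch_sizes = [3, 2, 2, 1]: x[0] holds the
# first token of all three sequences, x[1] and x[2] hold tokens of the two
# longest ones, and x[3] only a token of the longest. In the forward direction
# the active batch therefore shrinks over time; in the reverse direction it
# grows, which is why the initial states in hx_0 are re-attached above.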
def __init__(self,
             input_dim,
             n_heads,
             query_dim,
             values_dim,
             ff_hidden_dim,
             att_dropout=0.,
             ff_dropout=0.,
             residual_att_dropout=0.,
             residual_ff_dropout=0.,
             att_proj_bias=False,
             shared_dropout=True,
             pre_ln=False,
             ball_norm=False):
    super(Layer, self).__init__()

    self.self_attn = MultiHeadAttention(
        input_dim=input_dim,
        n_heads=n_heads,
        query_dim=query_dim,
        values_dim=values_dim,
        # must be the same size because of the residual connection
        output_dim=input_dim,
        att_dropout=att_dropout,
        att_proj_bias=att_proj_bias)
    self.feed_forward = PositionwiseFeedForward(
        input_dim,
        hidden_dim=ff_hidden_dim,
        dropout=ff_dropout,
        shared_dropout=shared_dropout)

    self.layer_norm1 = LayerNorm(input_dim, ball=ball_norm)
    self.layer_norm2 = LayerNorm(input_dim, ball=ball_norm)
    self.pre_ln = pre_ln

    if shared_dropout:
        # ok because input is (batch, n words, features)
        self.dropout1 = SharedDropout(residual_att_dropout)
        self.dropout2 = SharedDropout(residual_ff_dropout)
    else:
        self.dropout1 = nn.Dropout(residual_att_dropout)
        self.dropout2 = nn.Dropout(residual_ff_dropout)
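# The forward pass of this layer is not shown here. A minimal sketch consistent
# with the attributes above (the self_attn call signature and the mask argument
# are assumptions, not the original code): with pre_ln the normalisation is
# applied before each sub-layer, otherwise after the residual sum.
#
#   def forward(self, x, mask=None):
#       if self.pre_ln:
#           x = x + self.dropout1(self.self_attn(self.layer_norm1(x), mask))
#           x = x + self.dropout2(self.feed_forward(self.layer_norm2(x)))
#       else:
#           x = self.layer_norm1(x + self.dropout1(self.self_attn(x, mask)))
#           x = self.layer_norm2(x + self.dropout2(self.feed_forward(x)))
#       return x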
def __init__(self, input_dim, hidden_dim, dropout=0.1, shared_dropout=True):
    super(PositionwiseFeedForward, self).__init__()
    self.w_1 = nn.Linear(input_dim, hidden_dim)
    self.w_2 = nn.Linear(hidden_dim, input_dim)
    self.relu = nn.ReLU()
    if shared_dropout:
        # we can use it because input is (batch, n words, n features)
        self.relu_dropout = SharedDropout(dropout)
    else:
        self.relu_dropout = nn.Dropout(dropout)
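# The corresponding forward pass is not shown above; a minimal sketch using
# only the attributes defined in __init__ (an assumption, not the original
# code): project up, apply ReLU and dropout, project back down.
#
#   def forward(self, x):
#       return self.w_2(self.relu_dropout(self.relu(self.w_1(x))))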
def __init__(self,
             dim_input,
             dim_output,
             dropout=0,
             activation="tanh",
             shared_dropout=True,
             negative_slope=0.1):
    super(MLP, self).__init__()

    if activation == "tanh":
        activation = nn.Tanh()
    elif activation == "relu":
        activation = nn.ReLU()
    elif activation == "elu":
        activation = nn.ELU()
    elif activation == "leaky_relu":
        activation = nn.LeakyReLU(negative_slope=negative_slope)
    else:
        raise RuntimeError("Unknown activation function: %s" % activation)

    self.seq = nn.Sequential(
        nn.Linear(dim_input, dim_output),
        activation,
        SharedDropout(p=dropout) if shared_dropout else nn.Dropout(dropout)
    )
    self.reset_parameters()
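# Minimal usage sketch (illustration only, assuming the class forwards its
# input through self.seq): project 800-dimensional encoder states down to 500
# dimensions, with the dropout mask shared across the word dimension.
#
#   mlp = MLP(dim_input=800, dim_output=500, dropout=0.33,
#             activation="leaky_relu", shared_dropout=True)
#   out = mlp(torch.rand(2, 10, 800))   # (batch=2, n words=10, 500)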
def forward(self, sequence, hx=None):
    x, batch_sizes = sequence.data, sequence.batch_sizes.tolist()
    batch_size = batch_sizes[0]
    h_n, c_n = [], []

    if hx is None:
        ih = x.new_zeros(self.num_layers * 2, batch_size, self.hidden_size)
        h, c = ih, ih
    else:
        h, c = self.permute_hidden(hx, sequence.sorted_indices)
    h = h.view(self.num_layers, 2, batch_size, self.hidden_size)
    c = c.view(self.num_layers, 2, batch_size, self.hidden_size)

    for i in range(self.num_layers):
        # split the packed data back into per-time-step tensors
        x = torch.split(x, batch_sizes)
        if self.training:
            # the same dropout mask is reused at every time step
            mask = SharedDropout.get_mask(x[0], self.dropout)
            x = [t * mask[:len(t)] for t in x]
        x_f, (h_f, c_f) = self.layer_forward(x=x,
                                             hx=(h[i, 0], c[i, 0]),
                                             cell=self.f_cells[i],
                                             batch_sizes=batch_sizes)
        x_b, (h_b, c_b) = self.layer_forward(x=x,
                                             hx=(h[i, 1], c[i, 1]),
                                             cell=self.b_cells[i],
                                             batch_sizes=batch_sizes,
                                             reverse=True)
        x = torch.cat((x_f, x_b), -1)
        h_n.append(torch.stack((h_f, h_b)))
        c_n.append(torch.stack((c_f, c_b)))

    x = PackedSequence(x,
                       sequence.batch_sizes,
                       sequence.sorted_indices,
                       sequence.unsorted_indices)
    hx = torch.cat(h_n, 0), torch.cat(c_n, 0)
    hx = self.permute_hidden(hx, sequence.unsorted_indices)

    return x, hx
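# Usage sketch (illustration only; the BiLSTM constructor is assumed to take
# (input_size, hidden_size, num_layers, dropout) as in the calls above):
#
#   lstm = BiLSTM(100, 400, num_layers=3, dropout=0.33)
#   packed = nn.utils.rnn.pack_padded_sequence(torch.rand(2, 10, 100),
#                                              lengths=torch.tensor([10, 7]),
#                                              batch_first=True,
#                                              enforce_sorted=False)
#   out, (h_n, c_n) = lstm(packed)
#   h, _ = nn.utils.rnn.pad_packed_sequence(out, batch_first=True)  # (2, 10, 800)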
def __init__(self,
             args,
             n_cont_labels,
             n_disc_labels,
             n_tags=-1,
             default_lstm_init=False,
             old=False,
             **arg_dict):
    super(BiaffineParserNetwork, self).__init__()

    if n_cont_labels == 0 or n_disc_labels == 0:
        raise RuntimeError("Cannot instantiate if number of labels=0")

    self.feature_extractor = FeatureExtractionModule(
        args,
        n_tags=n_tags,
        default_lstm_init=default_lstm_init,
        **arg_dict)
    w_input_dim = self.feature_extractor.output_dim

    # stack of BiLSTM encoders: the first stack reads the extracted features,
    # each following stack reads the output of the previous one
    self.bilstms = nn.ModuleList(
        BiLSTM(w_input_dim if i == 0 else args.lstm_dim * 2,
               args.lstm_dim,
               num_layers=args.lstm_layers,
               dropout=args.lstm_dropout)
        for i in range(args.lstm_stacks))
    self.bilstm_dropout = SharedDropout(p=args.lstm_dropout)

    if args.tagger:
        if n_tags <= 0:
            raise RuntimeError("Invalid number of tags")
        # auxiliary tagger on top of the stack selected by args.tagger_stack
        self.tagger = nn.Linear(args.lstm_dim * 2,
                                n_tags if old else n_tags - 2,
                                bias=True)
        if not (1 <= args.tagger_stack <= args.lstm_stacks):
            raise RuntimeError("Invalid stack index")
        self.tagger_stack = args.tagger_stack
    else:
        self.tagger = None

    # one labelled biaffine scorer per span type
    self.label_weights = nn.ModuleDict({
        "cont": BatchedBiaffine(input_dim=args.lstm_dim * 2,
                                proj_dim=args.label_proj_dim,
                                n_labels=n_cont_labels,
                                activation="leaky_relu",
                                dropout=args.mlp_dropout,
                                output_bias=False),
        "disc": BatchedBiaffine(input_dim=args.lstm_dim * 2,
                                proj_dim=args.label_proj_dim,
                                n_labels=n_disc_labels,
                                activation="leaky_relu",
                                dropout=args.mlp_dropout,
                                output_bias=False),
        "gap": BatchedBiaffine(input_dim=args.lstm_dim * 2,
                               proj_dim=args.label_proj_dim,
                               n_labels=n_disc_labels,
                               activation="leaky_relu",
                               dropout=args.mlp_dropout,
                               output_bias=False),
    })
    # one unlabelled (single-output) biaffine scorer per span type
    self.span_weights = nn.ModuleDict({
        "cont": BatchedBiaffine(input_dim=args.lstm_dim * 2,
                                proj_dim=args.span_proj_dim,
                                n_labels=1,
                                activation="leaky_relu",
                                dropout=args.mlp_dropout,
                                output_bias=False),
        "disc": BatchedBiaffine(input_dim=args.lstm_dim * 2,
                                proj_dim=args.span_proj_dim,
                                n_labels=1,
                                activation="leaky_relu",
                                dropout=args.mlp_dropout,
                                output_bias=False),
        "gap": BatchedBiaffine(input_dim=args.lstm_dim * 2,
                               proj_dim=args.span_proj_dim,
                               n_labels=1,
                               activation="leaky_relu",
                               dropout=args.mlp_dropout,
                               output_bias=False),
    })