def __init__(self, args, n_labels, default_lstm_init=False, **arg_dict):
        super(Network, self).__init__()

        self.feature_extractor = FeatureExtractionModule(
            args, default_lstm_init=default_lstm_init, **arg_dict)
        w_input_dim = self.feature_extractor.output_dim

        self.bilstm = BiLSTM(w_input_dim,
                             args.lstm_dim,
                             num_layers=args.lstm_layers,
                             dropout=args.lstm_dropout)
        self.bilstm_dropout = SharedDropout(p=args.lstm_dropout)

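        # biaffine scorers over pairs of BiLSTM states: one scores every label,
        # the other (n_labels=1) produces a single span score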
        self.label_weights = BatchedBiaffine(input_dim=args.lstm_dim * 2,
                                             proj_dim=args.label_proj_dim,
                                             n_labels=n_labels,
                                             activation="leaky_relu",
                                             dropout=args.mlp_dropout,
                                             output_bias=False)
        self.span_weights = BatchedBiaffine(
            input_dim=args.lstm_dim * 2,
            proj_dim=args.span_proj_dim,
            n_labels=1,
            activation="leaky_relu",
            dropout=args.mlp_dropout,
            bias_y=True,  # should be False but maybe there is a bug...
            output_bias=False)

    def layer_forward(self, x, hx, cell, batch_sizes, reverse=False):
        hx_0 = hx_i = hx
        hx_n, output = [], []
        steps = reversed(range(len(x))) if reverse else range(len(x))
        if self.training:
            hid_mask = SharedDropout.get_mask(hx_0[0], self.dropout)

        for t in steps:
            last_batch_size, batch_size = len(hx_i[0]), batch_sizes[t]
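            # the per-step batch size shrinks in the forward pass (finished
            # sequences are split off into hx_n) and grows in the reverse
            # pass (new rows are taken from the initial state hx_0)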
            if last_batch_size < batch_size:
                hx_i = [
                    torch.cat((h, ih[last_batch_size:batch_size]))
                    for h, ih in zip(hx_i, hx_0)
                ]
            else:
                hx_n.append([h[batch_size:] for h in hx_i])
                hx_i = [h[:batch_size] for h in hx_i]
            hx_i = [h for h in cell(x[t], hx_i)]
            output.append(hx_i[0])
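            # reuse the same dropout mask at every time step (variational
            # dropout on the recurrent hidden state)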
            if self.training:
                hx_i[0] = hx_i[0] * hid_mask[:batch_size]
        if reverse:
            hx_n = hx_i
            output.reverse()
        else:
            hx_n.append(hx_i)
            hx_n = [torch.cat(h) for h in zip(*reversed(hx_n))]
        output = torch.cat(output)

        return output, hx_n
Example #3
    def __init__(self,
                 input_dim,
                 n_heads,
                 query_dim,
                 values_dim,
                 ff_hidden_dim,
                 att_dropout=0.,
                 ff_dropout=0.,
                 residual_att_dropout=0.,
                 residual_ff_dropout=0.,
                 att_proj_bias=False,
                 shared_dropout=True,
                 pre_ln=False,
                 ball_norm=False):
        super(Layer, self).__init__()

        self.self_attn = MultiHeadAttention(
            input_dim=input_dim,
            n_heads=n_heads,
            query_dim=query_dim,
            values_dim=values_dim,
            output_dim=input_dim,  # must be the same size because of the residual connection
            att_dropout=att_dropout,
            att_proj_bias=att_proj_bias)
        self.feed_forward = PositionwiseFeedForward(
            input_dim,
            hidden_dim=ff_hidden_dim,
            dropout=ff_dropout,
            shared_dropout=shared_dropout)

        self.layer_norm1 = LayerNorm(input_dim, ball=ball_norm)
        self.layer_norm2 = LayerNorm(input_dim, ball=ball_norm)
        self.pre_ln = pre_ln

        if shared_dropout:
            # ok because input is (batch, n word, features)
            self.dropout1 = SharedDropout(residual_att_dropout)
            self.dropout2 = SharedDropout(residual_ff_dropout)
        else:
            self.dropout1 = nn.Dropout(residual_att_dropout)
            self.dropout2 = nn.Dropout(residual_ff_dropout)
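
Only the constructor of this Transformer layer is excerpted. Below is a minimal sketch of how these submodules are typically composed in a forward pass; the method signature, the self_attn(x, mask=mask) call convention, and the exact residual/normalization ordering selected by pre_ln are assumptions, not code from the original project.

    def forward(self, x, mask=None):
        # sketch only: pre-LN normalizes before each sublayer, post-LN
        # normalizes after the residual addition
        if self.pre_ln:
            x = x + self.dropout1(self.self_attn(self.layer_norm1(x), mask=mask))
            x = x + self.dropout2(self.feed_forward(self.layer_norm2(x)))
        else:
            x = self.layer_norm1(x + self.dropout1(self.self_attn(x, mask=mask)))
            x = self.layer_norm2(x + self.dropout2(self.feed_forward(x)))
        return x
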
Example #4
    def __init__(self,
                 input_dim,
                 hidden_dim,
                 dropout=0.1,
                 shared_dropout=True):
        super(PositionwiseFeedForward, self).__init__()
        self.w_1 = nn.Linear(input_dim, hidden_dim)
        self.w_2 = nn.Linear(hidden_dim, input_dim)
        self.relu = nn.ReLU()

        if shared_dropout:
            # we can use it because input is (batch, n word, n features)
            self.relu_dropout = SharedDropout(dropout)
        else:
            self.relu_dropout = nn.Dropout(dropout)
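
The forward pass of PositionwiseFeedForward is not included in this excerpt; a minimal sketch, assuming the standard two-layer composition implied by the modules defined above:

    def forward(self, x):
        # sketch: expand to hidden_dim, apply ReLU and (shared) dropout,
        # then project back down to input_dim
        return self.w_2(self.relu_dropout(self.relu(self.w_1(x))))
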
    def __init__(self,
                 dim_input,
                 dim_output,
                 dropout=0,
                 activation="tanh",
                 shared_dropout=True,
                 negative_slope=0.1):
        super(MLP, self).__init__()

        if activation == "tanh":
            activation = nn.Tanh()
        elif activation == "relu":
            activation = nn.ReLU()
        elif activation == "elu":
            activation = nn.ELU()
        elif activation == "leaky_relu":
            activation = nn.LeakyReLU(negative_slope=negative_slope)
        else:
            raise RuntimeError("Unknown activation function: %s" % activation)

        self.seq = nn.Sequential(
            nn.Linear(dim_input, dim_output),
            activation,
            SharedDropout(p=dropout) if shared_dropout else nn.Dropout(dropout)
        )

        self.reset_parameters()
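
forward and reset_parameters of this MLP are referenced above but not shown; a minimal sketch, where the orthogonal weight initialization is an assumption rather than the project's actual scheme:

    def forward(self, x):
        # sketch: the MLP is just the sequential block built in __init__
        return self.seq(x)

    def reset_parameters(self):
        # assumed initialization: orthogonal weight matrix, zero bias
        nn.init.orthogonal_(self.seq[0].weight)
        nn.init.zeros_(self.seq[0].bias)
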
    def forward(self, sequence, hx=None):
        x, batch_sizes = sequence.data, sequence.batch_sizes.tolist()
        batch_size = batch_sizes[0]
        h_n, c_n = [], []

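        # with no initial state provided, start both directions of every
        # layer from zeros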
        if hx is None:
            ih = x.new_zeros(self.num_layers * 2, batch_size, self.hidden_size)
            h, c = ih, ih
        else:
            h, c = self.permute_hidden(hx, sequence.sorted_indices)
        h = h.view(self.num_layers, 2, batch_size, self.hidden_size)
        c = c.view(self.num_layers, 2, batch_size, self.hidden_size)

        for i in range(self.num_layers):
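            # split the flat packed data into one tensor per time step; during
            # training, drop the same input units at every step (shared mask)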
            x = torch.split(x, batch_sizes)
            if self.training:
                mask = SharedDropout.get_mask(x[0], self.dropout)
                x = [x_t * mask[:len(x_t)] for x_t in x]
            x_f, (h_f, c_f) = self.layer_forward(x=x,
                                                 hx=(h[i, 0], c[i, 0]),
                                                 cell=self.f_cells[i],
                                                 batch_sizes=batch_sizes)
            x_b, (h_b, c_b) = self.layer_forward(x=x,
                                                 hx=(h[i, 1], c[i, 1]),
                                                 cell=self.b_cells[i],
                                                 batch_sizes=batch_sizes,
                                                 reverse=True)
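            # concatenate forward and backward outputs to form the input of
            # the next stacked layer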
            x = torch.cat((x_f, x_b), -1)
            h_n.append(torch.stack((h_f, h_b)))
            c_n.append(torch.stack((c_f, c_b)))
        x = PackedSequence(x, sequence.batch_sizes, sequence.sorted_indices,
                           sequence.unsorted_indices)
        hx = torch.cat(h_n, 0), torch.cat(c_n, 0)
        hx = self.permute_hidden(hx, sequence.unsorted_indices)

        return x, hx
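
A short usage sketch for this forward pass, assuming the BiLSTM class excerpted here is importable and constructed as in the examples above (input dim, LSTM dim, num_layers, dropout); shapes and hyperparameters are illustrative only.

import torch
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

lstm = BiLSTM(100, 400, num_layers=3, dropout=0.33)    # assumed constructor
features = torch.randn(8, 20, 100)                     # (batch, max_len, dim)
lengths = torch.tensor([20, 18, 15, 12, 9, 7, 4, 2])   # sentence lengths
packed = pack_padded_sequence(features, lengths, batch_first=True,
                              enforce_sorted=False)
packed_out, (h_n, c_n) = lstm(packed)
output, _ = pad_packed_sequence(packed_out, batch_first=True)  # (8, 20, 800)
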
    def __init__(self,
                 args,
                 n_cont_labels,
                 n_disc_labels,
                 n_tags=-1,
                 default_lstm_init=False,
                 old=False,
                 **arg_dict):
        super(BiaffineParserNetwork, self).__init__()

        if n_cont_labels == 0 or n_disc_labels == 0:
            raise RuntimeError("Cannot instantiate if number of labels=0")

        self.feature_extractor = FeatureExtractionModule(
            args,
            n_tags=n_tags,
            default_lstm_init=default_lstm_init,
            **arg_dict)
        w_input_dim = self.feature_extractor.output_dim

        self.bilstms = nn.ModuleList(
            BiLSTM(w_input_dim if i == 0 else args.lstm_dim * 2,
                   args.lstm_dim,
                   num_layers=args.lstm_layers,
                   dropout=args.lstm_dropout) for i in range(args.lstm_stacks))
        self.bilstm_dropout = SharedDropout(p=args.lstm_dropout)

        if args.tagger:
            if n_tags <= 0:
                raise RuntimeError("Invalid number of tags")
            self.tagger = nn.Linear(args.lstm_dim * 2,
                                    n_tags if old else n_tags - 2,
                                    bias=True)
            if not (1 <= args.tagger_stack <= args.lstm_stacks):
                raise RuntimeError("Invalid stack index")
            self.tagger_stack = args.tagger_stack
        else:
            self.tagger = None

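        # one biaffine label scorer per span type ("cont", "disc", "gap")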
        self.label_weights = nn.ModuleDict({
            "cont":
            BatchedBiaffine(input_dim=args.lstm_dim * 2,
                            proj_dim=args.label_proj_dim,
                            n_labels=n_cont_labels,
                            activation="leaky_relu",
                            dropout=args.mlp_dropout,
                            output_bias=False),
            "disc":
            BatchedBiaffine(input_dim=args.lstm_dim * 2,
                            proj_dim=args.label_proj_dim,
                            n_labels=n_disc_labels,
                            activation="leaky_relu",
                            dropout=args.mlp_dropout,
                            output_bias=False),
            "gap":
            BatchedBiaffine(input_dim=args.lstm_dim * 2,
                            proj_dim=args.label_proj_dim,
                            n_labels=n_disc_labels,
                            activation="leaky_relu",
                            dropout=args.mlp_dropout,
                            output_bias=False),
        })
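        # matching single-output span scorers (n_labels=1) for each span type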
        self.span_weights = nn.ModuleDict({
            "cont":
            BatchedBiaffine(input_dim=args.lstm_dim * 2,
                            proj_dim=args.span_proj_dim,
                            n_labels=1,
                            activation="leaky_relu",
                            dropout=args.mlp_dropout,
                            output_bias=False),
            "disc":
            BatchedBiaffine(input_dim=args.lstm_dim * 2,
                            proj_dim=args.span_proj_dim,
                            n_labels=1,
                            activation="leaky_relu",
                            dropout=args.mlp_dropout,
                            output_bias=False),
            "gap":
            BatchedBiaffine(input_dim=args.lstm_dim * 2,
                            proj_dim=args.span_proj_dim,
                            n_labels=1,
                            activation="leaky_relu",
                            dropout=args.mlp_dropout,
                            output_bias=False),
        })