def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None):
    discriminator_hidden_states = self.electra(input_ids, attention_mask, token_type_ids)
    # take the top layer: (batch_size, max_length, hidden_size)
    discriminator_hidden_states = discriminator_hidden_states[0]

    # (batch_size, max_length, hidden_size) -> (batch_size, max_length, 2 * lstm_hidden)
    lstm_output, (hidden, cell) = self.biLSTM(discriminator_hidden_states)
    # cls_output = lstm_output[:, 0, :]  # [batch, length, hidden]
    cls_output = self.dropout(lstm_output)

    # add a channel dimension for the 2-D convolutions
    cls_output.unsqueeze_(1)
    conved = [conv(cls_output).squeeze(3) for conv in self.CNN]
    pooled = [F.max_pool1d(conv, conv.size(2)).squeeze(2) for conv in conved]
    concated = torch.cat(pooled, dim=1)
    cls_output = self.linear_cnn(concated)

    # (batch_size, hidden_size) -> (batch_size, hidden_size)
    cls_output = self.linear_1(cls_output)
    cls_output = get_activation("gelu")(cls_output)
    cls_output = self.dropout(cls_output)
    # (batch_size, hidden_size) -> (batch_size, num_labels)
    cls_output = self.linear_2(cls_output)

    if labels is not None:
        loss_fct = nn.CrossEntropyLoss()
        # loss_fct(predictions: 2-D (batch, num_labels), targets: 1-D (batch))
        loss = loss_fct(cls_output, labels)
        return loss, self.softmax(cls_output)
    else:
        return self.softmax(cls_output)
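# A minimal sketch of the constructor the forward() above appears to assume:
# ELECTRA encoder -> BiLSTM -> parallel CNN filters -> pooled classifier. The
# class name, filter widths, and hidden sizes are hypothetical, not from the
# source.
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import ElectraModel
from transformers.activations import get_activation

class ElectraBiLSTMCNNClassifier(nn.Module):
    def __init__(self, hidden_size=768, lstm_hidden=384, num_filters=100,
                 filter_sizes=(3, 4, 5), num_labels=2, dropout=0.1):
        super().__init__()
        self.electra = ElectraModel.from_pretrained(
            "google/electra-base-discriminator")
        # bidirectional, so the per-token width becomes 2 * lstm_hidden
        self.biLSTM = nn.LSTM(hidden_size, lstm_hidden, batch_first=True,
                              bidirectional=True)
        # one Conv2d per filter width over the (1, length, 2 * lstm_hidden) map
        self.CNN = nn.ModuleList(
            nn.Conv2d(1, num_filters, (fs, 2 * lstm_hidden))
            for fs in filter_sizes)
        self.linear_cnn = nn.Linear(num_filters * len(filter_sizes), hidden_size)
        self.linear_1 = nn.Linear(hidden_size, hidden_size)
        self.linear_2 = nn.Linear(hidden_size, num_labels)
        self.dropout = nn.Dropout(dropout)
        self.softmax = nn.Softmax(dim=-1)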
def activate(var, method):
    """Apply an activation function.

    :param var: input variable
    :param method: type of activation, such as `relu`, `tanh`, `sigmoid`
    """
    from activations import get_activation
    return get_activation(method)(var)
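# Usage sketch (hypothetical): assuming the local `activations` module's
# get_activation("relu") returns a callable, this applies ReLU to `scores`:
#
#     hidden = activate(scores, "relu")  # same as get_activation("relu")(scores)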
def forward(self, features, **kwargs):
    x = features[:, 0, :]  # take the <s> token (equivalent to [CLS])
    x = self.dropout(x)
    x = self.dense(x)
    # although BERT uses tanh here, the ELECTRA authors used gelu
    x = get_activation("gelu")(x)
    x = self.dropout(x)
    x = self.out_proj(x)
    return x
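# Hedged sketch of the constructor this head's forward() assumes; it mirrors
# ElectraClassificationHead in Hugging Face Transformers, but the class name
# here is illustrative:
import torch.nn as nn
from transformers.activations import get_activation

class ClassificationHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)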
def get_train_output(self, input):
    scores = T.dot(input, self.W)
    if self.use_bias:
        scores += self.b
    output = activations.get_activation(activ_type=self.activation,
                                        x=scores,
                                        leak_slope=self.leak_slope,
                                        clip_threshold=self.clip_threshold)
    return output
def __init__(self, rng, input, n_in, n_hidden, n_out, activations="tanh"): """Initialize the parameters for the multilayer perceptron :type rng: numpy.random.RandomState :param rng: a random number generator used to initialize weights :type input: theano.tensor.TensorType :param input: symbolic variable that describes the input of the architecture (one minibatch) :type n_in: int :param n_in: number of input units, the dimension of the space in which the datapoints lie :type n_hidden: int :param n_hidden: number of hidden units :type n_out: int :param n_out: number of output units, the dimension of the space in which the labels lie """ # Since we are dealing with a one hidden layer MLP, this will translate # into a HiddenLayer with a tanh activation function connected to the # LogisticRegression layer; the activation function can be replaced by # sigmoid or any other nonlinear function activation = act.get_activation(activations) self.hiddenLayer = HiddenLayer(rng=rng, input=input, n_in=n_in, n_out=n_hidden, activation=activation) # The logistic regression layer gets as input the hidden units # of the hidden layer self.logRegressionLayer = LogisticRegression( input=self.hiddenLayer.output, n_in=n_hidden, n_out=n_out) self.L1 = (abs(self.hiddenLayer.W).sum() + abs(self.logRegressionLayer.W).sum()) self.L2_sqr = ((self.hiddenLayer.W ** 2).sum() + (self.logRegressionLayer.W ** 2).sum()) # negative log likelihood of the MLP self.negative_log_likelihood = ( self.logRegressionLayer.negative_log_likelihood ) # same holds for the function computing the number of errors self.errors = self.logRegressionLayer.errors # the parameters of the model are the parameters of the two layer it is # made out of self.params = self.hiddenLayer.params + self.logRegressionLayer.params # end-snippet-3 # keep track of model input self.input = input
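# Hedged usage sketch, assuming the HiddenLayer, LogisticRegression, and `act`
# helpers from the classic Theano MLP tutorial; the class name `MLP` and the
# regularization weights are assumptions:
import numpy
import theano.tensor as T

rng = numpy.random.RandomState(1234)
x = T.matrix('x')   # minibatch of flattened inputs
y = T.ivector('y')  # integer class labels
classifier = MLP(rng=rng, input=x, n_in=28 * 28, n_hidden=500, n_out=10,
                 activations="tanh")
# L1/L2-regularized training cost
cost = (classifier.negative_log_likelihood(y)
        + 0.00 * classifier.L1
        + 0.0001 * classifier.L2_sqr)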
def forward(self, generator_hidden_states):
    hidden_states = self.dense(generator_hidden_states)
    hidden_states = get_activation("gelu")(hidden_states)
    hidden_states = self.LayerNorm(hidden_states)
    return hidden_states
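# Hedged sketch of the modules this generator head's forward() assumes; it
# mirrors ElectraGeneratorPredictions in Hugging Face Transformers, projecting
# hidden states down to the embedding size before the MLM output layer:
import torch.nn as nn

class GeneratorPredictions(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.embedding_size)
        self.LayerNorm = nn.LayerNorm(config.embedding_size)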
def forward(self, discriminator_hidden_states):
    hidden_states = self.dense(discriminator_hidden_states)
    hidden_states = get_activation(self.config.hidden_act)(hidden_states)
    # squeeze only the last (size-1) dimension; a bare .squeeze() would also
    # drop the batch dimension when batch_size == 1
    logits = self.dense_prediction(hidden_states).squeeze(-1)
    return logits
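# Hedged sketch of the modules this discriminator head's forward() assumes; it
# mirrors ElectraDiscriminatorPredictions in Hugging Face Transformers, which
# emits one replaced-vs-original logit per token position:
import torch.nn as nn

class DiscriminatorPredictions(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dense_prediction = nn.Linear(config.hidden_size, 1)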
def __init__(self, activation, **params):
    self.activation_name = activation
    self.activation = get_activation(self.activation_name)
    self._last_input = None
    super(Activation, self).__init__(**params)
def __init__(self, rng, input, n_in, n_hidden, n_out, activations="tanh",
             use_bias=True, dropout=False, dropout_rate=0):
    """Initialize the parameters for the multilayer perceptron

    :type rng: numpy.random.RandomState
    :param rng: a random number generator used to initialize weights

    :type input: theano.tensor.TensorType
    :param input: symbolic variable that describes the input of the
                  architecture (one minibatch)

    :type n_in: int
    :param n_in: number of input units, the dimension of the space in
                 which the datapoints lie

    :type n_hidden: int
    :param n_hidden: number of hidden units

    :type n_out: int
    :param n_out: number of output units, the dimension of the space in
                  which the labels lie

    :param activations: name of the hidden-layer activation function

    :param use_bias: whether the hidden layers use a bias term

    :param dropout: whether to apply dropout; if False, dropout_rate is
                    forced to 0

    :param dropout_rate: probability of dropping a unit during training
    """
    # Since we are dealing with a one-hidden-layer MLP, this translates
    # into a HiddenLayer connected to the LogisticRegression layer; the
    # activation function can be replaced by sigmoid or any other
    # nonlinearity.
    #
    # For dropout, we basically need to set up two different MLPs --
    # one with dropout layers (used for training) and one for
    # prediction. [Question -- can we get error bounds if we run
    # forward propagation on the random dropout network a bunch of
    # times? Might these be calibrated probabilities? Probably not...
    # Not sure if this is necessary -- but just in case for now.]
    if not dropout:
        dropout_rate = 0

    activation = act.get_activation(activations)

    next_layer_input = input
    next_dropout_layer_input = _dropout_from_layer(
        rng, input, dropout_rate=dropout_rate)
    next_dropout_layer = DropoutHiddenLayer(
        rng=rng,
        input=next_dropout_layer_input,
        n_in=n_in, n_out=n_hidden,
        activation=activation,
        use_bias=use_bias,
        dropout_rate=dropout_rate)
    next_dropout_layer_input = next_dropout_layer.output

    # Reuse the parameters from the dropout layer here, in a different
    # path through the graph.
    # [Could be a constructor that takes a dropout hidden layer.]
    next_layer = HiddenLayer(
        rng=rng,
        input=next_layer_input,
        activation=activation,
        # scale the weight matrix W by the probability of keeping a unit
        W=next_dropout_layer.W * (1 - dropout_rate),
        b=next_dropout_layer.b,
        n_in=n_in, n_out=n_hidden,
        use_bias=use_bias)
    next_layer_input = next_layer.output

    # Now we set up the logistic regression (i.e. softmax) output
    # layers for the dropout network and the regular network
    self.dropout_output_layer = LogisticRegression(
        input=next_dropout_layer_input,
        n_in=n_hidden, n_out=n_out)

    self.output_layer = LogisticRegression(
        input=next_layer_input,
        n_in=n_hidden, n_out=n_out,
        W=self.dropout_output_layer.W * (1 - dropout_rate),
        b=self.dropout_output_layer.b)

    # self.L1 = (abs(self.hiddenLayer.W).sum()
    #            + abs(self.logRegressionLayer.W).sum())
    # self.L2_sqr = ((self.hiddenLayer.W ** 2).sum()
    #                + (self.logRegressionLayer.W ** 2).sum())

    self.dropout_nll = self.dropout_output_layer.negative_log_likelihood
    self.dropout_errors = self.dropout_output_layer.errors

    self.nll = self.output_layer.negative_log_likelihood
    self.errors = self.output_layer.errors

    # The parameters of the dropout and non-dropout paths are shared, but
    # we need to collect the ones in the dropout layers, because those
    # are the shared variables; the ones in next_layer are derived
    # (weight-scaled) versions.
    self.params = (self.dropout_output_layer.params
                   + next_dropout_layer.params)

    # keep track of model input
    self.input = input
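# Hedged usage sketch (reusing rng, x, y from the sketch above): train on the
# stochastic dropout path, evaluate on the weight-scaled deterministic path.
# The class name `DropoutMLP` is an assumption:
classifier = DropoutMLP(rng=rng, input=x, n_in=28 * 28, n_hidden=500,
                        n_out=10, activations="tanh",
                        dropout=True, dropout_rate=0.5)
train_cost = classifier.dropout_nll(y)  # dropout network, for gradient steps
test_errors = classifier.errors(y)      # rescaled network, for evaluation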