def __init__(self, config):
    super().__init__(config)
    self.num_labels = config.num_labels
    self.bert = BertModel(config, add_pooling_layer=False)
    self.cls = BertOnlyMLMHead(config)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.dense = nn.Linear(config.hidden_size, config.hidden_size)
    self.activation = nn.Tanh()
    self.classifier = nn.Linear(config.hidden_size, config.num_labels)
    self.init_weights()
    # MLM head is not trained
    for param in self.cls.parameters():
        param.requires_grad = False

def __init__(self, config):
    super().__init__(config)
    self.bert = BertModel(config)
    self.cls = BertOnlyMLMHead(config)
    self.loss_fct = CrossEntropyLoss()  # -100 index = padding token; initialize once to speed up.
    self.init_weights()

class LOTClassModel(BertPreTrainedModel):

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.bert = BertModel(config, add_pooling_layer=False)
        self.cls = BertOnlyMLMHead(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.init_weights()
        # MLM head is not trained
        for param in self.cls.parameters():
            param.requires_grad = False

    def forward(self, input_ids, pred_mode, attention_mask=None, token_type_ids=None,
                position_ids=None, head_mask=None, inputs_embeds=None):
        bert_outputs = self.bert(input_ids,
                                 attention_mask=attention_mask,
                                 token_type_ids=token_type_ids,
                                 position_ids=position_ids,
                                 head_mask=head_mask,
                                 inputs_embeds=inputs_embeds)
        last_hidden_states = bert_outputs[0]
        if pred_mode == "classification":
            trans_states = self.dense(last_hidden_states)
            trans_states = self.activation(trans_states)
            trans_states = self.dropout(trans_states)
            logits = self.classifier(trans_states)
        elif pred_mode == "mlm":
            logits = self.cls(last_hidden_states)
        else:
            sys.exit("Wrong pred_mode!")
        return logits

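A minimal usage sketch for LOTClassModel above (hypothetical, not part of the original repository), assuming the usual Hugging Face imports are in scope; the toy config and shapes are illustrative only:

import torch
from transformers import BertConfig

config = BertConfig(num_labels=4)  # toy config; real usage loads a pretrained checkpoint
model = LOTClassModel(config)
input_ids = torch.randint(0, config.vocab_size, (2, 16))
attention_mask = torch.ones_like(input_ids)

# per-token label logits: (2, 16, num_labels)
cls_logits = model(input_ids, pred_mode="classification", attention_mask=attention_mask)
# MLM vocabulary logits: (2, 16, vocab_size)
mlm_logits = model(input_ids, pred_mode="mlm", attention_mask=attention_mask)
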
def __init__(self, config):
    super(GlyceBertForMaskedLM, self).__init__(config)
    self.bert = GlyceBertModel(config)
    self.cls = BertOnlyMLMHead(config)
    self.init_weights()

def __init__(self, config, tokenized_slot_meta, pad_idx=0):
    super(TRADE, self).__init__()
    self.model_type = config.model_type
    if self.model_type == "BERT":
        # use the encoder that matches model_type
        self.encoder = BERTEncoder(config.model_name_or_path)
    elif self.model_type == "GRU":
        self.encoder = GRUEncoder(
            config.vocab_size,
            config.hidden_size,
            1,
            config.hidden_dropout_prob,
            config.proj_dim,
            pad_idx,
        )
    self.decoder = SlotGenerator(
        config.vocab_size,
        config.hidden_size,
        config.hidden_dropout_prob,
        config.n_gate,
        config.proj_dim,
        pad_idx,
    )
    self.decoder.set_slot_idx(tokenized_slot_meta)
    self.mlm_head = BertOnlyMLMHead(config)
    self.tie_weight()

def __init__(self, config: Config, *args, **kwargs):
    super().__init__(config, *args, **kwargs)
    self.cls = BertOnlyMLMHead(self.config)
    loss_dict = dict(
        mse=torch.nn.MSELoss(),
        cosine=torch.nn.CosineSimilarity(dim=1),
        contrastive=RefinerContrastiveLoss(),
        ms=RefinerMSLoss(),
    )
    self.refiner_loss = loss_dict.get(self.config.loss_type)
    self.refiner_decoder = {}
    self.weights = {}
    for i, modality in enumerate(self.config.modalities):
        self.refiner_decoder[modality] = MLP(
            input_dim=self.config.hidden_size,
            mlp_dims=[self.config.hidden_size],
            dropout=self.config.hidden_dropout_prob,
            nonlinearity=torch.nn.ReLU,
            normalization=torch.nn.LayerNorm,
        )
        self.weights[modality] = self.config.weights[i]
    self.modalities = self.config.modalities
    self.tol = self.config.tol
    self.refiner_target_pooler = self.config.refiner_target_pooler
    self.refiner_target_layer_depth = self.config.refiner_target_layer_depth
    self.loss_name = self.config.loss_name
    pool_class = registry.get_pool_class(self.refiner_target_pooler)
    if pool_class is None:
        raise ValueError(
            f"No pooler {self.refiner_target_pooler} is registered to registry"
        )
    self.pooler = pool_class(self.refiner_target_layer_depth)

def __init__(self, config, mask_word_id=0, search_beam_size=1, length_penalty=1.0,
             eos_id=0, sos_id=0, forbid_duplicate_ngrams=False, forbid_ignore_set=None,
             ngram_size=3, min_len=0):
    super(UnilmForSeq2SeqDecode, self).__init__(config)
    self.bert = UnilmModelIncr(config)
    self.cls = BertOnlyMLMHead(config)
    self.crit_mask_lm = nn.CrossEntropyLoss(reduction='none')
    self.mask_word_id = mask_word_id
    self.search_beam_size = search_beam_size
    self.length_penalty = length_penalty
    self.eos_id = eos_id
    self.sos_id = sos_id
    self.forbid_duplicate_ngrams = forbid_duplicate_ngrams
    self.forbid_ignore_set = forbid_ignore_set
    self.ngram_size = ngram_size
    self.min_len = min_len
    self.init_weights()
    self.tie_weights()

def __init__(self, config, args, tokenizer):
    super(DecoderWithLoss, self).__init__()
    # model components
    print("initializing decoder with params {}".format(args))
    self.bert = BertModel(config)
    self.lm_head = BertOnlyMLMHead(config)
    self.span_b_proj = nn.ModuleList(
        [HighwayLayer(config.hidden_size) for _ in range(args.num_highway)])
    self.span_e_proj = nn.ModuleList(
        [HighwayLayer(config.hidden_size) for _ in range(args.num_highway)])
    # predict text span beginning and end
    self.text_span_start_head = nn.Linear(config.hidden_size, config.hidden_size)
    self.text_span_end_head = nn.Linear(config.hidden_size, config.hidden_size)
    # loss functions
    if args.node_label_smoothing > 0:
        self.lm_ce_loss = LabelSmoothingLoss(
            args.node_label_smoothing,
            config.vocab_size,
            ignore_index=tokenizer.pad_token_id)
    else:
        self.lm_ce_loss = torch.nn.CrossEntropyLoss(
            ignore_index=tokenizer.pad_token_id, reduction="none")
    self.span_ce_loss = torch.nn.CrossEntropyLoss(ignore_index=-1, reduction="none")
    self.span_loss_lb = args.lambda_span_loss
    self.text_span_loss = torch.nn.CrossEntropyLoss(ignore_index=-1, reduction="none")
    self.tree_to_text = args.tree_to_text

def __init__(self, config):
    super().__init__(config)
    self.bert = BertModel(config)
    self.cls = BertOnlyMLMHead(config)
    self.init_weights()

def __init__(self, config: BertConfig, **kwargs: Any):
    """The classification init is a superset of the LM init."""
    super().__init__(config, **kwargs)
    self.config = config
    self.bert = BertModel(config=self.config)
    self.lm_head = BertOnlyMLMHead(self.config)
    self.lm_head.apply(self._init_weights)
    self.qa_head = BertOnlyMLMHead(self.config)
    self.qa_head.apply(self._init_weights)
    self.dropout = nn.Dropout(self.config.hidden_dropout_prob)
    self.classifier = nn.Linear(self.config.hidden_size, self.config.num_labels)
    self.classifier.apply(self._init_weights)

def __init__(self, config):
    super().__init__(config)
    self.num_labels = config.num_labels
    self.bert = BertModel(config)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.classifier = nn.Linear(config.hidden_size, config.num_labels)
    self.use_crf = False
    self.predict_masked = False
    if hasattr(config, "task_specific_params"):
        other_config = config.task_specific_params
        self.num_labels_boundaries = len(other_config["labels_boundaries"]) + 1
        self.classifier_boundaries = nn.Linear(config.hidden_size,
                                               self.num_labels_boundaries)
        if other_config["crf"]:
            self.use_crf = True
            crf_constraints = allowed_transitions(
                other_config["type_crf_constraints"],
                dict(map(reversed, config.label2id.items())))
            self.crf = ConditionalRandomField(config.num_labels,
                                              constraints=crf_constraints)
            crf_constraints = allowed_transitions(
                other_config["type_crf_constraints"],
                dict(map(reversed, other_config["labels_boundaries"].items())))
            self.crf_boundaries = ConditionalRandomField(
                self.num_labels_boundaries, constraints=crf_constraints)
        if other_config["predict_masked"]:
            self.predict_masked = True
            self.cls = BertOnlyMLMHead(config)
    self.init_weights()

def __init__(self, config):
    super().__init__(config)
    self.bert = MMBertModel(config)
    self.videomlp = VideoTokenMLP(config)
    # we do not use `BertGenerationOnlyLMHead`
    # because we can reuse pretraining.
    self.cls = BertOnlyMLMHead(config)
    self.hidden_size = config.hidden_size
    self.init_weights()

def __init__(self, config):
    super().__init__(config)
    self.bert = BertModel(config)
    self.cls = BertOnlyMLMHead(config)
    self.domain_cls = DomBertDomainHead(config)
    self.loss_fct = nn.CrossEntropyLoss()
    self.sim_fn = nn.CosineSimilarity(-1)
    self.eye = torch.eye(4680, device=0)
    self.init_weights()

def __init__(self, config: Config, *args, **kwargs):
    super().__init__(config, *args, **kwargs)
    # Head modules
    self.cls = BertOnlyMLMHead(self.config)
    self.vocab_size = self.config.vocab_size
    # Loss
    self.ce_loss = torch.nn.CrossEntropyLoss(ignore_index=self.config.ignore_index)

def __init__(self, config):
    super(UnilmForSeq2Seq, self).__init__(config)
    self.bert = UnilmModel(config)
    self.cls = BertOnlyMLMHead(config)
    self.crit_mask_lm = nn.CrossEntropyLoss(reduction='none')
    if hasattr(config, 'label_smoothing') and config.label_smoothing:
        self.crit_mask_lm_smoothed = LabelSmoothingLoss(
            config.label_smoothing, config.vocab_size,
            ignore_index=0, reduction='none')
    else:
        self.crit_mask_lm_smoothed = None
    self.init_weights()
    self.tie_weights()

def __init__(self, config):
    super(TestModel, self).__init__(config)
    self.bert = BertModel(config)
    self.input_size = config.hidden_size
    self.GRU_Layer = nn.GRU(input_size=self.input_size,
                            hidden_size=self.input_size // 2,
                            num_layers=2,
                            bias=True,
                            batch_first=True,
                            dropout=config.hidden_dropout_prob,
                            bidirectional=True)
    self.cls = BertOnlyMLMHead(config)

def __init__(self, config):
    super(SpanBertForPreTraining, self).__init__(config)
    self.bert = BertModel(config)
    # self.mlm = BertLMPredictionHead(config)
    self.cls = BertOnlyMLMHead(config)
    self.sbo = SpanBertSboHead(config)
    self.apply(self.init_weights)
    # tie the weights of input and output
    self.tie_weights()

def __init__(self, num_classes=1, freeze_bert=False):
    super().__init__()
    self.config = BertConfig()
    self.bert_layer = BertModel.from_pretrained('bert-base-uncased')
    self.mlm = BertOnlyMLMHead(self.config)
    self.cls = nn.Linear(768, num_classes)
    self._init_weights_bert(self.mlm)
    self._init_weights_bert(self.cls)
    # Freeze bert layers
    if freeze_bert:
        for p in self.bert_layer.parameters():
            p.requires_grad = False

def __init__(self, config):
    super().__init__(config)
    if config.is_decoder:
        logger.warning(
            "If you want to use `TrelmBertForMaskedLM` make sure `config.is_decoder=False` for "
            "bi-directional self-attention."
        )
    self.trelm_bert = TrelmBertModel(config)
    self.cls = BertOnlyMLMHead(config)
    self.init_weights()

def __init__(self, config):
    super(BertPreTrainedModel, self).__init__(config)
    self.bert = BertModel(config)
    self.cls = BertOnlyMLMHead(config)
    self.bilinear = nn.Bilinear(config.hidden_size, config.hidden_size, 1)
    self.loss_lambda = getattr(config, "loss_lambda", 1.)
    self.disable_rev_pos = getattr(config, "disable_rev_pos", False)
    self.padding_idx = 0  # 0 for bert models
    self.apply(self.init_weights)
    self.tie_weights()

def __init__(self, config, tokenizer):
    super().__init__(config)
    self.num_labels = config.num_labels
    self.mlm_probability = 0.15
    self.bert = BertModel(config)
    self.tokenizer = tokenizer
    self.cls = BertOnlyMLMHead(config)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.topic_cls = nn.Linear(config.hidden_size, config.num_labels)
    self.dense = nn.Linear(config.hidden_size, config.hidden_size)
    self.context_emb = nn.Parameter(
        torch.Tensor(config.hidden_size).normal_(
            mean=0.0, std=config.initializer_range))
    self.activation = nn.Tanh()
    self.init_weights()

def __init__(self, config, bert_model=None):
    super(BertForQuestionAnsweringWithMaskedLM, self).__init__(config)
    self.num_labels = config.num_labels
    # self.loss_beta = args.loss_beta
    self.bert = BertModel(config)
    # qa
    self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
    # mlm
    self.cls = BertOnlyMLMHead(config)
    # answer content
    self.answer_content_classifier = nn.Sequential(
        nn.Linear(config.hidden_size, config.hidden_size),
        nn.ReLU(),
        nn.Linear(config.hidden_size, 2))
    self.init_weights()

def __init__(self, config, model_size, task=None, n_classes=None):
    """
    The bare BERT Model transformer outputting raw hidden-states without any
    specific head on top.

    The model can behave as an encoder (with only self-attention) as well as a
    decoder, in which case a layer of cross-attention is added between the
    self-attention layers, following the architecture described in
    `Attention is all you need`_ by Ashish Vaswani, Noam Shazeer, Niki Parmar,
    Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser and
    Illia Polosukhin.

    This model is a PyTorch `torch.nn.Module
    <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for
    all matters related to general usage and behavior.

    Args:
        config (:class:`~transformers.BertConfig`): Model configuration class
            with all the parameters of the model. Initializing with a config
            file does not load the weights associated with the model, only the
            configuration. Check out the
            :meth:`~transformers.PreTrainedModel.from_pretrained` method to
            load the model weights.
        model_size: Size of the model.
        task: MTB task.
        n_classes: Number of classes.

    References:
        Attention is all you need (https://arxiv.org/abs/1706.03762)
    """
    super(BertModel, self).__init__(config)
    self.config = config
    self.task = task
    self.model_size = model_size
    self.embeddings = BertEmbeddings(config)
    self.encoder = BertEncoder(config)
    self.pooler = BertPooler(config)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.init_weights()
    logger.info("Model config: %s", self.config)
    if self.task is None:
        self.lm_head = BertOnlyMLMHead(config)
    elif self.task == "classification":
        self.n_classes = n_classes
        if self.model_size == "bert-base-uncased":
            self.classification_layer = nn.Linear(1536, n_classes)
        elif self.model_size == "bert-large-uncased":
            self.classification_layer = nn.Linear(2048, n_classes)

def __init__(self, config):
    super().__init__(config)
    self.num_labels = config.num_labels
    self.bert = BertModel(config)
    self.cls = BertOnlyMLMHead(config)
    self.init_weights()

    # These attributes should be assigned once the model is initialized
    self.model_args = None
    self.data_args = None
    self.label_word_list = None

    # For regression
    self.lb = None
    self.ub = None

    # For label search.
    self.return_full_softmax = None

def build_heads(self):
    """Initialize the classifier head. It takes the output of the transformer
    encoder and passes it through a pooler (we use the pooler from the BERT
    model), then dropout, BertPredictionHeadTransform (which is a linear
    layer, followed by activation and layer norm) and lastly a linear layer
    projecting the hidden output to the classification labels.
    """
    transformer_config = self.backend.get_config()
    if self.config.training_head_type == "classification":
        self.pooler = BertPooler(transformer_config)
        self.classifier = nn.Sequential(
            nn.Dropout(transformer_config.hidden_dropout_prob),
            BertPredictionHeadTransform(transformer_config),
            nn.Linear(transformer_config.hidden_size, self.config.num_labels),
        )
    elif self.config.training_head_type == "pretraining":
        self.cls = BertOnlyMLMHead(transformer_config)
        self.vocab_size = transformer_config.vocab_size

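A standalone sketch (hypothetical, with illustrative names) of the same head stack built above, assuming a recent Hugging Face transformers layout; it is not this model's actual forward pass:

import torch
from torch import nn
from transformers import BertConfig
from transformers.models.bert.modeling_bert import (
    BertOnlyMLMHead,
    BertPooler,
    BertPredictionHeadTransform,
)

config = BertConfig(num_labels=3)
pooler = BertPooler(config)
classifier = nn.Sequential(
    nn.Dropout(config.hidden_dropout_prob),
    BertPredictionHeadTransform(config),
    nn.Linear(config.hidden_size, config.num_labels),
)
mlm_head = BertOnlyMLMHead(config)

sequence_output = torch.randn(2, 16, config.hidden_size)  # stand-in for encoder output
class_logits = classifier(pooler(sequence_output))        # (2, num_labels)
mlm_logits = mlm_head(sequence_output)                    # (2, 16, vocab_size)
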
def __init__(self, config):
    super().__init__(config)
    self.tokenizer = BertTokenizerFast("../Bert/assets/vocab.txt")
    self.num_labels = config.num_labels
    self.bert = BertModel(config)
    self.cls = BertOnlyMLMHead(config)
    # projected_emb = tf.layers.dense(output_layer, params["projection_size"])
    # projected_emb = tf.keras.layers.LayerNormalization(axis=-1)(projected_emb)
    # if is_training:
    #     projected_emb = tf.nn.dropout(projected_emb, rate=0.1)
    self.dense = nn.Linear(config.hidden_size, 128)
    self.LayerNorm = nn.LayerNorm(128)
    self.projected_emb = nn.Dropout(0.1)

def __init__(self, config):
    super(VaeBertMatchModelClean, self).__init__(config)
    self.bert = BertModel(config)
    # the CVAE returns (latent_z, output); output is the reconstructed x: [batch, seq, 768]
    # latent_z = [batch, seq*hidden]
    self.input_size = config.hidden_size
    self.dropout = config.hidden_dropout_prob
    self.num_layers = args.num_layers
    self.decoder_type = args.decoder_type
    self.vae_module = VaeModel(input_size=self.input_size,
                               num_layers=self.num_layers,
                               dropout=self.dropout,
                               decoder_type=self.decoder_type)
    self.cls = BertOnlyMLMHead(config)
    # add an FFN
    # self.linear1 = nn.Linear(seq_len*hidden_size, seq_len*hidden_size*2)
    # self.linear2 = nn.Linear(seq_len*hidden_size*2, seq_len*hidden_size)
    self.linear3 = nn.Linear(self.input_size, 1)
    self.reconstruction_loss_func = nn.MSELoss()
    self.task_loss_func = nn.BCEWithLogitsLoss()

def __init__(self, config, args, tokenizer):
    super(DecoderWithLoss, self).__init__()
    # model components
    self.bert = BertModel(config)
    self.lm_head = BertOnlyMLMHead(config)
    self.span_b_proj = nn.ModuleList(
        [HighwayLayer(768) for _ in range(args.num_highway)])
    self.span_e_proj = nn.ModuleList(
        [HighwayLayer(768) for _ in range(args.num_highway)])
    # loss functions
    if args.node_label_smoothing > 0:
        self.lm_ce_loss = LabelSmoothingLoss(
            args.node_label_smoothing,
            config.vocab_size,
            ignore_index=tokenizer.pad_token_id)
    else:
        self.lm_ce_loss = torch.nn.CrossEntropyLoss(
            ignore_index=tokenizer.pad_token_id, reduction="none")
    self.span_ce_loss = torch.nn.CrossEntropyLoss(ignore_index=-1, reduction="none")
    self.span_loss_lb = args.lambda_span_loss

def __init__(self, name: str, labels, config, inputs=None):
    kwargs = {
        'name': name,
        'labels': labels,
        'loss': LanguageModelCrossEntropyLoss(),
        'per_sample_loss': ReducedPerSample(
            LanguageModelCrossEntropyLoss(reduction='none'), reduction=torch.mean),
        'available_func': all_correct,
        'inputs': inputs,
        'activation': None,
        'decoder': None,
        'module': BertOnlyMLMHead(config),
        'metrics': ()
    }
    super().__init__(**kwargs)

def __init__(self, config):
    super().__init__(config)
    self.cls = BertOnlyMLMHead(config)

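Common to all of the snippets above: BertOnlyMLMHead projects final hidden states to vocabulary logits (dense, activation, LayerNorm, then a decoder that tie_weights() usually shares with the input embeddings). A minimal self-contained sketch, assuming a recent Hugging Face transformers layout; the masking and loss code is illustrative and not taken from any of the repositories above:

import torch
from transformers import BertConfig, BertModel
from transformers.models.bert.modeling_bert import BertOnlyMLMHead

config = BertConfig()
bert = BertModel(config, add_pooling_layer=False)
mlm_head = BertOnlyMLMHead(config)

# roughly what PreTrainedModel.tie_weights() does: the MLM decoder
# shares its weight matrix with the input word embeddings
mlm_head.predictions.decoder.weight = bert.embeddings.word_embeddings.weight

input_ids = torch.randint(0, config.vocab_size, (2, 8))
hidden_states = bert(input_ids).last_hidden_state    # (2, 8, hidden_size)
logits = mlm_head(hidden_states)                     # (2, 8, vocab_size)

# typical MLM loss: CrossEntropyLoss with -100 marking positions that are not masked
labels = torch.full((2, 8), -100, dtype=torch.long)
labels[:, 3] = input_ids[:, 3]   # pretend position 3 was masked
loss = torch.nn.CrossEntropyLoss()(logits.view(-1, config.vocab_size), labels.view(-1))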