def from_torch(model: TorchRobertaModel, device: Optional[torch.device] = None):
    if device is not None and 'cuda' in device.type and torch.cuda.is_available():
        model.to(device)
    encoder = BertEncoder.from_torch(model.encoder)
    pooler = BertPooler.from_torch(model.pooler)
    return RobertaModel(model.embeddings, encoder, pooler, model.config)
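# Usage sketch (not from the original source): converting a HuggingFace
# RobertaModel into a TurboTransformers model with from_torch, mirroring the
# tests below. The checkpoint name 'roberta-base' is an assumption chosen for
# illustration.
import torch
import turbo_transformers
from transformers import RobertaModel as HFRobertaModel

hf_model = HFRobertaModel.from_pretrained('roberta-base')
hf_model.eval()
device = torch.device('cuda:0') if torch.cuda.is_available() \
    else torch.device('cpu:0')
turbo_model = turbo_transformers.RobertaModel.from_torch(hf_model, device)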
class TestRobertaModel(unittest.TestCase):
    def init_data(self, use_cuda) -> None:
        torch.set_grad_enabled(False)
        torch.set_num_threads(4)
        turbo_transformers.set_num_threads(4)

        self.test_device = torch.device('cuda:0') if use_cuda else \
            torch.device('cpu:0')
        self.cfg = RobertaConfig()
        self.torch_model = RobertaModel(self.cfg)
        self.torch_model.eval()
        if torch.cuda.is_available():
            self.torch_model.to(self.test_device)

        self.turbo_model = turbo_transformers.RobertaModel.from_torch(
            self.torch_model, self.test_device)

    def check_torch_and_turbo(self, use_cuda):
        self.init_data(use_cuda)
        num_iter = 20
        device_name = "GPU" if use_cuda else "CPU"
        input_ids = torch.randint(low=0,
                                  high=self.cfg.vocab_size - 1,
                                  size=(1, 10),
                                  dtype=torch.long,
                                  device=self.test_device)

        torch_model = lambda: self.torch_model(input_ids)
        torch_result, torch_qps, torch_time = \
            test_helper.run_model(torch_model, use_cuda, num_iter)
        print(f'RobertaModel PyTorch({device_name}) QPS {torch_qps}')

        turbo_model = lambda: self.turbo_model(input_ids)
        with turbo_transformers.pref_guard("roberta_perf") as perf:
            turbo_result, turbo_qps, turbo_time = \
                test_helper.run_model(turbo_model, use_cuda, num_iter)
        print(f'RobertaModel TurboTransformers({device_name}) QPS {turbo_qps}')

        torch_result_final = torch_result[0].cpu().numpy()
        turbo_result_final = turbo_result[0].cpu().numpy()
        self.assertTrue(
            numpy.allclose(torch_result_final,
                           turbo_result_final,
                           atol=1e-3,
                           rtol=1e-3))

    def test_Roberta_model(self):
        if torch.cuda.is_available() and \
                turbo_transformers.config.is_compiled_with_cuda():
            self.check_torch_and_turbo(use_cuda=True)
        self.check_torch_and_turbo(use_cuda=False)
def __init__(self, config, weight=None):
    super(RobertaForSequenceClassification, self).__init__(config)
    self.num_labels = config.num_labels
    self.roberta = RobertaModel(config)
    self.classifier = RobertaClassificationHead(config)
    self.weight = weight
def test_from_pytorch(self):
    with torch.no_grad():
        with self.subTest("roberta-base"):
            tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
            fx_model = FlaxRobertaModel.from_pretrained("roberta-base")
            pt_model = RobertaModel.from_pretrained("roberta-base")

            # Check for simple input
            pt_inputs = tokenizer.encode_plus(
                "This is a simple input", return_tensors=TensorType.PYTORCH)
            fx_inputs = tokenizer.encode_plus(
                "This is a simple input", return_tensors=TensorType.JAX)
            pt_outputs = pt_model(**pt_inputs)
            fx_outputs = fx_model(**fx_inputs)

            self.assertEqual(
                len(fx_outputs), len(pt_outputs),
                "Output lengths differ between Flax and PyTorch")
            for fx_output, pt_output in zip(fx_outputs,
                                            pt_outputs.to_tuple()):
                self.assert_almost_equals(fx_output, pt_output.numpy(), 5e-3)
def __init__(self, tagset_size):
    super(RobertaForSequenceClassification, self).__init__()
    self.tagset_size = tagset_size
    self.roberta_single = RobertaModel.from_pretrained(pretrain_model_dir)
    self.single_hidden2tag = RobertaClassificationHead(
        bert_hidden_dim, tagset_size)
def __init__(self, config, finetune, list_labels=[], use_bilstms=False):
    super(MTLRobertaForTokenClassification, self).__init__(config)
    self.num_labels = list_labels
    self.num_tasks = len(self.num_labels)
    self.roberta = RobertaModel(config)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.use_bilstms = use_bilstms
    self.lstm_size = 400
    self.lstm_layers = 2
    self.bidirectional_lstm = True

    if self.use_bilstms:
        self.lstm = nn.LSTM(config.hidden_size,
                            self.lstm_size,
                            num_layers=self.lstm_layers,
                            batch_first=True,
                            bidirectional=self.bidirectional_lstm)
        self.hidden2tagList = nn.ModuleList([
            nn.Linear(self.lstm_size * (2 if self.bidirectional_lstm else 1),
                      self.num_labels[idtask])
            for idtask in range(self.num_tasks)
        ])
    else:
        self.hidden2tagList = nn.ModuleList([
            nn.Linear(config.hidden_size, self.num_labels[idtask])
            for idtask in range(self.num_tasks)
        ])
    self.finetune = finetune
    self.init_weights()
def __init__(self, config, *model_args, **model_kargs):
    super().__init__(config)
    self.model_args = model_kargs["model_args"]
    self.roberta = RobertaModel(config)

    if self.model_args.do_mlm:
        self.lm_head = RobertaLMHead(config)

    cl_init(self, config)
def __init__(self, config, dropout=0.1):
    super().__init__(config)
    self.num_labels = config.num_labels
    self.config = config
    self.roberta = RobertaModel(config)
    self.dropout = nn.Dropout(dropout)
    self.classifier = nn.Linear(config.hidden_size, config.num_labels)
    self.sub_num = [1]
    self.init_weights()
def __init__(self, config):
    super().__init__(config)
    self.num_labels = config.num_labels
    self.roberta = RobertaModel(config, add_pooling_layer=False)
    # NOTE: self.num_size is not set in this __init__; it must be defined
    # elsewhere (e.g., as a class attribute) before this line runs.
    config_copy = deepcopy(config)
    setattr(config_copy, "new_hidden_size",
            config.hidden_size + self.num_size)
    self.classifier = RobertaClassificationHead(config_copy)
    self.init_weights()
def __init__(self, config):
    super(PhoBertQueryNER, self).__init__(config)
    self.roberta = RobertaModel(config)

    self.start_outputs = nn.Linear(config.hidden_size, 1)
    self.end_outputs = nn.Linear(config.hidden_size, 1)
    self.span_embedding = MultiNonLinearClassifier(config.hidden_size * 2, 1,
                                                   config.mrc_dropout)
    self.hidden_size = config.hidden_size
    self.init_weights()
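# Hedged sketch (the matching forward() is not shown above, so the wiring
# here is an assumption based on common MRC-style NER heads): the span head
# scores every (start, end) token pair by concatenating their hidden states,
# which is why span_embedding takes hidden_size * 2 inputs.
import torch

def span_logits_sketch(sequence_output, start_outputs, end_outputs,
                       span_embedding):
    # sequence_output: [batch, seq_len, hidden] from the RoBERTa encoder
    batch, seq_len, hidden = sequence_output.shape
    start_logits = start_outputs(sequence_output).squeeze(-1)  # [batch, seq_len]
    end_logits = end_outputs(sequence_output).squeeze(-1)      # [batch, seq_len]
    # Pair every start position with every end position.
    start_ext = sequence_output.unsqueeze(2).expand(-1, -1, seq_len, -1)
    end_ext = sequence_output.unsqueeze(1).expand(-1, seq_len, -1, -1)
    span_matrix = torch.cat([start_ext, end_ext],
                            dim=-1)  # [batch, seq, seq, hidden * 2]
    span_logits = span_embedding(span_matrix).squeeze(-1)  # [batch, seq, seq]
    return start_logits, end_logits, span_logits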
def __init__(self, config):
    super().__init__(config)
    self.num_labels = config.num_labels
    self.roberta = RobertaModel(config, add_pooling_layer=False)
    lstm_layer = 1
    # NOTE: nn.LSTM only applies dropout between stacked layers, so with
    # num_layers=1 this dropout argument has no effect (PyTorch warns here).
    self.lstm = nn.LSTM(input_size=config.hidden_size,
                        hidden_size=config.hidden_size,
                        num_layers=lstm_layer,
                        dropout=config.hidden_dropout_prob,
                        bidirectional=True)
    self.classifier = nn.Linear(config.hidden_size * 2, config.num_labels)
    self.init_weights()
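# Hedged sketch (the matching forward() is not shown above; shapes and the
# helper name are assumptions): the BiLSTM runs over the RoBERTa sequence
# output, and because it is bidirectional its output is hidden_size * 2 wide,
# which matches the classifier's input size.
import torch

def bilstm_head_sketch(roberta, lstm, classifier, input_ids, attention_mask):
    sequence_output = roberta(input_ids, attention_mask=attention_mask)[0]
    # The LSTM above is not constructed with batch_first=True, so the
    # sequence dimension must come first.
    lstm_out, _ = lstm(sequence_output.transpose(0, 1))
    logits = classifier(lstm_out.transpose(0, 1))  # [batch, seq_len, num_labels]
    return logits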
def __init__(self, config, args, intent_label_lst, slot_label_lst):
    super(JointRoberta, self).__init__(config)
    self.args = args
    self.num_intent_labels = len(intent_label_lst)
    self.num_slot_labels = len(slot_label_lst)
    self.roberta = RobertaModel(config=config)  # pretrained RoBERTa backbone

    self.intent_classifier = IntentClassifier(config.hidden_size,
                                              self.num_intent_labels,
                                              args.dropout_rate)
    self.slot_classifier = SlotClassifier(
        config.hidden_size, self.num_intent_labels, self.num_slot_labels,
        self.args.use_intent_context_concat,
        self.args.use_intent_context_attention, self.args.max_seq_len,
        self.args.intent_embedding_size, self.args.attention_embedding_size,
        self.args.attention_type, args.dropout_rate)

    if args.use_crf:
        self.crf = CRF(num_tags=self.num_slot_labels, batch_first=True)
def test(use_cuda):
    torch.set_grad_enabled(False)
    torch.set_num_threads(4)
    turbo_transformers.set_num_threads(4)

    test_device = torch.device('cuda:0') if use_cuda else \
        torch.device('cpu:0')
    cfg = RobertaConfig()
    torch_model = RobertaModel(cfg)
    torch_model.eval()
    if torch.cuda.is_available():
        torch_model.to(test_device)

    turbo_model = turbo_transformers.RobertaModel.from_torch(
        torch_model, test_device)

    input_ids = torch.randint(low=0,
                              high=cfg.vocab_size - 1,
                              size=(1, 10),
                              dtype=torch.long,
                              device=test_device)

    torch_result = torch_model(input_ids)
    torch_result_final = torch_result[0].cpu().numpy()

    turbo_result = turbo_model(input_ids)
    turbo_result_final = turbo_result[0].cpu().numpy()

    # Show the element-wise differences between the two implementations.
    print(torch_result_final - turbo_result_final)
    assert numpy.allclose(torch_result_final,
                          turbo_result_final,
                          atol=1e-3,
                          rtol=1e-3)
class CnlpRobertaForClassification(RobertaPreTrainedModel):
    config_class = RobertaConfig
    base_model_prefix = "roberta"

    def __init__(self,
                 config,
                 num_labels_list,
                 layer=-1,
                 freeze=False,
                 tokens=False,
                 tagger=False,
                 relations=False,
                 num_attention_heads=12,
                 class_weights=None):
        super().__init__(config)
        self.num_labels = num_labels_list

        self.roberta = RobertaModel(config)
        if freeze:
            for param in self.roberta.parameters():
                param.requires_grad = False

        self.feature_extractors = nn.ModuleList()
        self.classifiers = nn.ModuleList()
        for task_ind, task_num_labels in enumerate(num_labels_list):
            self.feature_extractors.append(
                RepresentationProjectionLayer(
                    config,
                    layer=layer,
                    tokens=tokens,
                    tagger=tagger[task_ind],
                    relations=relations[task_ind],
                    num_attention_heads=num_attention_heads))
            if relations[task_ind]:
                self.classifiers.append(
                    ClassificationHead(config,
                                       task_num_labels,
                                       hidden_size=num_attention_heads))
            else:
                self.classifiers.append(
                    ClassificationHead(config, task_num_labels))

        # Are we operating as a sequence classifier (1 label per input
        # sequence) or a tagger (1 label per input token in the sequence)?
        self.tagger = tagger
        self.relations = relations

        if class_weights is None:
            self.class_weights = [None] * len(self.classifiers)
        else:
            self.class_weights = class_weights

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        event_tokens=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for computing the sequence classification/regression loss.
            Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
            If :obj:`config.num_labels == 1` a regression loss is computed
            (Mean-Square loss); if :obj:`config.num_labels > 1` a
            classification loss is computed (Cross-Entropy).
""" outputs = self.roberta(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=True, return_dict=True) batch_size, seq_len = input_ids.shape logits = [] loss = None task_label_ind = 0 for task_ind, task_num_labels in enumerate(self.num_labels): features = self.feature_extractors[task_ind](outputs.hidden_states, event_tokens) task_logits = self.classifiers[task_ind](features) logits.append(task_logits) if labels is not None: if task_num_labels == 1: # We are doing regression loss_fct = MSELoss() task_loss = loss_fct(task_logits.view(-1), labels.view(-1)) else: if not self.class_weights[task_ind] is None: class_weights = torch.FloatTensor( self.class_weights[task_ind]).to(self.device) else: class_weights = None loss_fct = CrossEntropyLoss(weight=class_weights) if self.relations[task_ind]: task_labels = labels[:, 0, task_label_ind:task_label_ind + seq_len, :] task_label_ind += seq_len task_loss = loss_fct( task_logits.permute(0, 3, 1, 2), task_labels.type(torch.LongTensor).to( labels.device)) else: task_labels = labels[:, 0, task_label_ind, :] task_label_ind += 1 task_loss = loss_fct( task_logits.view(-1, task_num_labels), task_labels.reshape([ batch_size * seq_len, ]).type(torch.LongTensor).to(labels.device)) if loss is None: loss = task_loss else: loss += task_loss # if not return_dict: # output = (logits,) + outputs[2:] # return ((loss,) + output) if loss is not None else output if self.training: return SequenceClassifierOutput( loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) else: return SequenceClassifierOutput(loss=loss, logits=logits)
import torch
from transformers.models.roberta.modeling_roberta import RobertaModel

from lambert.model import LambertModel

BATCH_SIZE = 4
SEQUENCE_LENGTH = 32

roberta = RobertaModel.from_pretrained('roberta-base')
lambert = LambertModel(roberta)

input_ids = torch.randint(0, 100, (BATCH_SIZE, SEQUENCE_LENGTH))
bboxes = torch.rand((BATCH_SIZE, SEQUENCE_LENGTH, 4))

lambert_output = lambert(input_ids=input_ids, bboxes=bboxes)
lambert_encoding = lambert_output.last_hidden_state

assert lambert_encoding.shape == (BATCH_SIZE, SEQUENCE_LENGTH,
                                  roberta.config.hidden_size)
assert lambert_encoding.dtype == torch.float