def from_torch(model: TorchBertModel, device: Optional[torch.device] = None):
    if device is not None and 'cuda' in device.type and torch.cuda.is_available():
        model.to(device)
    embeddings = BertEmbeddings.from_torch(model.embeddings)
    encoder = BertEncoder.from_torch(model.encoder)
    return BertModelNoPooler(embeddings, encoder)

def from_torch(
        model: TorchBertModel,
        device: Optional[torch.device] = None  # from_torch implementation
):
    if device is not None and "cuda" in device.type and torch.cuda.is_available():
        model.to(device)
    bertmodel = turbo_transformers.BertModel.from_torch(model.bert)
    # The code above can be copied as-is.
    # Notice: classifier is a class member of BertForSequenceClassification.
    # If users define other class members, they need to adapt this line accordingly.
    return BertForSequenceClassification(bertmodel, model.classifier)

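# Usage sketch for the wrapper above (a minimal sketch, not part of the original
# module). Assumptions: the function is exposed as a static method named
# from_torch on the turbo-side BertForSequenceClassification class,
# turbo_transformers is installed, a fine-tuned Hugging Face checkpoint exists
# at ./finetuned_bert_cls (hypothetical path), and the accelerated model accepts
# input_ids the same way the torch model does.
import torch
from transformers import BertTokenizer
from transformers import BertForSequenceClassification as TorchBertForSequenceClassification

tokenizer = BertTokenizer.from_pretrained("./finetuned_bert_cls")
torch_model = TorchBertForSequenceClassification.from_pretrained("./finetuned_bert_cls")
torch_model.eval()

device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
turbo_cls = BertForSequenceClassification.from_torch(torch_model, device)

with torch.no_grad():
    inputs = tokenizer("an example sentence", return_tensors="pt").to(device)
    logits = turbo_cls(inputs["input_ids"])
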
def __init__(self, config):
    super().__init__(config)
    self.num_labels = config.num_labels
    self.config = config
    self.bert = BertModel(config)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.classifier = nn.Linear(config.hidden_size, config.num_labels)
    self.pos_weight = torch.ones(self.config.num_labels, requires_grad=False)
    self.init_weights()

def __init__(self, tagset_size):
    super(BertForSequenceClassification, self).__init__()
    self.tagset_size = tagset_size
    self.BertModel_single = BertModel.from_pretrained(pretrain_model_dir)
    self.single_hidden2tag = BertClassificationHead(bert_hidden_dim, tagset_size)

def init_data(self, use_cuda) -> None:
    torch.set_grad_enabled(False)
    torch.set_num_threads(4)
    turbo_transformers.set_num_threads(4)

    self.test_device = torch.device('cuda:0') if use_cuda else \
        torch.device('cpu:0')

    self.cfg = BertConfig()
    self.torch_model = BertModel(self.cfg)
    self.torch_model.eval()

    if torch.cuda.is_available():
        self.torch_model.to(self.test_device)

    self.turbo_model = turbo_transformers.BertModel.from_torch(
        self.torch_model, self.test_device, "turbo")

def from_pretrained(model_id_or_path: str,
                    device: Optional[torch.device] = None):
    torch_model = TorchBertModel.from_pretrained(model_id_or_path)
    model = BertModelNoPooler.from_torch(torch_model, device)
    model.config = torch_model.config
    model._torch_model = torch_model  # keep a reference so the torch model is not destroyed
    return model

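# A minimal call sketch for the loader above, assuming it is exposed as a static
# method on BertModelNoPooler and that turbo_transformers is importable;
# "bert-base-uncased" is the standard Hugging Face model id.
import torch

device = torch.device("cuda:0") if torch.cuda.is_available() else None
no_pooler = BertModelNoPooler.from_pretrained("bert-base-uncased", device)
print(no_pooler.config.hidden_size)  # the config is copied over from the torch model
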
def __init__(self,
             args,
             tokenizer: BertTokenizer,
             object_features_variant=False,
             positional_embed_variant=False,
             latent_transformer=False):
    super().__init__()
    self.args = args
    self.tokenizer = tokenizer

    self.image_projection = nn.Sequential(
        nn.Linear(512, 768), nn.BatchNorm1d(768, momentum=0.01))

    config = BertConfig.from_pretrained('bert-base-uncased')
    self.embeddings = BertEmbeddings(config)

    self.text_encoder = BertModel.from_pretrained("bert-base-uncased",
                                                  return_dict=True)
    self.decoder = BertLMHeadModel.from_pretrained(
        'bert-base-uncased',
        is_decoder=True,
        use_cache=True,
        add_cross_attention=True)

    if object_features_variant:
        self.image_transformer = ImageTransformerEncoder(args)
    self.positional_embed = True if positional_embed_variant else False
    self.latent_transformer = latent_transformer

def build_model(self):
    '''
    Builds BERT NER model layers

    Arguments:
        None
    Returns:
        None
    '''
    # set seeds if a seed was provided
    if self.seed:
        torch.manual_seed(self.seed)
        torch.cuda.manual_seed(self.seed)
        np.random.seed(self.seed)
    # initialize BERT model from file with the stored configuration
    self.bert = BertModel.from_pretrained(self.model_file, config=self.config)
    # dropout layer for bert output
    self.dropout = nn.Dropout(self.config.hidden_dropout_prob)
    # dense classification layer
    self.classifier = nn.Linear(self.config.hidden_size, len(self.classes))
    # CRF output layer
    self.crf = CRF(classes=self.classes, scheme=self.scheme, batch_first=True)
    # initialize CRF with seed
    self.crf.initialize(self.seed)

def __init__(
    self,
    vocab: Vocabulary,
    bert_model: Union[str, BertModel],
    embedding_dropout: float = 0.0,
    initializer: InitializerApplicator = InitializerApplicator(),
    label_smoothing: float = None,
    ignore_span_metric: bool = False,
    srl_eval_path: str = DEFAULT_SRL_EVAL_PATH,
    **kwargs,
) -> None:
    super().__init__(vocab, **kwargs)

    if isinstance(bert_model, str):
        self.bert_model = BertModel.from_pretrained(bert_model)
    else:
        self.bert_model = bert_model

    self.num_classes = self.vocab.get_vocab_size("labels")
    if srl_eval_path is not None:
        # For the span based evaluation, we don't want to consider labels
        # for verb, because the verb index is provided to the model.
        self.span_metric = SrlEvalScorer(srl_eval_path, ignore_classes=["V"])
    else:
        self.span_metric = None
    self.tag_projection_layer = Linear(self.bert_model.config.hidden_size,
                                       self.num_classes)

    self.embedding_dropout = Dropout(p=embedding_dropout)
    self._label_smoothing = label_smoothing
    self.ignore_span_metric = ignore_span_metric
    initializer(self)

def __init__(self, config):
    super(Bert_CRF, self).__init__(config)
    self.bert = BertModel(config)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.classifier = nn.Linear(config.hidden_size, config.num_labels)
    self.crf = CRF(num_tags=config.num_labels, batch_first=True)
    self.init_weights()

def __init__(self, config, *model_args, **model_kargs):
    super().__init__(config)
    self.model_args = model_kargs["model_args"]
    self.bert = BertModel(config)

    if self.model_args.do_mlm:
        self.lm_head = BertLMPredictionHead(config)

    cl_init(self, config)

def __init__(self, config):
    super(BertForMultiLable, self).__init__(config)
    self.bert = BertModel(config)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.classifier = nn.Linear(config.hidden_size, config.num_labels)
    self.init_weights()
    self.decoder = seq_model.rnn_decoder(config)
    self.criterion = nn.CrossEntropyLoss(ignore_index=utils.PAD,
                                         reduction='none')
    self.softmax = nn.Softmax(dim=2)

def __init__(self, config, weight=None):
    super(BertForSequenceClassification, self).__init__(config)
    self.num_labels = config.num_labels
    self.bert = BertModel(config)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)
    self.weight = weight
    self.init_weights()

def __init__(self, config, head2size):
    super(MultiHeadModel, self).__init__(config, head2size)
    config.num_labels = 1
    self.bert = BertModel(config)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    module_dict = {}
    for head_name, num_labels in head2size.items():
        module_dict[head_name] = nn.Linear(config.hidden_size, num_labels)
    self.heads = nn.ModuleDict(module_dict)
    self.init_weights()

def __init__(
    self,
    vocab: Vocabulary,
    bert_model: Union[str, Dict[str, Any], BertModel],
    embedding_dropout: float = 0.0,
    initializer: InitializerApplicator = InitializerApplicator(),
    label_smoothing: float = None,
    ignore_span_metric: bool = False,
    srl_eval_path: str = DEFAULT_SRL_EVAL_PATH,
    **kwargs,
) -> None:
    super().__init__(vocab, **kwargs)

    if isinstance(bert_model, str):
        self.bert_model = BertModel.from_pretrained(bert_model)
    elif isinstance(bert_model, dict):
        warnings.warn(
            "Initializing BertModel without pretrained weights. This is fine if you're loading "
            "from an AllenNLP archive, but not if you're training.",
            UserWarning,
        )
        bert_config = BertConfig.from_dict(bert_model)
        self.bert_model = BertModel(bert_config)
    else:
        self.bert_model = bert_model

    self.num_classes = self.vocab.get_vocab_size("labels")
    if srl_eval_path is not None:
        # For the span based evaluation, we don't want to consider labels
        # for verb, because the verb index is provided to the model.
        self.span_metric = SrlEvalScorer(srl_eval_path, ignore_classes=["V"])
    else:
        self.span_metric = None
    self.tag_projection_layer = Linear(self.bert_model.config.hidden_size,
                                       self.num_classes)

    self.embedding_dropout = Dropout(p=embedding_dropout)
    self._label_smoothing = label_smoothing
    self.ignore_span_metric = ignore_span_metric
    initializer(self)

def main():
    if len(sys.argv) != 3:
        print(
            "Usage: \n"
            "    convert_huggingface_bert_to_npz model_name (bert-base-uncased) output_file"
        )
        exit(0)
    torch.set_grad_enabled(False)

    model_name = sys.argv[1]
    model = BertModel.from_pretrained(model_name)
    arrays = {k: v.detach() for k, v in model.named_parameters()}

    q_weight_key = 'self.query.weight'
    k_weight_key = 'self.key.weight'
    v_weight_key = 'self.value.weight'

    q_bias_key = 'self.query.bias'
    k_bias_key = 'self.key.bias'
    v_bias_key = 'self.value.bias'

    numpy_dict = {}
    for k in arrays.keys():
        if k.endswith(q_weight_key):
            v = torch.clone(
                torch.t(
                    torch.cat([
                        arrays[k],
                        arrays[k[:-len(q_weight_key)] + k_weight_key],
                        arrays[k[:-len(q_weight_key)] + v_weight_key]
                    ], 0).contiguous()).contiguous())
            numpy_dict[k[:-len(q_weight_key)] + "qkv.weight"] = v.numpy()
        elif k.endswith(q_bias_key):
            v = torch.cat([
                arrays[k], arrays[k[:-len(q_bias_key)] + k_bias_key],
                arrays[k[:-len(q_bias_key)] + v_bias_key]
            ], 0).numpy()
            numpy_dict[k[:-len(q_bias_key)] + 'qkv.bias'] = v
        elif any(
                k.endswith(suffix)
                for suffix in (k_weight_key, v_weight_key, k_bias_key, v_bias_key)):
            continue
        elif (k.endswith("attention.output.dense.weight")
              or k.endswith("pooler.dense.weight")
              or k.endswith("output.dense.weight")
              or k.endswith("intermediate.dense.weight")):
            numpy_dict[k] = torch.clone(torch.t(arrays[k]).contiguous()).numpy()
        else:
            numpy_dict[k] = arrays[k].numpy()
    del arrays
    del model
    numpy.savez_compressed(sys.argv[2], **numpy_dict)

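# A short sketch of inspecting the archive produced by the script above,
# assuming it was invoked as:
#     python convert_huggingface_bert_to_npz.py bert-base-uncased bert.npz
# (the output path bert.npz is hypothetical).
import numpy as np

weights = np.load("bert.npz")
print(len(weights.files), "tensors saved")
# the per-layer query/key/value parameters are fused under new "qkv" names
print(sorted(k for k in weights.files if k.endswith("qkv.weight"))[:2])
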
def __init__(self,
             bert_model_name,
             trainable=False,
             output_size=0,
             activation=gelu,
             dropout=0.0):
    """Initializes pretrained `BERT` model

    Arguments:
        bert_model_name {str} -- bert model name

    Keyword Arguments:
        output_size {float} -- output size (default: {None})
        activation {nn.Module} -- activation function (default: {gelu})
        dropout {float} -- dropout rate (default: {0.0})
    """

    super().__init__()
    self.bert_model = BertModel.from_pretrained(bert_model_name,
                                                output_attentions=True,
                                                output_hidden_states=True)
    logger.info("Load bert model {} successfully.".format(bert_model_name))

    self.output_size = output_size

    if trainable:
        logger.info(
            "Start fine-tuning bert model {}.".format(bert_model_name))
    else:
        logger.info("Keep fixed bert model {}.".format(bert_model_name))

    for param in self.bert_model.parameters():
        param.requires_grad = trainable

    if self.output_size > 0:
        self.mlp = BertLinear(
            input_size=self.bert_model.config.hidden_size,
            output_size=self.output_size,
            activation=activation)
    else:
        self.output_size = self.bert_model.config.hidden_size
        self.mlp = lambda x: x

    if dropout > 0:
        self.dropout = nn.Dropout(p=dropout)
    else:
        self.dropout = lambda x: x

def __init__(self, config):
    super(Bert_BiLSTM_CRF, self).__init__(config)
    self.bert = BertModel(config)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.linear = nn.Linear(config.hidden_size, config.num_labels)
    self.bilstm = nn.LSTM(
        config.hidden_size,
        config.hidden_size // 2,
        dropout=config.hidden_dropout_prob,
        batch_first=True,
        bidirectional=True,
        num_layers=2,
    )
    self.crf = CRF(num_tags=config.num_labels, batch_first=True)
    self.init_weights()

def setup_method(self):
    self.monkeypatch = MonkeyPatch()
    # monkeypatch the PretrainedBertModel to return the tiny test fixture model
    config_path = FIXTURES_ROOT / "structured_prediction" / "srl" / "bert" / "config.json"
    vocab_path = FIXTURES_ROOT / "structured_prediction" / "srl" / "bert" / "vocab.txt"
    config = BertConfig.from_json_file(config_path)
    self.monkeypatch.setattr(BertModel, "from_pretrained", lambda _: BertModel(config))
    self.monkeypatch.setattr(
        BertTokenizer, "from_pretrained", lambda _: BertTokenizer(vocab_path)
    )

    super().setup_method()
    self.set_up_model(
        FIXTURES_ROOT / "structured_prediction" / "srl" / "bert_srl.jsonnet",
        FIXTURES_ROOT / "structured_prediction" / "srl" / "conll_2012",
    )

def init_bert_models(self, use_cuda: bool) -> None:
    self.test_device = torch.device('cuda:0') if use_cuda else \
        torch.device('cpu:0')
    if not use_cuda:
        torch.set_num_threads(1)

    torch.set_grad_enabled(False)
    self.cfg = BertConfig(attention_probs_dropout_prob=0.0,
                          hidden_dropout_prob=0.0)

    self.torch_model = BertModel(self.cfg)
    self.torch_model.eval()
    if use_cuda:
        self.torch_model.to(self.test_device)

    self.hidden_size = self.cfg.hidden_size

    self.turbo_model = turbo_transformers.BertModelSmartBatch.from_torch(
        self.torch_model)

def __init__(
    self,
    vocab: Vocabulary,
    embedding_dim: int,
    feedforward_dim: int,
    num_layers: int,
    num_attention_heads: int,
    position_embedding_dim: int,
    tokenizer_path: str,
    position_embedding_type: str = "absolute",
    activation: str = "gelu",
    hidden_dropout: float = 0.1,
) -> None:
    super().__init__()
    # TODO:
    # - Need to apply corrections in pretrained_transformer_mismatched_embedder
    tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
    vocab.add_transformer_vocab(tokenizer, "tokens")
    # "tokens" is padded by default--undo that
    del vocab._token_to_index["tokens"]["@@PADDING@@"]
    del vocab._token_to_index["tokens"]["@@UNKNOWN@@"]
    assert len(vocab._token_to_index["tokens"]) == len(vocab._index_to_token["tokens"])

    cfg = BertConfig(
        vocab_size=vocab.get_vocab_size("tokens"),
        hidden_size=embedding_dim,
        num_hidden_layers=num_layers,
        num_attention_heads=num_attention_heads,
        intermediate_size=feedforward_dim,
        hidden_act=activation,
        hidden_dropout_prob=hidden_dropout,
        max_position_embeddings=position_embedding_dim,
        position_embedding_type=position_embedding_type,
        use_cache=True,
    )
    self.cfg = cfg
    self._vocab = vocab
    self._namespace = "tokens"
    self.bert = BertModel(cfg)
    self.masking_collator = DataCollatorForWholeWordMask(
        tokenizer=tokenizer, mlm=True, mlm_probability=0.15
    )

def __init__(self, config, args, intent_label_lst, slot_label_lst):
    super(JointMBERT, self).__init__(config)
    self.args = args
    self.num_intent_labels = len(intent_label_lst)
    self.num_slot_labels = len(slot_label_lst)
    self.bert = BertModel(config=config)  # Load pretrained bert

    self.intent_classifier = IntentClassifier(config.hidden_size,
                                              self.num_intent_labels,
                                              args.dropout_rate)
    self.slot_classifier = SlotClassifier(
        config.hidden_size, self.num_intent_labels, self.num_slot_labels,
        self.args.use_intent_context_concat,
        self.args.use_intent_context_attention, self.args.max_seq_len,
        self.args.intent_embedding_size, self.args.attention_embedding_size,
        self.args.attention_type, args.dropout_rate)

    if args.use_crf:
        self.crf = CRF(num_tags=self.num_slot_labels, batch_first=True)

def __init__(self, config):
    """
    Init function that initializes the model.

    Inputs:
        config - Configuration of the model
    """
    super().__init__(config)

    self.num_labels = config.num_labels
    self.num_labels_list = [config.num_labels]
    self.hidden_size = config.hidden_size

    self.bert = BertModel(config)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)

    # create a list of classifiers
    self.classifier = nn.Linear(config.hidden_size, self.num_labels)
    self.classifiers = [self.classifier]

    self.init_weights()

def __init__(self,
             name_or_path_or_model: Union[str, BertModel],
             adapter_size: int = 64,
             adapter_num: int = 12,
             external_param: Union[bool, List[bool]] = False,
             **kwargs):
    super().__init__()
    if isinstance(name_or_path_or_model, str):
        self.bert = BertModel.from_pretrained(name_or_path_or_model)
    else:
        self.bert = name_or_path_or_model

    set_requires_grad(self.bert, False)

    if isinstance(external_param, bool):
        param_place = [external_param for _ in range(adapter_num)]
    elif isinstance(external_param, list):
        param_place = [False for _ in range(adapter_num)]
        for i, e in enumerate(external_param, 1):
            param_place[-i] = e
    else:
        raise ValueError("wrong type of external_param!")

    self.adapters = nn.ModuleList([
        nn.ModuleList([
            Adapter(self.bert.config.hidden_size, adapter_size, param_place[i]),
            Adapter(self.bert.config.hidden_size, adapter_size, param_place[i])
        ]) for i in range(adapter_num)
    ])

    for i, adapters in enumerate(self.adapters, 1):
        layer = self.bert.encoder.layer[-i]
        layer.output = AdapterBertOutput(layer.output, adapters[0].forward)
        set_requires_grad(layer.output.base.LayerNorm, True)
        layer.attention.output = AdapterBertOutput(layer.attention.output,
                                                   adapters[1].forward)
        set_requires_grad(layer.attention.output.base.LayerNorm, True)

    self.output_dim = self.bert.config.hidden_size

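# Construction sketch for the adapter wrapper above; AdapterBertModel is a
# hypothetical name for the class that owns this __init__, and a standard
# 12-layer bert-base checkpoint is assumed. Only the adapters and the touched
# LayerNorms should remain trainable, which the parameter count below makes visible.
import torch

encoder = AdapterBertModel("bert-base-uncased", adapter_size=64, adapter_num=12)
trainable = sum(p.numel() for p in encoder.parameters() if p.requires_grad)
total = sum(p.numel() for p in encoder.parameters())
print(f"trainable parameters: {trainable} / {total} ({trainable / total:.1%})")
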
def test_from_pytorch(self):
    with torch.no_grad():
        with self.subTest("bert-base-cased"):
            tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")
            fx_model = FlaxBertModel.from_pretrained("bert-base-cased")
            pt_model = BertModel.from_pretrained("bert-base-cased")

            # Check for simple input
            pt_inputs = tokenizer.encode_plus(
                "This is a simple input", return_tensors=TensorType.PYTORCH)
            fx_inputs = tokenizer.encode_plus(
                "This is a simple input", return_tensors=TensorType.JAX)
            pt_outputs = pt_model(**pt_inputs).to_tuple()
            fx_outputs = fx_model(**fx_inputs)

            self.assertEqual(
                len(fx_outputs), len(pt_outputs),
                "Output lengths differ between Flax and PyTorch")
            for fx_output, pt_output in zip(fx_outputs, pt_outputs):
                self.assert_almost_equals(fx_output, pt_output.numpy(), 5e-3)

def test_fused_softmax():
    from megatron.model.fused_softmax import FusedScaleMaskSoftmax, SoftmaxFusionTypes
    from megatron.model.gpt2_model import (
        gpt2_attention_mask_func as attention_mask_func,
    )

    bert = BertModel.from_pretrained("bert-base-cased").cuda().half()
    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
    test_text = (
        "Hello. How are you? I am fine thank you and you? yes Good. "
        "hi hi hi hi hi hi hi hi hi hi hi hi hi"  # 32
    )

    tokens = tokenizer(
        [test_text] * 4,
        return_tensors="pt",
    )

    embedding_output = bert.embeddings(
        input_ids=tokens["input_ids"].cuda(),
        position_ids=None,
        token_type_ids=tokens["token_type_ids"].cuda(),
        inputs_embeds=None,
        past_key_values_length=0,
    )

    # (bsz, 1, 1, seq_len)
    mask = bert.get_extended_attention_mask(
        attention_mask=tokens["attention_mask"].cuda(),
        input_shape=tokens["input_ids"].shape,
        device=bert.device,
    )
    # (bsz, 1, seq_len, seq_len)
    mask = mask.repeat(1, 1, mask.size()[-1], 1)

    attention = bert.encoder.layer[0].attention.self
    key_layer = attention.transpose_for_scores(attention.key(embedding_output))
    query_layer = attention.transpose_for_scores(
        attention.query(embedding_output))

    attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
    attention_scores /= math.sqrt(key_layer.size()[-1])

    fused_softmax = (FusedScaleMaskSoftmax(
        input_in_fp16=True,
        input_in_bf16=False,
        fusion_type=SoftmaxFusionTypes.general,
        mask_func=attention_mask_func,
        scale=None,
        softmax_in_fp32=False,
    ).cuda().half())

    fused_softmax_output = fused_softmax(
        attention_scores,
        (mask != 0),
    )

    torch_softmax = (FusedScaleMaskSoftmax(
        input_in_fp16=True,
        input_in_bf16=False,
        mask_func=attention_mask_func,
        fusion_type=SoftmaxFusionTypes.none,
        scale=None,
        softmax_in_fp32=False,
    ).cuda().half())

    torch_softmax_output = torch_softmax(
        attention_scores,
        (mask != 0),
    )

    test_result = (fused_softmax_output - torch_softmax_output).abs()

    while test_result.dim() != 1:
        test_result = test_result.mean(dim=-1)

    diff = test_result.mean(dim=-1)

    if diff <= 1e-3:
        print(
            f"\n[Success] test_fused_softmax"
            f"\n > mean_difference={diff}"
            f"\n > fused_values={fused_softmax_output[-1][-1][-1][:5].tolist()}"
            f"\n > torch_values={torch_softmax_output[-1][-1][-1][:5].tolist()}"
        )
    else:
        print(
            f"\n[Fail] test_fused_softmax"
            f"\n > mean_difference={diff}, "
            f"\n > fused_values={fused_softmax_output[-1][-1][-1][:5].tolist()}, "
            f"\n > torch_values={torch_softmax_output[-1][-1][-1][:5].tolist()}"
        )

class TestBertModel(unittest.TestCase):
    def init_data(self, use_cuda) -> None:
        torch.set_grad_enabled(False)
        torch.set_num_threads(4)
        turbo_transformers.set_num_threads(4)

        self.test_device = torch.device('cuda:0') if use_cuda else \
            torch.device('cpu:0')

        self.cfg = BertConfig()
        self.torch_model = BertModel(self.cfg)
        self.torch_model.eval()

        if torch.cuda.is_available():
            self.torch_model.to(self.test_device)

        self.turbo_model = turbo_transformers.BertModel.from_torch(
            self.torch_model, self.test_device, "turbo")

    def check_torch_and_turbo(self,
                              use_cuda,
                              batch_size,
                              seq_len,
                              use_memory_opt=True):
        self.init_data(use_cuda)
        num_iter = 1
        device_name = "GPU" if use_cuda else "CPU"
        input_ids = torch.randint(low=0,
                                  high=self.cfg.vocab_size - 1,
                                  size=(batch_size, seq_len),
                                  dtype=torch.long,
                                  device=self.test_device)

        torch_model = lambda: self.torch_model(input_ids)
        torch_result, torch_qps, torch_time = \
            test_helper.run_model(torch_model, use_cuda, num_iter)
        print(f'BertModel PyTorch({device_name}) QPS {torch_qps}')

        turbo_model = (lambda: self.turbo_model(input_ids))

        if use_memory_opt:
            turbo_transformers.bert_opt_mem_allocate_api(
                input_ids.size()[0],  # batch
                input_ids.size()[1],  # seq_len
                self.cfg.num_attention_heads,
                self.cfg.hidden_size,
                self.cfg.num_hidden_layers,
                "GPU" if 'cuda' in input_ids.device.type else "CPU")

        with turbo_transformers.pref_guard("bert_perf") as perf:
            turbo_result, turbo_qps, turbo_time = \
                test_helper.run_model(turbo_model, use_cuda, num_iter)
        print(f'BertModel TurboTransformer({device_name}) QPS {turbo_qps}')

        print(f"batch {batch_size} seq_len {seq_len}")
        print(torch.max(torch_result[0].cpu() - turbo_result[0].cpu()))
        self.assertTrue(
            numpy.allclose(torch_result[0].cpu(),
                           turbo_result[0].cpu(),
                           atol=1e-2,
                           rtol=1e-3))

    def bert_model_test_helper(self, use_memory_opt=False):
        if use_memory_opt:
            turbo_transformers.reset_allocator_schema("model-aware")

        for batch_size in [2, 4, 1]:
            for seq_len in [50, 4, 16]:
                if torch.cuda.is_available() and \
                        turbo_transformers.config.is_compiled_with_cuda():
                    self.check_torch_and_turbo(use_cuda=True,
                                               batch_size=batch_size,
                                               seq_len=seq_len,
                                               use_memory_opt=use_memory_opt)
                self.check_torch_and_turbo(use_cuda=False,
                                           batch_size=batch_size,
                                           seq_len=seq_len,
                                           use_memory_opt=use_memory_opt)

        if use_memory_opt:
            turbo_transformers.reset_allocator_schema("naive")

    def test_bert_model(self):
        # self.bert_model_test_helper(True)
        self.bert_model_test_helper(False)

####################
# Helper functions #
####################


def process(t):
    return torch.stack(t).squeeze().detach().numpy()


# BERT
if __name__ == "__main__":
    excluded_neurons = {0: (0, ), 1: (0, 1), 2: (0, 1, 2)}
    model = BertModel.from_pretrained("bert-base-cased",
                                      output_attentions=True,
                                      output_values=True,
                                      output_dense=True,
                                      output_mlp_activations=True,
                                      output_q_activations=True,
                                      output_k_activations=True,
                                      output_v_activations=True,
                                      excluded_neurons=excluded_neurons)
    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
    inputs = tokenizer("Hello", return_tensors="pt")
    print("### inputs ###")
    print(inputs.items())

    outputs = model(**inputs)
    print("### values ###")
    print(len(outputs["values"]))
    print(outputs["values"][0].shape)

    values = outputs["values"]
    values = torch.stack(values).squeeze()
    values = values.detach().numpy()

def main(config):
    args = config
    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    processor = ATEPCProcessor()
    label_list = processor.get_labels()
    num_labels = len(label_list) + 1

    datasets = {
        'camera': "atepc_datasets/camera",
        'car': "atepc_datasets/car",
        'phone': "atepc_datasets/phone",
        'notebook': "atepc_datasets/notebook",
        'laptop': "atepc_datasets/laptop",
        'restaurant': "atepc_datasets/restaurant",
        'twitter': "atepc_datasets/twitter",
        'mixed': "atepc_datasets/mixed",
    }
    pretrained_bert_models = {
        'camera': "bert-base-chinese",
        'car': "bert-base-chinese",
        'phone': "bert-base-chinese",
        'notebook': "bert-base-chinese",
        'laptop': "bert-base-uncased",
        'restaurant': "bert-base-uncased",
        # for loading domain-adapted BERT
        # 'restaurant': "../bert_pretrained_restaurant",
        'twitter': "bert-base-uncased",
        'mixed': "bert-base-multilingual-uncased",
    }

    args.bert_model = pretrained_bert_models[args.dataset]
    args.data_dir = datasets[args.dataset]

    def convert_polarity(examples):
        for i in range(len(examples)):
            polarities = []
            for polarity in examples[i].polarity:
                if polarity == 2:
                    polarities.append(1)
                else:
                    polarities.append(polarity)
            examples[i].polarity = polarities

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=True)
    train_examples = processor.get_train_examples(args.data_dir)
    eval_examples = processor.get_test_examples(args.data_dir)
    num_train_optimization_steps = int(
        len(train_examples) / args.train_batch_size /
        args.gradient_accumulation_steps) * args.num_train_epochs
    bert_base_model = BertModel.from_pretrained(args.bert_model)
    bert_base_model.config.num_labels = num_labels

    if args.dataset in {'camera', 'car', 'phone', 'notebook'}:
        convert_polarity(train_examples)
        convert_polarity(eval_examples)
        model = LCF_ATEPC(bert_base_model, args=args)
    else:
        model = LCF_ATEPC(bert_base_model, args=args)

    for arg in vars(args):
        logger.info('>>> {0}: {1}'.format(arg, getattr(args, arg)))

    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in param_optimizer
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.00001
    }, {
        'params': [
            p for n, p in param_optimizer
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.00001
    }]

    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      weight_decay=0.00001)

    eval_features = convert_examples_to_features(eval_examples, label_list,
                                                 args.max_seq_length,
                                                 tokenizer)
    all_spc_input_ids = torch.tensor([f.input_ids_spc for f in eval_features],
                                     dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                   dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                 dtype=torch.long)
    all_polarities = torch.tensor([f.polarities for f in eval_features],
                                  dtype=torch.long)
    all_valid_ids = torch.tensor([f.valid_ids for f in eval_features],
                                 dtype=torch.long)
    all_lmask_ids = torch.tensor([f.label_mask for f in eval_features],
                                 dtype=torch.long)
    eval_data = TensorDataset(all_spc_input_ids, all_input_mask,
                              all_segment_ids, all_label_ids, all_polarities,
                              all_valid_ids, all_lmask_ids)
    # Run prediction for full data
    eval_sampler = RandomSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    def evaluate(eval_ATE=True, eval_APC=True):
        # evaluate
        apc_result = {'max_apc_test_acc': 0, 'max_apc_test_f1': 0}
        ate_result = 0
        y_true = []
        y_pred = []
        n_test_correct, n_test_total = 0, 0
        test_apc_logits_all, test_polarities_all = None, None
        model.eval()
        label_map = {i: label for i, label in enumerate(label_list, 1)}

        for input_ids_spc, input_mask, segment_ids, label_ids, polarities, valid_ids, l_mask in eval_dataloader:
            input_ids_spc = input_ids_spc.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            valid_ids = valid_ids.to(device)
            label_ids = label_ids.to(device)
            polarities = polarities.to(device)
            l_mask = l_mask.to(device)

            with torch.no_grad():
                ate_logits, apc_logits = model(input_ids_spc,
                                               segment_ids,
                                               input_mask,
                                               valid_ids=valid_ids,
                                               polarities=polarities,
                                               attention_mask_label=l_mask)
            if eval_APC:
                polarities = model.get_batch_polarities(polarities)
                n_test_correct += (torch.argmax(apc_logits,
                                                -1) == polarities).sum().item()
                n_test_total += len(polarities)

                if test_polarities_all is None:
                    test_polarities_all = polarities
                    test_apc_logits_all = apc_logits
                else:
                    test_polarities_all = torch.cat(
                        (test_polarities_all, polarities), dim=0)
                    test_apc_logits_all = torch.cat(
                        (test_apc_logits_all, apc_logits), dim=0)

            if eval_ATE:
                if not args.use_bert_spc:
                    label_ids = model.get_batch_token_labels_bert_base_indices(
                        label_ids)
                ate_logits = torch.argmax(F.log_softmax(ate_logits, dim=2),
                                          dim=2)
                ate_logits = ate_logits.detach().cpu().numpy()
                label_ids = label_ids.to('cpu').numpy()
                input_mask = input_mask.to('cpu').numpy()
                for i, label in enumerate(label_ids):
                    temp_1 = []
                    temp_2 = []
                    for j, m in enumerate(label):
                        if j == 0:
                            continue
                        elif label_ids[i][j] == len(label_list):
                            y_true.append(temp_1)
                            y_pred.append(temp_2)
                            break
                        else:
                            temp_1.append(label_map.get(label_ids[i][j], 'O'))
                            temp_2.append(label_map.get(ate_logits[i][j], 'O'))

        if eval_APC:
            test_acc = n_test_correct / n_test_total
            if args.dataset in {'camera', 'car', 'phone', 'notebook'}:
                test_f1 = f1_score(torch.argmax(test_apc_logits_all, -1).cpu(),
                                   test_polarities_all.cpu(),
                                   labels=[0, 1],
                                   average='macro')
            else:
                test_f1 = f1_score(torch.argmax(test_apc_logits_all, -1).cpu(),
                                   test_polarities_all.cpu(),
                                   labels=[0, 1, 2],
                                   average='macro')

            test_acc = round(test_acc * 100, 2)
            test_f1 = round(test_f1 * 100, 2)
            apc_result = {
                'max_apc_test_acc': test_acc,
                'max_apc_test_f1': test_f1
            }

        if eval_ATE:
            report = classification_report(y_true, y_pred, digits=4)
            tmps = report.split()
            ate_result = round(float(tmps[7]) * 100, 2)

        return apc_result, ate_result

    def save_model(path):
        # Save a trained model and the associated configuration;
        # take care of the storage!
        os.makedirs(path, exist_ok=True)
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model itself
        model_to_save.save_pretrained(path)
        tokenizer.save_pretrained(path)
        label_map = {i: label for i, label in enumerate(label_list, 1)}
        model_config = {
            "bert_model": args.bert_model,
            "do_lower": True,
            "max_seq_length": args.max_seq_length,
            "num_labels": len(label_list) + 1,
            "label_map": label_map
        }
        json.dump(model_config, open(os.path.join(path, "config.json"), "w"))
        logger.info('save model to: {}'.format(path))

    def train():
        train_features = convert_examples_to_features(train_examples,
                                                      label_list,
                                                      args.max_seq_length,
                                                      tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_spc_input_ids = torch.tensor(
            [f.input_ids_spc for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features],
                                     dtype=torch.long)
        all_valid_ids = torch.tensor([f.valid_ids for f in train_features],
                                     dtype=torch.long)
        all_lmask_ids = torch.tensor([f.label_mask for f in train_features],
                                     dtype=torch.long)
        all_polarities = torch.tensor([f.polarities for f in train_features],
                                      dtype=torch.long)
        train_data = TensorDataset(all_spc_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids,
                                   all_polarities, all_valid_ids,
                                   all_lmask_ids)

        train_sampler = SequentialSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        max_apc_test_acc = 0
        max_apc_test_f1 = 0
        max_ate_test_f1 = 0

        global_step = 0
        for epoch in range(int(args.num_train_epochs)):
            logger.info('#' * 80)
            logger.info('Train {} Epoch{}'.format(args.seed, epoch + 1,
                                                  args.data_dir))
            logger.info('#' * 80)
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(train_dataloader):
                model.train()
                batch = tuple(t.to(device) for t in batch)
                input_ids_spc, input_mask, segment_ids, label_ids, polarities, valid_ids, l_mask = batch
                loss_ate, loss_apc = model(input_ids_spc, segment_ids,
                                           input_mask, label_ids, polarities,
                                           valid_ids, l_mask)
                loss = loss_ate + loss_apc
                loss.backward()
                nb_tr_examples += input_ids_spc.size(0)
                nb_tr_steps += 1
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1
                if global_step % args.eval_steps == 0:
                    if epoch >= args.num_train_epochs - 2 or args.num_train_epochs <= 2:
                        # evaluate in last 2 epochs
                        apc_result, ate_result = evaluate(
                            eval_ATE=not args.use_bert_spc)
                        # apc_result, ate_result = evaluate()
                        # path = '{0}/{1}_{2}_apcacc_{3}_apcf1_{4}_atef1_{5}'.format(
                        #     args.output_dir,
                        #     args.dataset,
                        #     args.local_context_focus,
                        #     round(apc_result['max_apc_test_acc'], 2),
                        #     round(apc_result['max_apc_test_f1'], 2),
                        #     round(ate_result, 2)
                        # )
                        # if apc_result['max_apc_test_acc'] > max_apc_test_acc or \
                        #         apc_result['max_apc_test_f1'] > max_apc_test_f1 or \
                        #         ate_result > max_ate_test_f1:
                        #     save_model(path)
                        if apc_result['max_apc_test_acc'] > max_apc_test_acc:
                            max_apc_test_acc = apc_result['max_apc_test_acc']
                        if apc_result['max_apc_test_f1'] > max_apc_test_f1:
                            max_apc_test_f1 = apc_result['max_apc_test_f1']
                        if ate_result > max_ate_test_f1:
                            max_ate_test_f1 = ate_result

                        current_apc_test_acc = apc_result['max_apc_test_acc']
                        current_apc_test_f1 = apc_result['max_apc_test_f1']
                        current_ate_test_f1 = round(ate_result, 2)

                        logger.info('*' * 80)
                        logger.info('Train {} Epoch{}, Evaluate for {}'.format(
                            args.seed, epoch + 1, args.data_dir))
                        logger.info(
                            f'APC_test_acc: {current_apc_test_acc}(max: {max_apc_test_acc}) '
                            f'APC_test_f1: {current_apc_test_f1}(max: {max_apc_test_f1})'
                        )
                        if args.use_bert_spc:
                            logger.info(
                                f'ATE_test_F1: {current_apc_test_f1}(max: {max_apc_test_f1})'
                                f' (Unreliable since `use_bert_spc` is "True".)'
                            )
                        else:
                            logger.info(
                                f'ATE_test_f1: {current_ate_test_f1}(max:{max_ate_test_f1})'
                            )
                        logger.info('*' * 80)

        return [max_apc_test_acc, max_apc_test_f1, max_ate_test_f1]

    return train()

def __init__(self, config, weight=None):
    super(BertForTextRepresentation, self).__init__(config)
    self.bert = BertModel(config)
    self.weight = weight
    self.init_weights()