def __init__(self, params):
    super(BiEncoderModule, self).__init__()
    ctxt_bert = BertModel.from_pretrained(
        params["bert_model"], output_hidden_states=True
    )
    if params["load_cand_enc_only"]:
        bert_model = "bert-large-uncased"
    else:
        bert_model = params["bert_model"]
    cand_bert = BertModel.from_pretrained(
        bert_model,
        output_hidden_states=True,
    )
    self.context_encoder = BertEncoder(
        ctxt_bert,
        params["out_dim"],
        layer_pulled=params["pull_from_layer"],
        add_linear=params["add_linear"],
    )
    self.cand_encoder = BertEncoder(
        cand_bert,
        params["out_dim"],
        layer_pulled=params["pull_from_layer"],
        add_linear=params["add_linear"],
    )
    if params.get("freeze_cand_enc", False):
        for param in self.cand_encoder.parameters():
            param.requires_grad = False

    self.config = ctxt_bert.config
    ctxt_bert_output_dim = ctxt_bert.embeddings.word_embeddings.weight.size(1)

    self.mention_aggregation_type = params.get('mention_aggregation_type', None)
    self.classification_heads = nn.ModuleDict({})
    self.linear_compression = None
    if self.mention_aggregation_type is not None:
        classification_heads_dict = {
            'get_context_embeds': GetContextEmbedsHead(
                self.mention_aggregation_type,
                ctxt_bert_output_dim,
                cand_bert.embeddings.word_embeddings.weight.size(1),
            )
        }
        classification_heads_dict['mention_scores'] = MentionScoresHead(
            ctxt_bert_output_dim,
            params["mention_scoring_method"],
            params.get("max_mention_length", 10),
        )
        self.classification_heads = nn.ModuleDict(classification_heads_dict)
    elif ctxt_bert_output_dim != cand_bert.embeddings.word_embeddings.weight.size(1):
        # mapping to make the output dimensions match for dot-product similarity
        self.linear_compression = nn.Linear(
            ctxt_bert_output_dim,
            cand_bert.embeddings.word_embeddings.weight.size(1),
        )

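# A small numeric sketch (shapes here are illustrative assumptions, not taken
# from the snippet above) of why linear_compression exists: biencoder scores
# are dot products, so context and candidate embeddings must share a dimension.
import torch
import torch.nn as nn

ctxt = torch.randn(4, 1024)          # context encodings, e.g. from BERT-large
cand = torch.randn(8, 768)           # candidate encodings, e.g. from BERT-base
compress = nn.Linear(1024, 768)      # the dimension-matching map
scores = compress(ctxt) @ cand.t()   # (4, 8) dot-product similarity matrix
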
def main(raw_args=None):
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name",
                        type=str,
                        required=True,
                        help="model name e.g. bert-base-uncased")
    parser.add_argument("--cache_dir",
                        type=str,
                        default=None,
                        required=False,
                        help="Directory in which downloaded pretrained models are cached")
    parser.add_argument("--pytorch_model_path",
                        type=str,
                        required=True,
                        help="/path/to/<pytorch-model-name>.bin")
    parser.add_argument("--tf_cache_dir",
                        type=str,
                        required=True,
                        help="Directory in which to save the TensorFlow model")
    args = parser.parse_args(raw_args)

    model = BertModel.from_pretrained(
        pretrained_model_name_or_path=args.model_name,
        state_dict=torch.load(args.pytorch_model_path),
        cache_dir=args.cache_dir,
    )
    convert_pytorch_checkpoint_to_tf(model=model,
                                     ckpt_dir=args.tf_cache_dir,
                                     model_name=args.model_name)

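# A hypothetical invocation of the conversion script above; the model name is
# a real hub id, but the file paths are placeholder assumptions, not from the
# original script.
main([
    "--model_name", "bert-base-uncased",
    "--pytorch_model_path", "models/pytorch_model.bin",
    "--tf_cache_dir", "tf_models/",
])
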
def bertModel(*args, **kwargs):
    """
    BertModel is the basic BERT Transformer model with a layer of summed token,
    position and sequence embeddings followed by a series of identical
    self-attention blocks (12 for BERT-base, 24 for BERT-large).

    Example:
        # Load the tokenizer
        >>> import torch
        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)

        # Prepare tokenized input
        >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
        >>> tokenized_text = tokenizer.tokenize(text)
        >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
        >>> tokens_tensor = torch.tensor([indexed_tokens])
        >>> segments_tensors = torch.tensor([segments_ids])

        # Load bertModel
        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'bertModel', 'bert-base-cased')
        >>> model.eval()

        # Predict hidden states features for each layer
        >>> with torch.no_grad():
        ...     encoded_layers, _ = model(tokens_tensor, segments_tensors)
    """
    model = BertModel.from_pretrained(*args, **kwargs)
    return model

def __init__(self, params, tokenizer, start_mention_id=None, end_mention_id=None):
    super(CrossEncoderModule, self).__init__()
    model_path = params["bert_model"]
    if params.get("roberta"):
        encoder_model = RobertaModel.from_pretrained(model_path)
    else:
        encoder_model = BertModel.from_pretrained(model_path)
    encoder_model.resize_token_embeddings(len(tokenizer))
    self.pool_highlighted = params["pool_highlighted"]
    self.encoder = BertEncoder(
        encoder_model,
        params["out_dim"],
        layer_pulled=params["pull_from_layer"],
        add_linear=params["add_linear"] and not self.pool_highlighted,
        get_all_outputs=self.pool_highlighted,
    )
    self.config = self.encoder.bert_model.config
    self.start_mention_id = start_mention_id
    self.end_mention_id = end_mention_id
    if self.pool_highlighted:
        bert_output_dim = encoder_model.embeddings.word_embeddings.weight.size(1)
        output_dim = params["out_dim"]
        self.additional_linear = nn.Linear(2 * bert_output_dim, output_dim)
        self.dropout = nn.Dropout(0.1)

def __init__(self, params):
    super(BiEncoderModule, self).__init__()
    ctxt_bert = BertModel.from_pretrained(params["bert_model"])
    cand_bert = BertModel.from_pretrained(params["bert_model"])
    self.context_encoder = BertEncoder(
        ctxt_bert,
        params["out_dim"],
        layer_pulled=params["pull_from_layer"],
        add_linear=params["add_linear"],
    )
    self.cand_encoder = BertEncoder(
        cand_bert,
        params["out_dim"],
        layer_pulled=params["pull_from_layer"],
        add_linear=params["add_linear"],
    )
    self.config = ctxt_bert.config

def load(cls, model_name: str, cache_model: bool = True) -> BertModel:
    if model_name in cls._cache:
        return PretrainedBertModel._cache[model_name]

    model = BertModel.from_pretrained(model_name)
    if cache_model:
        cls._cache[model_name] = model
    return model

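# A minimal usage sketch for the cached loader above (assuming the enclosing
# class is named PretrainedBertModel, as the cache lookup in the method
# suggests): repeated loads of the same name return the same module instance.
m1 = PretrainedBertModel.load("bert-base-uncased")
m2 = PretrainedBertModel.load("bert-base-uncased")
assert m1 is m2  # the second call is served from the class-level _cache
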
def test_model_from_pretrained(self):
    logging.basicConfig(level=logging.INFO)
    for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
        config = BertConfig.from_pretrained(model_name)
        self.assertIsNotNone(config)
        self.assertIsInstance(config, PretrainedConfig)

        model = BertModel.from_pretrained(model_name)
        model, loading_info = BertModel.from_pretrained(model_name,
                                                        output_loading_info=True)
        self.assertIsNotNone(model)
        self.assertIsInstance(model, PreTrainedModel)
        for value in loading_info.values():
            self.assertEqual(len(value), 0)

        config = BertConfig.from_pretrained(model_name,
                                            output_attentions=True,
                                            output_hidden_states=True)
        model = BertModel.from_pretrained(model_name,
                                          output_attentions=True,
                                          output_hidden_states=True)
        self.assertEqual(model.config.output_attentions, True)
        self.assertEqual(model.config.output_hidden_states, True)
        self.assertEqual(model.config, config)

def __init__(self, params):
    super(BiEncoderModule, self).__init__()
    # params["bert_model"] could be a path containing config.json and
    # pytorch_model.bin, or an id shorthand for a model bundled with the library.
    ctxt_bert = BertModel.from_pretrained(params["bert_model"])
    cand_bert = BertModel.from_pretrained(params["bert_model"])
    self.context_encoder = BertEncoder(
        ctxt_bert,
        params["out_dim"],
        layer_pulled=params["pull_from_layer"],
        add_linear=params["add_linear"],
    )
    self.cand_encoder = BertEncoder(
        cand_bert,
        params["out_dim"],
        layer_pulled=params["pull_from_layer"],
        add_linear=params["add_linear"],
    )
    self.config = ctxt_bert.config

def __init__(self, bert_model_name, cache_dir="./cache/"):
    super().__init__()
    # Create the cache directory if it does not exist.
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
    self.bert_model = BertModel.from_pretrained(bert_model_name,
                                                cache_dir=cache_dir,
                                                output_hidden_states=True)
    self.bert_model.train()

def load(cls, pretrained_model_name_or_path, language=None):
    bert = cls()
    # We need to differentiate between loading the model in FARM format and in
    # Pytorch-Transformers format.
    farm_lm_config = os.path.join(pretrained_model_name_or_path,
                                  "language_model_config.json")
    if os.path.exists(farm_lm_config):
        # FARM style
        bert_config = BertConfig.from_pretrained(farm_lm_config)
        farm_lm_model = os.path.join(pretrained_model_name_or_path,
                                     "language_model.bin")
        bert.model = BertModel.from_pretrained(farm_lm_model, config=bert_config)
        bert.language = bert.model.config.language
    else:
        # Pytorch-Transformers style
        bert.model = BertModel.from_pretrained(pretrained_model_name_or_path)
        bert.language = cls._infer_language_from_name(
            pretrained_model_name_or_path)
    return bert

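# A hedged sketch of the two load paths the branch above distinguishes; the
# class name `Bert` and the directory layout are assumptions for illustration,
# not taken from the snippet.
bert = Bert.load("bert-base-cased")       # Pytorch-Transformers style: hub name or checkpoint dir
bert = Bert.load("saved_models/farm_lm")  # FARM style: dir containing language_model_config.json
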
def __init__(self, model_name_or_path, hidden_size=768, num_class=2):
    super(NeuralNet, self).__init__()
    self.config = BertConfig.from_pretrained(model_name_or_path, num_labels=4)
    self.config.output_hidden_states = True
    self.bert = BertModel.from_pretrained(model_name_or_path, config=self.config)
    for param in self.bert.parameters():
        param.requires_grad = True
    self.weights = torch.rand(13, 1).cuda()
    self.dropouts = nn.ModuleList([nn.Dropout(0.5) for _ in range(5)])
    self.fc = nn.Linear(hidden_size, num_class)

def __init__(self, params, tokenizer):
    super(CrossEncoderModule, self).__init__()
    model_path = params["bert_model"]
    if params.get("roberta"):
        encoder_model = RobertaModel.from_pretrained(model_path)
    else:
        encoder_model = BertModel.from_pretrained(model_path)
    encoder_model.resize_token_embeddings(len(tokenizer))
    self.encoder = BertEncoder(
        encoder_model,
        params["out_dim"],
        layer_pulled=params["pull_from_layer"],
        add_linear=params["add_linear"],
    )
    self.config = self.encoder.bert_model.config

def __init__(self, args, config):
    self.config = config
    self.config_model = config['model']
    self.args = args
    self.bert_node_encoder = Transformer_xh.from_pretrained(
        self.config['bert_model_file'],
        cache_dir=PYTORCH_PRETRAINED_BERT_CACHE /
        'distributed_{}'.format(args.local_rank))
    self.bert_node_encoder.encoder.build_model(args.hops)
    if args.arch == 'bert':
        self.bert_node_encoder = BertModel.from_pretrained(
            self.config['bert_model_file'],
            cache_dir=PYTORCH_PRETRAINED_BERT_CACHE /
            'distributed_{}'.format(args.local_rank))
    self.bert_config = self.bert_node_encoder.config
    self.network = ModelHelper(self.bert_node_encoder, self.args,
                               self.bert_config, self.config_model)
    self.device = args.device

def __init__(self, model_path):
    super(OnmtBertEncoder, self).__init__()
    config = BertConfig.from_json_file(os.path.join(model_path, "config.json"))
    pretrained_dict = os.path.join(model_path, "pytorch_model.bin")
    if os.path.exists(pretrained_dict):
        model = BertModel.from_pretrained(
            pretrained_model_name_or_path=pretrained_dict, config=config)
        print("init BERT model with {} weights".format(len(model.state_dict())))
    else:
        model = BertModel(config)
    model.embeddings.word_embeddings = expandEmbeddingByN(
        model.embeddings.word_embeddings, 4)
    model.embeddings.word_embeddings = expandEmbeddingByN(
        model.embeddings.word_embeddings, 2, last=True)
    self.encoder = model
    #print(model)
    print("***" * 20)

def __init__(
        self,
        config='a class with num_attention_heads, hidden_size, attention_probs_dropout_prob, output_attentions',
        bert_dir='/mnt/sda1/bert/uncased_L-12_H-768_A-12',
        drop=0.0,
        L=80,
        bert_dim=768,
        num_class=3,
        SDR=5,
        tp='cdm'):
    super(MY_BERT_LCF, self).__init__()
    self.text_bert = BertModel.from_pretrained(bert_dir)
    self.aspect_bert = copy.deepcopy(self.text_bert)
    self.aspect_self_att = SelfAttention(config, L)
    self.bert_pooler = BertPooler(config)
    if tp == 'cdm':
        self.reduce2_bert_dim = nn.Linear(bert_dim * 2, bert_dim)
    self.reduce2_num_class_linear = nn.Linear(bert_dim, num_class)
    self.drop = drop
    self.L = L
    self.SDR = SDR
    self.tp = tp

class Words2VectorNet(superclass):
    # `superclass`, `use_bert`, and `config` are module-level globals in the
    # original file.
    def __init__(self, parameters, word_embeddings=None):
        if use_bert:
            super(Words2VectorNet, self).__init__(config)
            # from_pretrained is a classmethod that returns a new model, so its
            # result must be assigned; calling it on an instance loads nothing.
            self.bert = BertModel.from_pretrained('bert-base-uncased')
            #if torch.cuda.device_count() > 1:
            #    self.bert = torch.nn.DataParallel(self.bert)
            self._p = parameters
            # The dropout probability must default to a float, not the string '0.1'.
            self._dropout = nn.Dropout(p=self._p.get('dropout', 0.1))
            self._word_embedding = nn.Embedding(self._p['word.vocab.size'],
                                                self._p['word.emb.size'],
                                                padding_idx=0)
            if word_embeddings is not None:
                word_embeddings = torch.from_numpy(word_embeddings).float()
                self._word_embedding.weight = nn.Parameter(word_embeddings)
                self._word_embedding.weight.requires_grad = False
            self._nonlinearity = nn.ReLU() if self._p.get(
                'enc.activation', 'tanh') == 'relu' else nn.Tanh()
            out_size = 768
        else:
            super(Words2VectorNet, self).__init__()
            self._p = parameters
            self._dropout = nn.Dropout(p=self._p.get('dropout', 0.1))
            self._word_embedding = nn.Embedding(self._p['word.vocab.size'],
                                                self._p['word.emb.size'],
                                                padding_idx=0)
            if word_embeddings is not None:
                word_embeddings = torch.from_numpy(word_embeddings).float()
                self._word_embedding.weight = nn.Parameter(word_embeddings)
                self._word_embedding.weight.requires_grad = False
            self._pos_embedding = nn.Embedding(3, self._p['poss.emb.size'],
                                               padding_idx=0)
            self._word_encoding_conv = nn.Conv1d(
                self._p['word.emb.size'] + self._p['poss.emb.size'],
                self._p['word.conv.size'],
                self._p['word.conv.width'],
                padding=self._p['word.conv.width'] // 2)
            self._nonlinearity = nn.ReLU() if self._p.get(
                'enc.activation', 'tanh') == 'relu' else nn.Tanh()
            self._convs = nn.ModuleList([
                nn.Sequential(
                    nn.Conv1d(
                        in_channels=self._p['word.conv.size'],
                        out_channels=self._p['word.conv.size'],
                        kernel_size=self._p['word.conv.width'],
                        padding=self._p['char.conv.width'] // 2 * 2**(j + 1)
                        if not self._p.get("legacy.mode", False) else
                        self._p['char.conv.width'] // 2 + 2**(j + 1),
                        dilation=2**(j + 1),
                        bias=True), self._nonlinearity)
                for j in range(self._p.get('word.conv.depth', 1))
            ])
            self._block_conv = nn.Conv1d(self._p['word.conv.size'],
                                         self._p['word.conv.size'],
                                         self._p['word.conv.width'],
                                         padding=self._p['word.conv.width'] // 2)
            out_size = self._p['word.conv.size']
        self.sem_layers = nn.Sequential(
            self._dropout,
            nn.Linear(out_size, self._p['word.enc.size']),
            self._nonlinearity,
        )
        self._pool = nn.AdaptiveMaxPool1d(1) if self._p.get(
            'enc.pooling', 'max') == 'max' else nn.AdaptiveAvgPool1d(1)

    def forward(self, sent_m_with_pos):
        if use_bert:
            # Non-zero tokens are non-masked.
            sent_m_with_pos = sent_m_with_pos.long()
            sent_m = self.bert(input_ids=sent_m_with_pos,
                               attention_mask=(sent_m_with_pos != 0))
            sent_m = sent_m[0][:, 0, :]
        else:
            sent_m_with_pos = sent_m_with_pos.long()
            sent_m = sent_m_with_pos[..., 0]
            positions = sent_m_with_pos[..., 1]
            sent_m = self._word_embedding(sent_m)
            positions = self._pos_embedding(positions)
            sent_m = torch.cat((sent_m, positions),
                               dim=-1).transpose(-2, -1).contiguous()
            sent_m = self._dropout(sent_m)
            sent_m = self._word_encoding_conv(sent_m)
            sent_m = self._nonlinearity(sent_m)
            for _ in range(self._p.get("word.repeat.convs", 3)):
                for convlayer in self._convs:
                    sent_m = convlayer(sent_m)
                sent_m = self._block_conv(sent_m)
                sent_m = self._nonlinearity(sent_m)
            sent_m = self._pool(sent_m).squeeze(dim=-1)
        sent_m = self.sem_layers(sent_m)
        return sent_m

"scorer": Scorer(custom_metric_funcs={"Tag_accuracy": tag_accuracy_scorer}) } } # Get the absolute current working directory of the project cwd = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) # Create empty list to hold every Dataloader object dataloaders = [] # Create empty list to hold every Task object tasks = [] # Define the shared BERT layer to be used across tasks, and set the max seq length for the model. shared_BERT_model = BertModel.from_pretrained('bert-base-uncased') shared_BERT_model.config.max_position_embeddings = MAX_SEQ_LENGTH # Confirm BERTs hidden layer size hidden_layer_size = 768 # Make a module to contain the BERT module but can take the inputs of the Xs bert_module = SnorkelFriendlyBert(bert_model=shared_BERT_model) # Iterate through all task types for task_type in ["Classification_Tasks", "Tagging_Tasks"]: # Get the contents of the data folder for the given task type target_data_path = os.path.join(cwd, "data", task_type) # Get names of all datasets in data folder
def __init__(self, bert_model=BertModel.from_pretrained('bert-base-uncased')):
    super(SnorkelFriendlyBert, self).__init__()
    self.bert_layer = bert_model
    use_cuda = torch.cuda.is_available()
    self.device = torch.device('cuda:0' if use_cuda else 'cpu')

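# Note on the snippet above: the default argument BertModel.from_pretrained(...)
# is evaluated once, at function-definition time, so the model is loaded on
# import and shared by every instance that omits the argument. A minimal sketch
# of the lazier idiom (an assumption about intent, not the original author's
# code; the class name here is hypothetical):
import torch
import torch.nn as nn
from pytorch_transformers import BertModel  # assumption: whichever BERT library the snippet uses

class SnorkelFriendlyBertLazy(nn.Module):
    def __init__(self, bert_model=None):
        super(SnorkelFriendlyBertLazy, self).__init__()
        # Defer loading until an instance is actually constructed.
        if bert_model is None:
            bert_model = BertModel.from_pretrained('bert-base-uncased')
        self.bert_layer = bert_model
        use_cuda = torch.cuda.is_available()
        self.device = torch.device('cuda:0' if use_cuda else 'cpu')
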
def init_model(self):
    logger.info(f'loading pretrained model from {self.pretrain_model}')
    model = BertModel.from_pretrained(self.pretrain_model)
    model.to(self.device)
    return model

def main(config):
    args = config
    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    processor = ATEPCProcessor()
    label_list = processor.get_labels()
    num_labels = len(label_list) + 1

    datasets = {
        'camera': "atepc_datasets/camera",
        'car': "atepc_datasets/car",
        'phone': "atepc_datasets/phone",
        'notebook': "atepc_datasets/notebook",
        'laptop': "atepc_datasets/laptop",
        'restaurant': "atepc_datasets/restaurant",
        'twitter': "atepc_datasets/twitter",
        'mixed': "atepc_datasets/mixed",
    }
    pretrained_bert_models = {
        'camera': "bert-base-chinese",
        'car': "bert-base-chinese",
        'phone': "bert-base-chinese",
        'notebook': "bert-base-chinese",
        'laptop': "bert-base-uncased",
        'restaurant': "bert-base-uncased",
        'twitter': "bert-base-uncased",
        'mixed': "bert-base-multilingual-uncased",
    }
    args.bert_model = pretrained_bert_models[args.dataset]
    args.data_dir = datasets[args.dataset]

    def convert_polarity(examples):
        # Map polarity label 2 down to 1 for the binary-polarity datasets.
        for i in range(len(examples)):
            polarities = []
            for polarity in examples[i].polarity:
                if polarity == 2:
                    polarities.append(1)
                else:
                    polarities.append(polarity)
            examples[i].polarity = polarities

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=True)
    train_examples = processor.get_train_examples(args.data_dir)
    eval_examples = processor.get_test_examples(args.data_dir)
    num_train_optimization_steps = int(
        len(train_examples) / args.train_batch_size /
        args.gradient_accumulation_steps) * args.num_train_epochs

    bert_base_model = BertModel.from_pretrained(args.bert_model)
    bert_base_model.config.num_labels = num_labels
    if args.dataset in {'camera', 'car', 'phone', 'notebook'}:
        convert_polarity(train_examples)
        convert_polarity(eval_examples)
    model = LCF_ATEPC(bert_base_model, args=args)

    for arg in vars(args):
        logger.info('>>> {0}: {1}'.format(arg, getattr(args, arg)))

    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {
            'params': [p for n, p in param_optimizer
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.00001
        },
        {
            'params': [p for n, p in param_optimizer
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.00001
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      weight_decay=0.00001)

    eval_features = convert_examples_to_features(eval_examples, label_list,
                                                 args.max_seq_length, tokenizer)
    all_spc_input_ids = torch.tensor([f.input_ids_spc for f in eval_features],
                                     dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                   dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                 dtype=torch.long)
    all_polarities = torch.tensor([f.polarities for f in eval_features],
                                  dtype=torch.long)
    all_valid_ids = torch.tensor([f.valid_ids for f in eval_features],
                                 dtype=torch.long)
    all_lmask_ids = torch.tensor([f.label_mask for f in eval_features],
                                 dtype=torch.long)
    eval_data = TensorDataset(all_spc_input_ids, all_input_mask,
                              all_segment_ids, all_label_ids, all_polarities,
                              all_valid_ids, all_lmask_ids)
    # Run prediction for the full eval data.
    eval_sampler = RandomSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    def evaluate(eval_ATE=True, eval_APC=True):
        apc_result = {'max_apc_test_acc': 0, 'max_apc_test_f1': 0}
        ate_result = 0
        y_true = []
        y_pred = []
        n_test_correct, n_test_total = 0, 0
        test_apc_logits_all, test_polarities_all = None, None
        model.eval()
        label_map = {i: label for i, label in enumerate(label_list, 1)}
        for input_ids_spc, input_mask, segment_ids, label_ids, polarities, valid_ids, l_mask in eval_dataloader:
            input_ids_spc = input_ids_spc.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            valid_ids = valid_ids.to(device)
            label_ids = label_ids.to(device)
            polarities = polarities.to(device)
            l_mask = l_mask.to(device)
            with torch.no_grad():
                ate_logits, apc_logits = model(input_ids_spc,
                                               segment_ids,
                                               input_mask,
                                               valid_ids=valid_ids,
                                               polarities=polarities,
                                               attention_mask_label=l_mask)
            if eval_APC:
                polarities = model.get_batch_polarities(polarities)
                n_test_correct += (torch.argmax(apc_logits, -1) == polarities).sum().item()
                n_test_total += len(polarities)
                if test_polarities_all is None:
                    test_polarities_all = polarities
                    test_apc_logits_all = apc_logits
                else:
                    test_polarities_all = torch.cat(
                        (test_polarities_all, polarities), dim=0)
                    test_apc_logits_all = torch.cat(
                        (test_apc_logits_all, apc_logits), dim=0)
            if eval_ATE:
                if not args.use_bert_spc:
                    label_ids = model.get_batch_token_labels_bert_base_indices(label_ids)
                ate_logits = torch.argmax(F.log_softmax(ate_logits, dim=2), dim=2)
                ate_logits = ate_logits.detach().cpu().numpy()
                label_ids = label_ids.to('cpu').numpy()
                input_mask = input_mask.to('cpu').numpy()
                for i, label in enumerate(label_ids):
                    temp_1 = []
                    temp_2 = []
                    for j, m in enumerate(label):
                        if j == 0:
                            continue
                        elif label_ids[i][j] == len(label_list):
                            y_true.append(temp_1)
                            y_pred.append(temp_2)
                            break
                        else:
                            temp_1.append(label_map.get(label_ids[i][j], 'O'))
                            temp_2.append(label_map.get(ate_logits[i][j], 'O'))
        if eval_APC:
            test_acc = n_test_correct / n_test_total
            if args.dataset in {'camera', 'car', 'phone', 'notebook'}:
                test_f1 = f1_score(torch.argmax(test_apc_logits_all, -1).cpu(),
                                   test_polarities_all.cpu(),
                                   labels=[0, 1],
                                   average='macro')
            else:
                test_f1 = f1_score(torch.argmax(test_apc_logits_all, -1).cpu(),
                                   test_polarities_all.cpu(),
                                   labels=[0, 1, 2],
                                   average='macro')
            test_acc = round(test_acc * 100, 2)
            test_f1 = round(test_f1 * 100, 2)
            apc_result = {
                'max_apc_test_acc': test_acc,
                'max_apc_test_f1': test_f1
            }
        if eval_ATE:
            report = classification_report(y_true, y_pred, digits=4)
            tmps = report.split()
            ate_result = round(float(tmps[7]) * 100, 2)
        return apc_result, ate_result

    def save_model(path):
        # Save the trained model and the associated configuration;
        # take care of the storage!
        os.makedirs(path, exist_ok=True)
        model_to_save = model.module if hasattr(
            model, 'module') else model  # only save the model itself
        model_to_save.save_pretrained(path)
        tokenizer.save_pretrained(path)
        label_map = {i: label for i, label in enumerate(label_list, 1)}
        model_config = {
            "bert_model": args.bert_model,
            "do_lower": True,
            "max_seq_length": args.max_seq_length,
            "num_labels": len(label_list) + 1,
            "label_map": label_map
        }
        json.dump(model_config, open(os.path.join(path, "config.json"), "w"))
        logger.info('save model to: {}'.format(path))

    def train():
        train_features = convert_examples_to_features(
            train_examples, label_list, args.max_seq_length, tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_spc_input_ids = torch.tensor(
            [f.input_ids_spc for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features],
                                     dtype=torch.long)
        all_valid_ids = torch.tensor([f.valid_ids for f in train_features],
                                     dtype=torch.long)
        all_lmask_ids = torch.tensor([f.label_mask for f in train_features],
                                     dtype=torch.long)
        all_polarities = torch.tensor([f.polarities for f in train_features],
                                      dtype=torch.long)
        train_data = TensorDataset(all_spc_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids,
                                   all_polarities, all_valid_ids, all_lmask_ids)

        train_sampler = SequentialSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        max_apc_test_acc = 0
        max_apc_test_f1 = 0
        max_ate_test_f1 = 0
        global_step = 0
        for epoch in range(int(args.num_train_epochs)):
            logger.info('#' * 80)
            logger.info('Train {} Epoch {} on {}'.format(
                args.seed, epoch + 1, args.data_dir))
            logger.info('#' * 80)
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(train_dataloader):
                model.train()
                batch = tuple(t.to(device) for t in batch)
                input_ids_spc, input_mask, segment_ids, label_ids, polarities, valid_ids, l_mask = batch
                loss_ate, loss_apc = model(input_ids_spc, segment_ids,
                                           input_mask, label_ids, polarities,
                                           valid_ids, l_mask)
                loss = loss_ate + loss_apc
                loss.backward()
                nb_tr_examples += input_ids_spc.size(0)
                nb_tr_steps += 1
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1
                if global_step % args.eval_steps == 0:
                    if epoch >= args.num_train_epochs - 2 or args.num_train_epochs <= 2:
                        # Evaluate only in the last 2 epochs.
                        apc_result, ate_result = evaluate(
                            eval_ATE=not args.use_bert_spc)
                        path = '{0}/{1}_{2}_apcacc_{3}_apcf1_{4}_atef1_{5}'.format(
                            args.output_dir, args.dataset,
                            args.local_context_focus,
                            round(apc_result['max_apc_test_acc'], 2),
                            round(apc_result['max_apc_test_f1'], 2),
                            round(ate_result, 2))
                        # Determine whether this evaluation sets a new best
                        # before updating the running maxima.
                        is_best = (apc_result['max_apc_test_acc'] > max_apc_test_acc or
                                   apc_result['max_apc_test_f1'] > max_apc_test_f1 or
                                   ate_result > max_ate_test_f1)
                        if apc_result['max_apc_test_acc'] > max_apc_test_acc:
                            max_apc_test_acc = apc_result['max_apc_test_acc']
                        if apc_result['max_apc_test_f1'] > max_apc_test_f1:
                            max_apc_test_f1 = apc_result['max_apc_test_f1']
                        if ate_result > max_ate_test_f1:
                            max_ate_test_f1 = ate_result
                        if is_best:
                            save_model(path)
                        current_apc_test_acc = apc_result['max_apc_test_acc']
                        current_apc_test_f1 = apc_result['max_apc_test_f1']
                        current_ate_test_f1 = round(ate_result, 2)
                        logger.info('*' * 80)
                        logger.info('Train {} Epoch {}, Evaluate for {}'.format(
                            args.seed, epoch + 1, args.data_dir))
                        logger.info(
                            f'APC_test_acc: {current_apc_test_acc}(max: {max_apc_test_acc}) '
                            f'APC_test_f1: {current_apc_test_f1}(max: {max_apc_test_f1})')
                        if args.use_bert_spc:
                            logger.info(
                                f'ATE_test_F1: {current_apc_test_f1}(max: {max_apc_test_f1})'
                                f' (Unreliable since `use_bert_spc` is "True".)')
                        else:
                            logger.info(
                                f'ATE_test_f1: {current_ate_test_f1}(max: {max_ate_test_f1})')
                        logger.info('*' * 80)
        return [max_apc_test_acc, max_apc_test_f1, max_ate_test_f1]

    return train()

def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir",
                        default=None,
                        type=str,
                        required=False,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--bert_model",
                        default=None,
                        type=str,
                        required=False,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
                        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=False,
                        help="The output directory where the model predictions and checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--cache_dir",
                        default="",
                        type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                        "Sequences longer than this will be truncated, and sequences shorter \n"
                        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case",
                        action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16',
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale',
                        type=float,
                        default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                        "0 (default value): dynamic loss scaling.\n"
                        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()

    # if args.server_ip and args.server_port:
    #     # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
    #     import ptvsd
    #     print("Waiting for debugger attach")
    #     ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
    #     ptvsd.wait_for_attach()

    processors = {"ner": NerProcessor}

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    label_list = processor.get_labels()
    num_labels = len(label_list)  # + 1

    pretrain_model_dir = 'bert-base-uncased'
    tokenizer = BertTokenizer.from_pretrained(pretrain_model_dir,
                                              do_lower_case=args.do_lower_case)
    model = BertModel.from_pretrained(pretrain_model_dir,
                                      output_hidden_states=True,
                                      output_attentions=True)

    print(tokenizer.tokenize('unfamiliar'))
    print(tokenizer.tokenize('disjoint'))
    print(tokenizer.tokenize(
        "Let's see all hidden-states and attentions on this text"))

    # Wrap the id list in a batch dimension: BertModel expects input of shape
    # (batch_size, seq_len), so a bare 1-D tensor would fail.
    input_ids = torch.tensor([
        tokenizer.convert_tokens_to_ids(
            "Let's see all hidden-states and attentions on this text".lower().split())
    ])
    all_hidden_states, all_attentions = model(input_ids)[-2:]
    print(all_hidden_states[-1].shape)

# Date: 2020/12/4
# Author: Qianqian Peng
from mention_detection.mention_detection import load_model
import torch
from transformers import BertTokenizer as BertTokenizer_new
from transformers import BertConfig as BertConfig_new
from transformers import BertModel as BertModel_new
import torch.nn as nn
from pytorch_transformers.modeling_bert import (
    BertPreTrainedModel,
    BertConfig,
    BertModel,
)

bert_new = BertModel_new.from_pretrained(
    './model/bert-large-uncased',
    config=BertConfig_new.from_pretrained('bert-large-uncased'))
bert_old = BertModel.from_pretrained(
    './model/bert-large-uncased',
    config=BertConfig.from_pretrained('bert-large-uncased'))

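# A hedged sketch of one way to verify the two checkpoints agree, assuming the
# side-by-side load above is meant as a consistency check between the old
# pytorch_transformers loader and the new transformers loader:
for (name_new, p_new), (name_old, p_old) in zip(bert_new.named_parameters(),
                                                bert_old.named_parameters()):
    assert torch.allclose(p_new, p_old), f"mismatch: {name_new} vs {name_old}"
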