def train_model_bert(args):
    # Rebuild the config with a device option so training can run on a different CUDA device.
    config = BertConfig.from_pretrained(args.folder_model)
    config = config.to_dict()
    config.update({"device": args.device})
    config.update({"use_pooler": args.use_pooler})
    config.update({"weight_class": args.weight_class})
    config.update({"output_hidden_states": args.output_hidden_states})
    config = BertConfig.from_dict(config)

    tokenizer = BertTokenizer.from_pretrained(args.folder_model)
    model = BERTQa.from_pretrained(args.folder_model, config=config)
    model = model.to(args.device)

    train_squad(args, tokenizer, model)
def __init__(self, config, bertmodel):
    super(Parser, self).__init__()
    self.config = config

    # Build and load the BERT G2G model.
    bertconfig = BertConfig.from_pretrained(
        config.main_path + "/model" + "/model_" + config.modelname + '/config.json')
    bertconfig.num_hidden_layers = config.n_attention_layer
    bertconfig.label_size = config.n_rels
    bertconfig.layernorm_value = config.layernorm_value
    bertconfig.layernorm_key = config.layernorm_key

    if self.config.input_graph:
        self.bert = BertGraphModel(bertconfig)
    else:
        self.bert = BertBaseModel(bertconfig)
    self.bert.load_state_dict(bertmodel.state_dict(), strict=False)

    self.mlp = Classifier(3 * bertconfig.hidden_size, bertconfig.hidden_size, config.n_trans)
    self.mlp_rel = Classifier(2 * bertconfig.hidden_size, bertconfig.hidden_size, config.n_rels)

    self.pad_index = config.pad_index
    self.unk_index = config.unk_index
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path):
    config = BertConfig.from_json_file(bert_config_file)
    model = BertForPreTraining(config)
    load_tf_weights_in_bert(model, config, tf_checkpoint_path)
    torch.save(model.state_dict(), pytorch_dump_path)
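# A minimal usage sketch for the converter above (not from the original source);
# the checkpoint/config paths below are placeholders for illustration only.
convert_tf_checkpoint_to_pytorch(
    tf_checkpoint_path="./uncased_L-12_H-768_A-12/bert_model.ckpt",
    bert_config_file="./uncased_L-12_H-768_A-12/bert_config.json",
    pytorch_dump_path="./pytorch_model.bin",
)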
def get_model():
    bert_config = BertConfig.from_json_file(self.config_path)
    bert_config.type_vocab_size = 3
    bert_config.eos_token_id = self.tokenizer.token_to_id('[SEP]')
    model = GenLM(bert_config)
    if not is_predict:
        load_tf_weights_in_bert(model, self.checkpoint_path)
    # model = keras.models.Model(model.inputs, model.outputs)
    return model
def predict(predict_model_name_or_path, pre_data, pre_dataloader):
    print('Running prediction')
    pro = processer()
    labellist = pro.get_labels()

    # ***** Load the model *****
    print('Loading model')
    model = BertForSequenceClassification
    config = BertConfig.from_pretrained(predict_model_name_or_path, num_labels=len(labellist))
    model = model.from_pretrained(predict_model_name_or_path, config=config)

    print('Moving model to GPU or CPU')
    # If a GPU is available, use it; otherwise fall back to the CPU.
    if torch.cuda.is_available():
        # Single-GPU computation.
        torch.cuda.set_device(0)
        device = torch.device('cuda', 0)  # Select GPU device 0.
    else:
        device = torch.device('cpu')
    model.to(device)

    print('******** Running prediction ********')
    print("  Num examples = %d" % len(pre_data))

    preds = None
    pbar = ProgressBar(n_total=len(pre_dataloader), desc="Predicting")

    # *** Run prediction batch by batch ***
    for step, batch in enumerate(pre_dataloader):
        model.eval()
        batch = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            inputs = {
                'input_ids': batch[0],
                'token_type_ids': batch[2],
                'attention_mask': batch[1],
                'labels': batch[3]
            }
            outputs = model(**inputs)
            _, logits = outputs[:2]

        # *** Aggregate the predictions from each batch ***
        if preds is None:
            preds = logits.softmax(-1).detach().cpu().numpy()
        else:
            preds = np.append(preds, logits.softmax(-1).detach().cpu().numpy(), axis=0)
        pbar(step)

    predict_label = np.argmax(preds, axis=1)
    print(preds)
    print(predict_label)
    return preds, predict_label
def __init__(self):
    config = BertConfig.from_json_file(join(BERT_PATH, 'bert_config.json'))
    self.tokenizer = BertTokenizer(vocab_file=join(BERT_PATH, 'vocab.txt'))
    self.model = BertModel(config, add_pooling_layer=False)
    load_tf_weights_in_bert(self.model,
                            tf_checkpoint_path=join(BERT_PATH, 'bert_model.ckpt'),
                            strip_bert=True)
    self.model.to(PT_DEVICE)
    self.model.eval()
def __init__(self, config=None, *args, **kwargs):
    super().__init__(*args, **kwargs)
    if config is None:
        from transformers.configuration_bert import BertConfig
        config = BertConfig.from_pretrained('bert-base-uncased')
    assert config.hidden_size == self.in_dim

    from transformers.modeling_bert import BertPredictionHeadTransform
    self.module = nn.Sequential(
        nn.Dropout(config.hidden_dropout_prob),
        BertPredictionHeadTransform(config),
        nn.Linear(self.in_dim, self.out_dim),
    )
def __init__(self, embeddings, device):
    super(Model, self).__init__()
    self.device = device

    cid_emb_size = embeddings[0].shape[1]
    creative_id_embedding = nn.Embedding(embeddings[0].shape[0], cid_emb_size)
    creative_id_embedding.weight.data.copy_(torch.from_numpy(embeddings[0]))
    creative_id_embedding.weight.requires_grad = False
    self.creative_id_embedding = creative_id_embedding

    aid_emb_size = embeddings[1].shape[1]
    ad_id_embedding = nn.Embedding(embeddings[1].shape[0], aid_emb_size)
    ad_id_embedding.weight.data.copy_(torch.from_numpy(embeddings[1]))
    ad_id_embedding.weight.requires_grad = False
    self.ad_id_embedding = ad_id_embedding

    adv_emb_size = embeddings[2].shape[1]
    advertiser_id_embedding = nn.Embedding(embeddings[2].shape[0], adv_emb_size)
    advertiser_id_embedding.weight.data.copy_(torch.from_numpy(embeddings[2]))
    advertiser_id_embedding.weight.requires_grad = False
    self.advertiser_id_embedding = advertiser_id_embedding

    pid_emb_size = embeddings[3].shape[1]
    product_id_embedding = nn.Embedding(embeddings[3].shape[0], pid_emb_size)
    product_id_embedding.weight.data.copy_(torch.from_numpy(embeddings[3]))
    product_id_embedding.weight.requires_grad = False
    self.product_id_embedding = product_id_embedding

    hidden_size = cid_emb_size + aid_emb_size + adv_emb_size + pid_emb_size

    # Transformer encoder
    config = BertConfig(num_hidden_layers=3, num_attention_heads=8, hidden_size=hidden_size,
                        layer_norm_eps=1e-12, hidden_dropout_prob=0.2,
                        attention_probs_dropout_prob=0.2, hidden_act='mish')
    self.config = config
    self.bert_encoder = BertEncoder(config)

    # DNN layers
    self.linears = nn.Sequential(nn.Linear(config.hidden_size, 1024), Mish(), nn.BatchNorm1d(1024),
                                 nn.Linear(1024, 256), Mish(), nn.BatchNorm1d(256),
                                 nn.Linear(256, 64), Mish(), nn.BatchNorm1d(64),
                                 nn.Linear(64, 16), Mish(), nn.BatchNorm1d(16),
                                 nn.Dropout(0.1))

    # Output heads
    self.age_output = nn.Linear(16, 10)
    self.gender_output = nn.Linear(16, 2)
def __init__(self, is_predict=False):
    super().__init__()
    config = BertConfig.from_json_file(join(BERT_PATH, 'bert_config.json'))
    self.bert = BertModel(config, add_pooling_layer=True)
    self.tokenizer = self.get_tokenizer()
    if not is_predict:
        load_tf_weights_in_bert(self.bert,
                                tf_checkpoint_path=join(BERT_PATH, 'bert_model.ckpt'),
                                strip_bert=True)
    self.cls = torch.nn.Linear(768, 2)
    self.save_dir = join(MODEL_PATH, 'consistent')
    if not os.path.isdir(self.save_dir):
        os.makedirs(self.save_dir)
    self.save_path = join(self.save_dir, 'trained.pt')
def __init__(self, in_dim=768, out_dim=2, config=None, *args, **kwargs):
    super().__init__()
    from transformers.models.bert.modeling_bert import BertPredictionHeadTransform

    if config is None:
        from transformers.configuration_bert import BertConfig
        config = BertConfig.from_pretrained("bert-base-uncased")
    assert config.hidden_size == in_dim

    self.module = nn.Sequential(
        nn.Dropout(config.hidden_dropout_prob),
        BertPredictionHeadTransform(config),
        nn.Linear(in_dim, out_dim),
    )
def __init__(self, device, serial_model_path, par_model_path):
    self.device = device
    pretrained_path = 'cl-tohoku/bert-base-japanese-whole-word-masking'
    self.tokenizer = BertTokenizer.from_pretrained(pretrained_path, do_lower_case=False)

    config = BertConfig.from_pretrained(pretrained_path)
    config.num_labels = 4
    self.serial_model = BertForSequenceClassification(config)
    config.num_labels = 2
    self.par_model = BertForSequenceClassification(config)

    self.serial_model.load_state_dict(torch.load(serial_model_path))
    self.serial_model.to(self.device)
    self.serial_model.eval()

    self.par_model.load_state_dict(torch.load(par_model_path))
    self.par_model.to(self.device)
    self.par_model.eval()
def classify(fname: str, verbose: bool = False):
    '''
    Returns a 1-dimensional numpy array of predictions.
    Predictions for labels 0, -1, 1 are indexed at 0, 1, 2,
    so when reading the returned array:
    0 = 'Neutral', 1 = 'Deny', 2 = 'Favor'
    '''
    tokenizer = BertTokenizer('../models/BERT-vocab1.dms')
    config = BertConfig.from_json_file('../models/BERT-config0.json')
    model = TFBertForSequenceClassification.from_pretrained('../models/BERT-transfer1/', config=config)

    # BATCH_SIZE = 64
    feat_spec = {
        'idx': tf.io.FixedLenFeature([], tf.int64),
        'sentence': tf.io.FixedLenFeature([], tf.string),
        'label': tf.io.FixedLenFeature([], tf.int64)
    }

    def parse_ex(ex_proto):
        return tf.io.parse_single_example(ex_proto, feat_spec)

    tweets = tf.data.TFRecordDataset(fname)
    tweets = tweets.map(parse_ex)

    # with open('data/tweet_info.json') as j_file:
    #     data_info = json.load(j_file)
    # num_samples = data_info['DF_length']

    eval_df = glue_convert_examples_to_features(examples=tweets, tokenizer=tokenizer,
                                                max_length=128, task='sst-2',
                                                label_list=['0', '-1', '1'])
    eval_df = eval_df.batch(64)

    y_preds = model.predict(eval_df, use_multiprocessing=True, verbose=verbose)
    y_preds_sm = tf.nn.softmax(y_preds)
    y_preds_argmax = tf.math.argmax(y_preds_sm, axis=1)

    return y_preds_argmax.numpy()
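# Illustrative call of classify() above (not part of the original snippet); the .tfrecord
# path is borrowed from a later snippet in this collection and may not match the data
# actually used with this function.
preds = classify('../data/prelabeled/test47_even.tfrecord', verbose=True)
print(preds.shape, preds[:10])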
def __init__(self, pretrained_model_dir, num_classes, segment_len=200, overlap=50, dropout_p=0.5):
    super(BertLSTMWithOverlap, self).__init__()
    self.seg_len = segment_len
    self.overlap = overlap

    self.config = BertConfig.from_json_file(pretrained_model_dir + 'bert_config.json')
    self.bert = BertModel.from_pretrained(pretrained_model_dir, config=self.config)
    # `feature_extract` is expected to be defined in the enclosing module.
    if feature_extract:
        # Transfer learning: freeze BERT and use it as a feature extractor.
        for p in self.bert.parameters():
            p.requires_grad = False

    d_model = self.config.hidden_size  # 768

    self.bi_lstm2 = torch.nn.LSTM(input_size=d_model, hidden_size=d_model // 2,
                                  bidirectional=True, batch_first=True)

    self.attn_weights2 = torch.nn.Sequential(
        # sent_attn_energy: [b, num_seg, 768] => [b, num_seg, 768]
        torch.nn.Linear(d_model, d_model),
        torch.nn.Tanh(),
        # sent_attn_weights: [b, num_seg, 768] => [b, num_seg, 1]
        torch.nn.Linear(d_model, 1, bias=False),
        torch.nn.Softmax(dim=1),  # [b, num_seg, 1]
    )

    self.fc = torch.nn.Sequential(torch.nn.Dropout(p=dropout_p),
                                  torch.nn.Linear(d_model, num_classes))
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument("--data_dir", default='./data/input/', type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--bert_model", default='bert-base-chinese', type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-large-cased, "
                             "bert-base-multilingual-uncased, bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--config_file", default='bert-base-chinese', type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-large-cased, "
                             "bert-base-multilingual-uncased, bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name", default='xgfy', type=str, required=True,
                        help="The name of the task to train.")
    parser.add_argument("--vacab_root", default='./data/model/', type=str, required=True,
                        help="The directory where the vocab file is saved.")
    parser.add_argument("--output_dir", default='./data/output/', type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument("--weight_name", default='net_weight_1.bin', type=str)
    parser.add_argument("--config_name", default='config_name_1.bin', type=str)

    # Other parameters
    parser.add_argument("--cache_dir", default="./data/model/", type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.")
    parser.add_argument("--log_frq", default=50, type=int)
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=1, type=int,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--n_warmup", default=1000, type=int,
                        help="step of training to perform linear learning rate warmup for.")
    parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42, help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--parall', action='store_true')
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")

    args = parser.parse_args()

    # COVID-19 (xgfy) task
    processors = {
        "xgfy": SimProcessor
    }
    num_labels_task = {
        "xgfy": 2,
    }

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda:0" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        # torch.distributed.init_process_group(backend='nccl')

    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    # if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
    #     raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    # if not os.path.exists(args.output_dir):
    #     os.makedirs(args.output_dir)

    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]
    num_labels = num_labels_task[task_name]
    label_list = processor.get_labels()

    tokenizer = BertTokenizer.from_pretrained(args.vacab_root, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    # Prepare model
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE),
                                                                   'distributed_{0}')
    # cache_dir = args.cache_dir if args.cache_dir else os.path.join(PYTORCH_PRETRAINED_BERT_CACHE,
    #                                                                'distributed_{}'.format(str(args.local_rank)))
    config = BertConfig.from_pretrained(args.config_file, num_labels=num_labels)
    model = BertForSequenceClassification.from_pretrained(args.bert_model, config=config, cache_dir=cache_dir)

    if args.fp16:
        model.half()
    model.to(device)

    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        model = DDP(model)
    elif n_gpu > 1 and args.parall:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
    else:
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0

    if args.do_train:
        train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
        scheduler = get_linear_schedule_with_warmup(optimizer,
                                                    num_warmup_steps=args.n_warmup,
                                                    num_training_steps=t_total)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss, _ = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids)

                if n_gpu > 1 and args.parall:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # Modify learning rate with the special warm up BERT uses.
                        # If args.fp16 is False, the optimizer/scheduler handles this automatically.
                        # NOTE: lr_this_step is not defined in this snippet; it must be computed
                        # by the warmup schedule in the surrounding code.
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    scheduler.step()
                    optimizer.zero_grad()
                    global_step += 1
                    if (global_step) % args.log_frq == 0:
                        logger.info("TrLoss: {:.2f} | Loss: {:.2f} | Lr: {:.2f}".format(
                            tr_loss, loss.item(), scheduler.get_lr()[0]))

    if args.do_train:
        # Save a trained model and the associated configuration
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
        output_model_file = os.path.join(args.output_dir, args.weight_name)
        torch.save(model_to_save.state_dict(), output_model_file)
        output_config_file = os.path.join(args.output_dir, args.config_name)
        with open(output_config_file, 'w') as f:
            f.write(model_to_save.config.to_json_string())

        # Load a trained model and config that you have fine-tuned
        config = BertConfig.from_json_file(output_config_file)
        model = BertForSequenceClassification(config)
        model.load_state_dict(torch.load(output_model_file))
    else:
        output_model_file = os.path.join(args.output_dir, args.weight_name)
        output_config_file = os.path.join(args.output_dir, args.config_name)
        config = BertConfig.from_json_file(output_config_file)
        model = BertForSequenceClassification(config)
        model.load_state_dict(torch.load(output_model_file))
        # model = BertForSequenceClassification.from_pretrained(args.bert_model)

    model.to(device)

    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        eval_examples = processor.get_dev_examples(args.data_dir)
        eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)

        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0

        for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                tmp_eval_loss, logits = model(input_ids, token_type_ids=segment_ids,
                                              attention_mask=input_mask, labels=label_ids)

            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(logits, label_ids)

            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples
        loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {'eval_loss': eval_loss,
                  'eval_accuracy': eval_accuracy,
                  'global_step': global_step,
                  'loss': loss}
        logger.info(result)
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        outputs = (start_logits, end_logits,) + outputs[2:]
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # Sometimes the start/end positions are outside our model inputs; we ignore these terms.
            ignored_index = start_logits.size(1)
            start_positions.clamp_(0, ignored_index)
            end_positions.clamp_(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2
            outputs = (total_loss,) + outputs

        return outputs  # (loss), start_logits, end_logits, (hidden_states), (attentions)


if __name__ == '__main__':
    from transformers import BertConfig

    config = BertConfig()
    config.bitW = 8
    config.layer_name_list = []
    encoder = BertEncoder(config=config)
def init_encoder(cls, args, dropout: float = 0.1):
    cfg = BertConfig.from_pretrained("bert-base-uncased")
    if dropout != 0:
        cfg.attention_probs_dropout_prob = dropout
        cfg.hidden_dropout_prob = dropout
    return cls.from_pretrained("bert-base-uncased", config=cfg)
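# Hedged sketch (not from the original source): the same dropout-adjusted config built
# directly, without the enclosing classmethod. Assumes `BertConfig` and `BertModel` are
# imported from `transformers`; variable names are illustrative.
cfg = BertConfig.from_pretrained("bert-base-uncased")
cfg.attention_probs_dropout_prob = 0.1
cfg.hidden_dropout_prob = 0.1
encoder = BertModel.from_pretrained("bert-base-uncased", config=cfg)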
def train(args, model_name_or_path, train_data, train_dataloader, valid_data, valid_dataloader):
    pro = processer()
    labellist = pro.get_labels()
    trainloss = TrainLoss()

    # ***** Load the model *****
    model = BertForSequenceClassification
    config = BertConfig.from_pretrained(model_name_or_path, num_labels=len(labellist))
    model = model.from_pretrained(model_name_or_path, config=config)

    # ***** Move the model to the device *****
    if torch.cuda.is_available():
        # Single-GPU computation.
        torch.cuda.set_device(0)
        device = torch.device('cuda', 0)  # Select GPU device 0.
    else:
        device = torch.device('cpu')
    model.to(device)

    # ***** Optimizer and learning-rate schedule *****
    t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
    warmup_steps = int(t_total * args.warmup_proportion)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=t_total)

    # ***** Training info *****
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_data))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.train_batch_size)
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    # ***** Start training *****
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    seed_everything(args.seed)
    for num in range(args.num_train_epochs):
        train_all_steps = 0
        train_steps = []
        train_losses = []
        global_step = 0

        logger.info(f'****************Train epoch-{num}****************')
        pbar = ProgressBar(n_total=len(train_dataloader), desc='Train')
        for step, batch in enumerate(train_dataloader):
            # Record the step index for the loss curve.
            train_all_steps += 1
            train_steps.append(train_all_steps)

            model.train()

            # Feed the batch through the model.
            batch = tuple(t.to(device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'token_type_ids': batch[2],
                'labels': batch[3]
            }
            # The model already applies the loss function to the outputs and labels,
            # so the returned outputs include the loss value.
            outputs = model(**inputs)

            # Backpropagate the loss.
            loss = outputs[0]
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)  # Gradient clipping.

            # Record the loss for the loss curve.
            train_losses.append(loss.detach().cpu().numpy())

            # Optimizer step.
            pbar(step, {'loss': loss.item()})
            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()   # Optimizer update.
                scheduler.step()   # Learning-rate schedule update.
                model.zero_grad()
                global_step += 1

        # Save a checkpoint after each training epoch.
        output_dir = os.path.join(args.output_dir, f'model_checkpoint_epoch_{num}')
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        print('')  # Keep log output from running together on one line.
        # logger.info(f'save model checkpoint-{global_step} to {output_dir} ')
        model.save_pretrained(output_dir)  # Save the model.

        # Plot a loss curve for each training epoch.
        trainloss.train_loss(steps=train_steps, losses=train_losses, epoch=num, args=args,
                             type='train', max_step=train_all_steps)

        # ***** Validate after each training epoch *****
        print('')
        logger.info(f'****************Valid epoch-{num}****************')
        logger.info("  Num examples = %d", len(valid_data))
        logger.info("  Batch size = %d", args.valid_batch_size)
        valid_steps, valid_losses, valid_all_steps = valid(args=args, model=model, device=device,
                                                           valid_data=valid_data,
                                                           valid_dataloader=valid_dataloader)
        trainloss.train_loss(steps=valid_steps, losses=valid_losses, epoch=num, args=args,
                             type='valid', max_steps=valid_all_steps)

        # Empty the CUDA cache after each epoch.
        if 'cuda' in str(device):
            torch.cuda.empty_cache()
# In[ ]:

import numpy as np
import json

from sklearn.model_selection import train_test_split
import tensorflow as tf

from transformers import BertTokenizer, TFBertForSequenceClassification, glue_convert_examples_to_features
from transformers.configuration_bert import BertConfig

# In[ ]:

tokenizer = BertTokenizer('../models/BERT-vocab1.dms')
config = BertConfig.from_json_file('../models/BERT-config0.json')
model = TFBertForSequenceClassification.from_pretrained('../models/BERT-transfer1', config=config)

# In[ ]:

fname = '../data/prelabeled/test47_even.tfrecord'
# BATCH_SIZE = 64

feat_spec = {
    'idx': tf.io.FixedLenFeature([], tf.int64),
    'sentence': tf.io.FixedLenFeature([], tf.string),
    'label': tf.io.FixedLenFeature([], tf.int64)
}
def main():
    parser = ArgumentParser()
    parser.add_argument("--model_name", type=str, required=True,
                        choices=["GMMBert", "LogBert", "ExpBert", "FlowBert", "DisBert"])
    parser.add_argument("--dataset", type=str, required=True, choices=["fin-all", "fin-dol", "sci-doc"])
    parser.add_argument('--saved_checkpoint', type=str, default=None, required=False)
    parser.add_argument("--bert_model", type=str, default='bert-base-uncased',
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    parser.add_argument('--do_lower_case', type=str_to_bool, default=True, help="Lower case the text and model.")
    parser.add_argument('--do_pretrain', type=str_to_bool, default=True, help="Use a pretrained Bert Parameters.")
    parser.add_argument('--do_pretrain_wpe', type=str_to_bool, default=True,
                        help="Use a pretrained Bert Parameters only for wpe embeddings")
    parser.add_argument('--log_criterion', type=str, default='L1', choices=["L1", "L2", ''],
                        help="Loss function to use for LogBert")
    parser.add_argument('--do_gmm', type=str_to_bool, default=False,
                        help="Use the Gaussian mixture model components.")
    parser.add_argument('--do_log', type=str_to_bool, default=False, help="Do L2 over the numbers in logspace")
    parser.add_argument('--do_dis', type=str_to_bool, default=False, help="Discriminative baseline")
    parser.add_argument('--do_anomaly', type=str_to_bool, default=True, help="Do anomaly evaluation")
    parser.add_argument('--do_exp', type=str_to_bool, default=False, help="Latent Exponent Model")
    parser.add_argument('--exp_truncate', type=str_to_bool, default=True, help="Use a truncated normal distribution.")
    parser.add_argument('--do_flow', type=str_to_bool, default=False, help="Do flow over the numbers in logspace")
    parser.add_argument('--flow_criterion', type=str, default='L1', choices=["L1", "L2", ''],
                        help="Loss function to use for 'Flow'Bert")
    parser.add_argument('--flow_v', type=str, default='', choices=['1a', '1b', '2a', '2b', ''],
                        help="Mode for 'Flow'Bert")
    parser.add_argument('--flow_fix_mu', type=str_to_bool, default=False, help="Use a fixed mu for flow model")
    parser.add_argument("--flow_scale", type=float, default=10.0)
    parser.add_argument("--exp_logvar_scale", type=float, default=-5.0)
    parser.add_argument("--exp_logvar", type=str_to_bool, default=False)
    parser.add_argument("--drop_rate", type=float, default=0.0, help='Droprate of 0 is no droprate')
    parser.add_argument("--do_eval", type=str_to_bool, default=False)
    parser.add_argument("--do_test", type=str_to_bool, default=False)
    parser.add_argument("--reduce_memory", action="store_true",
                        help="Store training data as on-disc memmaps to massively reduce memory usage")
    parser.add_argument("--patience", type=int, default=3, help="Number of early stop epochs patience")
    parser.add_argument("--epochs", type=int, default=10, help="Number of epochs to train for")
    parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=512, type=int, help="Total batch size for evaluation.")
    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
    parser.add_argument("--lr_bert", default=3e-5, type=float,
                        help="The initial learning rate for Adam for bert params")
    parser.add_argument("--lr_mlp", default=3e-5, type=float)
    parser.add_argument("--weight_decay", default=0.01, type=float, help="Adam's weight l2 regularization")
    parser.add_argument("--clip_grad", default=5, type=float, help="Gradient clipping threshold.")
    parser.add_argument('--seed', type=int, default=42, help="random seed for initialization")
    parser.add_argument('--gmm_crossentropy', type=str_to_bool, default=False, help="GMM Crossentropy.")
    parser.add_argument('--gmm_exponent', type=str_to_bool, default=True, help="Instead of Kernels use powers of 10")
    parser.add_argument('--gmm_nmix', type=int, default=31,
                        help="number of mixtures used only for gmm. [1,3,7,15,31,63,127,255,511]")
    parser.add_argument('--optim', type=str, default='sgd', choices=['sgd', 'adam'], help="Optimizer to use.")
    parser.add_argument('--min_exponent', type=int, default=-1, help="min exponent size")
    parser.add_argument('--max_exponent', type=int, default=16, help="max exponent size")
    parser.add_argument('--n_exponent', type=int, default=17, help="sum of min and max")
    parser.add_argument('--embed_exp', type=str_to_bool, default=False, help="Learn an input exponent embedding")
    parser.add_argument('--embed_exp_opt', type=str, default='high', choices=['low', 'high', ''],
                        help="high or low learning rate for embeddings")
    parser.add_argument('--embed_digit', type=str_to_bool, default=False,
                        help="Learn in input embedding of numbers using LSTM over digits")
    parser.add_argument('--output_embed_exp', type=str_to_bool, default=False,
                        help="Learn in input embedding and attach after Bert")
    parser.add_argument('--zero_init', type=str_to_bool, default=False,
                        help="Start non pretrained embeddings at zero")
    parser.add_argument("--n_digits", type=int, default=14, help="Size of digit vocab includes e.+-")
    parser.add_argument("--ez_digits", type=int, default=32, help="Digit embedding size")

    args = parser.parse_args()

    args.pregenerated_data = Path(PREGENERATED_DATA[args.dataset])
    args.output_dir = Path(f'{CHECKPOINT_PATH}/{args.dataset}')
    sanity_check(args)
    args.savepath = args.output_dir

    if args.saved_checkpoint is not None:
        args.output_dir = Path(args.saved_checkpoint)
        args.run_name = args.output_dir.stem
        num_data_epochs = 1
    else:
        args.output_dir, args.run_name = build_savepath(args)

    print('dataset', args.dataset)
    print('output_dir', args.output_dir)
    print('pregenerated_data', args.pregenerated_data)
    print('run_name', args.run_name)

    wandb.init(project="mnm-paper", name=f'{args.run_name}')
    wandb.config.update(args, allow_val_change=True)

    samples_per_epoch = []
    for i in range(args.epochs):
        epoch_file = args.pregenerated_data / f"train_epoch_{i}.json"
        metrics_file = args.pregenerated_data / f"train_epoch_{i}_metrics.json"
        if epoch_file.is_file() and metrics_file.is_file():
            metrics = json.loads(metrics_file.read_text())
            samples_per_epoch.append(metrics['num_training_examples'])
        else:
            if i == 0:
                print(f'epoch_file:{epoch_file}')
                exit("No training data was found!")
            print(f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs}).")
            print("This script will loop over the available data, but training diversity may be negatively impacted.")
            num_data_epochs = i
            break
    else:
        num_data_epochs = args.epochs

    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    n_gpu = torch.cuda.device_count()
    logging.info("device: {} n_gpu: {}".format(device, n_gpu))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if args.output_dir.is_dir() and list(args.output_dir.iterdir()):
        logging.warning(f"Output directory ({args.output_dir}) already exists and is not empty!")
    args.output_dir.mkdir(parents=True, exist_ok=True)

    total_train_examples = 0
    for i in range(args.epochs):
        # The modulo takes into account the fact that we may loop over limited epochs of data
        total_train_examples += samples_per_epoch[i % len(samples_per_epoch)]

    num_train_optimization_steps = int(
        total_train_examples / args.train_batch_size / args.gradient_accumulation_steps)

    # Prepare model
    NumberBertModel = get_model(args)

    if args.do_test:
        best_model, tokenizer, best_path = load_best(args)
        global_step = 0
        train_numbers = get_numbers_from_split(args, tokenizer, device, num_data_epochs, split='train')
        train_mean, train_median = np.mean(train_numbers), np.median(train_numbers)
        best_model.to(device)
        best_model.eval()
        if args.do_dis:
            test_metrics = evaluate_discriminative(args, best_model, tokenizer, device, global_step, 'test',
                                                   train_mean, train_median, train_numbers)
        else:
            test_metrics = evaluation(args, best_model, tokenizer, device, global_step, 'test',
                                      train_mean, train_median, train_numbers)
        save_results(best_path, test_metrics)
        save_args(best_path, args)
        return

    early_stopper = EarlyStopping('valid_one_loss', min_delta=0.0, patience=args.patience, monitor_mode='min')

    if args.saved_checkpoint is not None:
        print('args.saved_checkpoint', args.saved_checkpoint)
        tokenizer = BertNumericalTokenizer.from_pretrained(args.saved_checkpoint)
        model = NumberBertModel.from_pretrained(args.saved_checkpoint, args=args)
        # uncomment this
        train_numbers = get_numbers_from_split(args, tokenizer, device, num_data_epochs, split='train')
    else:
        tokenizer = BertNumericalTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
        train_numbers = get_numbers_from_split(args, tokenizer, device, num_data_epochs, split='train')
        # old_save_dir = None

        if args.do_pretrain:
            model = NumberBertModel.from_pretrained(args.bert_model, args=args)
        else:
            config = BertConfig.from_json_file('./bert-base-uncased-config.json')
            model = NumberBertModel(config, args)
            if args.do_pretrain_wpe:
                pre_model = NumberBertModel.from_pretrained(args.bert_model, args=args)
                pretrained_dict = pre_model.state_dict()
                # print('pretrained_dict', pretrained_dict)
                # 1. keep only embedding weights that also exist in the current model
                pretrained_dict = {k: v for k, v in pretrained_dict.items() if 'embedding' in k}
                model_dict = model.state_dict()
                pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}
                # 2. overwrite entries in the existing state dict
                model_dict.update(pretrained_dict)
                # 3. load the new state dict
                model.load_state_dict(model_dict)

    if args.do_gmm:
        kernel_locs, kernel_scales = get_gmm_components(args, train_numbers)
        model.set_kernel_locs(kernel_locs, kernel_scales)

    special_tokens_dict = {'additional_special_tokens': ('[UNK_NUM]',)}
    num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
    print('We have added', num_added_toks, 'tokens')
    model.resize_token_embeddings(len(tokenizer))
    # model.set_params(args)

    def set_dropout(model, drop_rate):
        for name, child in model.named_children():
            if isinstance(child, torch.nn.Dropout):
                child.p = drop_rate
            set_dropout(child, drop_rate=drop_rate)

    set_dropout(model, drop_rate=args.drop_rate)

    wandb.watch(model, log="all")
    model.to(device)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    optimizer_grouped_parameters = set_lr(args, param_optimizer)

    if args.optim == 'sgd':
        optimizer = torch.optim.SGD(optimizer_grouped_parameters, lr=args.lr_bert)
    elif args.optim == 'adam':
        optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.lr_bert, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps,
                                                num_training_steps=num_train_optimization_steps)

    global_step = 0
    logging.info("***** Running training *****")
    logging.info(f"  Num examples = {total_train_examples}")
    logging.info("  Batch size = %d", args.train_batch_size)
    logging.info("  Num steps = %d", num_train_optimization_steps)

    train_mean, train_median = np.mean(train_numbers), np.median(train_numbers)

    if args.do_eval:
        model.eval()
        if args.do_dis:
            train_epoch_metrics = evaluate_discriminative(args, model, tokenizer, device, global_step, 'train',
                                                          train_mean, train_median, train_numbers)
            valid_epoch_metrics = evaluate_discriminative(args, model, tokenizer, device, global_step, 'valid',
                                                          train_mean, train_median, train_numbers)
        else:
            # evaluation(args, model, tokenizer, device, global_step, 'train', train_mean, train_median, train_numbers)
            # valid_epoch_metrics = evaluation(args, model, tokenizer, device, global_step, 'valid',
            #                                  train_mean, train_median, train_numbers)
            # EMNLP FINAL
            test_metrics = evaluation(args, model, tokenizer, device, global_step, 'test',
                                      train_mean, train_median, train_numbers)
        return

    model.train()
    global_step = train_loop(args, model, optimizer, scheduler, tokenizer, device, optimizer_grouped_parameters,
                             early_stopper, train_numbers, train_mean, train_median, global_step, n_gpu,
                             num_data_epochs)

    del model
    best_model, tokenizer, best_path = load_best(args)
    best_model.to(device)
    best_model.eval()
    if args.do_dis:
        test_metrics = evaluate_discriminative(args, best_model, tokenizer, device, global_step, 'test',
                                               train_mean, train_median, train_numbers)
    else:
        test_metrics = evaluation(args, best_model, tokenizer, device, global_step, 'test',
                                  train_mean, train_median, train_numbers)
    save_results(best_path, test_metrics)
    save_args(best_path, args)

    # flush check
    wandb.log({})
        end_logits = end_logits.squeeze(-1)

        outputs = (start_logits, end_logits,) + outputs[2:]
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # Sometimes the start/end positions are outside our model inputs; we ignore these terms.
            ignored_index = start_logits.size(1)
            start_positions.clamp_(0, ignored_index)
            end_positions.clamp_(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2
            outputs = (total_loss,) + outputs

        return outputs  # (loss), start_logits, end_logits, (hidden_states), (attentions)


if __name__ == '__main__':
    from transformers import BertConfig

    config = BertConfig()
    encoder = BertEncoder(config=config)
val_clean_ds = val_parse_ds
test_clean_ds = test_parse_ds

with open('data/info.json') as json_file:
    data_info = json.load(json_file)

train_examples = data_info['train_length']
valid_examples = data_info['validation_length']
test_examples = data_info['test_length']

USE_XLA = False
USE_AMP = False
tf.config.optimizer.set_jit(USE_XLA)
tf.config.optimizer.set_experimental_options({"auto_mixed_precision": USE_AMP})

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
config = BertConfig.from_json_file("bert_config.json")
model = TFBertForSequenceClassification.from_pretrained('bert-base-cased', config=config)

# Training dataset
train_dataset = glue_convert_examples_to_features(examples=tr_clean_ds, tokenizer=tokenizer,
                                                  max_length=128, task='sst-2', label_list=['1', '3'])
train_dataset = train_dataset.shuffle(train_examples).batch(BATCH_SIZE).repeat(-1)

# Validation dataset
valid_dataset = glue_convert_examples_to_features(examples=val_clean_ds, tokenizer=tokenizer,
path_input_test_data = "../module_dataset/dataset/dataset_preprocess/pair_sequence/test_data/" \
                       "private_test_pair_without_punc.csv"
no_cuda = False
n_gpu = 1
device = "cuda:0"
seed = 42
max_seq_length = 400
max_query_length = 64
weight_class = [1, 1]

args = Args()
device = torch.device(args.device)

tokenizer = BertTokenizer.from_pretrained(args.folder_model, do_lower_case=args.do_lower_case)
config = BertConfig.from_pretrained(args.folder_model)

# Customize some parameters for the custom BERT config.
config = config.to_dict()
config.update({"device": args.device})
config = BertConfig.from_dict(config)

model = BERTQa.from_pretrained(args.folder_model, config=config)
model = model.to(device)

get_predict_dl(model, tokenizer, args)
def loss(self, input_ids, attention_mask, token_type_ids, label):
    target = label
    final_output = self.compute(input_ids, attention_mask, token_type_ids)

    if self.use_pooler:
        logits = self.qa_outputs(final_output)
    else:
        logits = self.qa_outputs_cat(final_output)

    class_weights = torch.FloatTensor(self.weight_class).to(self.device)
    loss = F.cross_entropy(logits, target, weight=class_weights)

    predict_value = torch.max(logits, 1)[1]
    list_predict = predict_value.cpu().numpy().tolist()
    list_target = target.cpu().numpy().tolist()
    return loss, list_predict, list_target


if __name__ == '__main__':
    from transformers.configuration_bert import BertConfig

    config = BertConfig.from_pretrained("bert-base-multilingual-uncased",
                                        cache_dir="../resources/cache_model")
    config = config.to_dict()
    config.update({"weight_class": [1, 1]})
    config = BertConfig.from_dict(config)
    # model = BERTQa.from_pretrained("bert-base-multilingual-uncased",
    #                                cache_dir="../resources/cache_model", config=config)
#%%
import torch
from torch.optim import Adam

from transformers.configuration_albert import AlbertConfig
from transformers.configuration_bert import BertConfig

from src.dataloader.Dataset import EETaskDataloader
from src.dataloader.utils import load_data
from src.model.AlbertCRF import AlbertCrfForNer
from src.model.BertCRF import BertCrfForNer
from src.model.BertSoftMax import BertSoftmaxForNer
from src.util.EETaskRun import Run
from src.util.extract_arguments import extract_arguments_crf, extract_arguments_softmax
from src.util.utils import lcs

#%%
config = BertConfig.from_pretrained(
    r"/home/longred/lic2020_baselines/chinese_L-12_H-768_A-12/bert-base-chinese-config.json")
config.pretrained_path = r"/home/longred/lic2020_baselines/chinese_L-12_H-768_A-12/bert-base-chinese-pytorch_model.bin"
config.vocab_path = r"/home/longred/lic2020_baselines/chinese_L-12_H-768_A-12/vocab.txt"
config.train_data_path = r"/home/longred/EETask/data/train.json"
config.batch_size = 32
config.event_schema_path = r"/home/longred/EETask/data/event_schema.json"

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

EE = EETaskDataloader(config)
train_loader = EE.get_train_data_loader()
config.num_labels = EE.num_labels
config.label2id = EE.label2id

data = load_data("/home/longred/EETask/data/dev.json")

model = BertCrfForNer.from_pretrained(
    pretrained_model_name_or_path=config.pretrained_path,