def __init__(self, mask_rate, bert_model, do_lower_case, max_seq_length):
    """Set up the random-masking component.

    Args:
        mask_rate: stored masking ratio (presumably the fraction of tokens
            to mask — confirm against the caller).
        bert_model: name or path of the pretrained BERT model whose
            tokenizer is loaded.
        do_lower_case: passed through to the BERT tokenizer.
        max_seq_length: maximum sequence length, stored for later use.
    """
    super(RandMask, self).__init__()
    # Plain hyper-parameter storage.
    self.mask_rate = mask_rate
    self.max_seq_length = max_seq_length
    # The tokenizer's vocabulary is materialized as a list of token strings.
    self.tokenizer = BertTokenizer.from_pretrained(
        bert_model, do_lower_case=do_lower_case)
    self.vocab = list(self.tokenizer.vocab.keys())
def __init__(self, mask_rate, top_sen_rate, threshold, bert_model, do_lower_case, max_seq_length, label_list, sen_batch_size, use_gpu=True):
    """Set up the sentence-classification component.

    Loads a pretrained BERT tokenizer and sequence classifier, moves the
    model to GPU when available (and requested), and wraps it in
    ``DataParallel`` when more than one GPU is visible.

    Args:
        mask_rate: stored masking ratio hyper-parameter.
        top_sen_rate: stored sentence-selection ratio hyper-parameter.
        threshold: stored decision threshold hyper-parameter.
        bert_model: name or path of the pretrained BERT model.
        do_lower_case: passed through to the BERT tokenizer.
        max_seq_length: maximum sequence length, stored for later use.
        label_list: classification labels; its length sets ``num_labels``.
        sen_batch_size: batch size used for sentence scoring, stored.
        use_gpu: if False, force CPU even when CUDA is available.
    """
    super(SC, self).__init__()
    # Hyper-parameters are kept verbatim on the instance.
    self.mask_rate = mask_rate
    self.top_sen_rate = top_sen_rate
    self.threshold = threshold
    self.label_list = label_list
    self.num_labels = len(self.label_list)
    self.max_seq_length = max_seq_length
    self.sen_batch_size = sen_batch_size
    # Tokenizer, vocabulary snapshot, and classifier.
    self.tokenizer = BertTokenizer.from_pretrained(
        bert_model, do_lower_case=do_lower_case)
    self.vocab = list(self.tokenizer.vocab.keys())
    self.model = BertForSequenceClassification.from_pretrained(
        bert_model, num_labels=self.num_labels)
    self.device = torch.device(
        "cuda" if torch.cuda.is_available() and use_gpu else "cpu")
    self.model.to(self.device)
    self.n_gpu = torch.cuda.device_count()
    if self.n_gpu > 1:
        self.model = torch.nn.DataParallel(self.model)
def get_preprocessor(ptr_config_info, model_config):
    """Build a PreProcessor from a pickled vocabulary and a pretrained tokenizer.

    The vocabulary is unpickled from ``ptr_config_info.vocab``. The split
    function depends on ``model_config.type``: 'etri' uses an
    ``ETRITokenizer``'s ``tokenize`` method, 'skt' uses a
    ``SentencepieceTokenizer`` instance directly. Both variants pad to
    ``model_config.length`` with the vocabulary's padding index.
    """
    with open(ptr_config_info.vocab, mode='rb') as io:
        vocab = pickle.load(io)

    # Padding behaviour is identical for both tokenizer flavours.
    pad_sequence = PadSequence(length=model_config.length,
                               pad_val=vocab.to_indices(vocab.padding_token))

    if model_config.type == 'etri':
        ptr_tokenizer = ETRITokenizer.from_pretrained(ptr_config_info.tokenizer,
                                                      do_lower_case=False)
        split_fn = ptr_tokenizer.tokenize
    elif model_config.type == 'skt':
        ptr_tokenizer = SentencepieceTokenizer(ptr_config_info.tokenizer)
        split_fn = ptr_tokenizer

    # Unknown types fall through with `split_fn` unbound, matching the
    # original's NameError behaviour for unsupported configs.
    return PreProcessor(vocab=vocab, split_fn=split_fn, pad_fn=pad_sequence)
def __init__(self, mask_rate, bert_model, do_lower_case, max_seq_length, sen_batch_size, with_rand=False, use_gpu=True):
    """Set up the model-based mask generator.

    Loads a pretrained BERT tokenizer and a 2-label token classifier,
    moves the model to GPU when available (and requested), and wraps it
    in ``DataParallel`` when more than one GPU is visible.

    Args:
        mask_rate: stored masking ratio hyper-parameter.
        bert_model: name or path of the pretrained BERT model.
        do_lower_case: passed through to the BERT tokenizer.
        max_seq_length: maximum sequence length, stored for later use.
        sen_batch_size: batch size used when scoring sentences, stored.
        with_rand: flag stored on the instance (presumably mixes in random
            masking — confirm against the caller).
        use_gpu: if False, force CPU even when CUDA is available.
    """
    super(ModelGen, self).__init__()
    # Hyper-parameters kept verbatim on the instance.
    self.mask_rate = mask_rate
    self.max_seq_length = max_seq_length
    self.sen_batch_size = sen_batch_size
    self.with_rand = with_rand
    # Tokenizer, vocabulary snapshot, and the binary token classifier.
    self.tokenizer = BertTokenizer.from_pretrained(
        bert_model, do_lower_case=do_lower_case)
    self.vocab = list(self.tokenizer.vocab.keys())
    self.model = BertForTokenClassification.from_pretrained(bert_model,
                                                            num_labels=2)
    self.device = torch.device(
        "cuda" if torch.cuda.is_available() and use_gpu else "cpu")
    self.model.to(self.device)
    self.n_gpu = torch.cuda.device_count()
    if self.n_gpu > 1:
        self.model = torch.nn.DataParallel(self.model)
def main():
    """Fine-tune and/or run predictions with BERT on SQuAD-style QA data.

    Command-line driven: parses arguments, sets up the (optionally
    distributed) device and RNG seeds, then either trains
    ``BertForQuestionAnswering`` (saving the best checkpoint by dev loss)
    or writes SQuAD prediction files for an existing model.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--bert_model", default=None, type=str, required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument(
        "--output_dir", default=None, type=str, required=True,
        help="The output directory where the model checkpoints and predictions will be written.")

    ## Other parameters
    parser.add_argument("--train_file", default=None, type=str,
                        help="SQuAD json for training. E.g., train-v1.1.json")
    parser.add_argument(
        "--predict_file", default=None, type=str,
        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
    parser.add_argument(
        "--max_seq_length", default=384, type=int,
        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
        "longer than this will be truncated, and sequences shorter than this will be padded.")
    parser.add_argument(
        "--doc_stride", default=128, type=int,
        help="When splitting up a long document into chunks, how much stride to take between chunks.")
    parser.add_argument(
        "--max_query_length", default=64, type=int,
        help="The maximum number of tokens for the question. Questions longer than this will "
        "be truncated to this length.")
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_predict", action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size", default=8, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--predict_batch_size", default=8, type=int,
                        help="Total batch size for predictions.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion", default=0.1, type=float,
        help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% "
        "of training.")
    parser.add_argument(
        "--n_best_size", default=20, type=int,
        help="The total number of n-best predictions to generate in the nbest_predictions.json "
        "output file.")
    parser.add_argument(
        "--max_answer_length", default=30, type=int,
        help="The maximum length of an answer that can be generated. This is needed because the start "
        "and end predictions are not conditioned on one another.")
    parser.add_argument(
        "--verbose_logging", action='store_true',
        help="If true, all of the warnings related to data processing will be printed. "
        "A number of warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps', type=int, default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument(
        "--do_lower_case", action='store_true',
        help="Whether to lower case the input text. True for uncased models, False for cased models.")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale', type=float, default=0,
        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument(
        '--version_2_with_negative', action='store_true',
        help='If true, the SQuAD examples contain some that do not have an answer.')
    parser.add_argument(
        '--null_score_diff_threshold', type=float, default=0.0,
        help="If null_score - best_non_null is greater than the threshold predict null.")
    parser.add_argument('--server_ip', type=str, default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()
    print(args)

    # Device / distributed setup: local_rank == -1 means single-process mode.
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of
        # synchronizing nodes/GPUs.
        torch.distributed.init_process_group(backend='nccl')
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))
    # Per-step micro-batch size; the effective batch size stays as requested.
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    # Seed every RNG for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_predict:
        raise ValueError(
            "At least one of `do_train` or `do_predict` must be True.")
    if args.do_train:
        if not args.train_file:
            raise ValueError(
                "If `do_train` is True, then `train_file` must be specified.")
    if args.do_predict:
        if not args.predict_file:
            raise ValueError(
                "If `do_predict` is True, then `predict_file` must be specified.")

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        # Fix: the original message contained an empty "()" placeholder;
        # include the offending path in the error.
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = read_squad_examples(
            input_file=args.train_file, is_training=True,
            version_2_with_negative=args.version_2_with_negative)
        train_dataloader = prepare_data(train_examples, tokenizer, args,
                                        task_name="train")
        num_train_optimization_steps = int(
            len(train_dataloader) /
            args.gradient_accumulation_steps) * args.num_train_epochs

    # Prepare model.
    model = BertForQuestionAnswering.from_pretrained(args.bert_model)
    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    def eval_model(model, data_loader):
        """Return the mean loss of `model` over `data_loader`."""
        eval_loss = 0
        model.eval()
        for step, batch in enumerate(
                tqdm(data_loader, desc="Evaluating",
                     disable=args.local_rank not in [-1, 0])):
            if n_gpu == 1:
                # multi-gpu (DataParallel) does the scattering itself.
                batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, start_positions, end_positions = batch
            loss = model(input_ids, segment_ids, input_mask, start_positions,
                         end_positions)
            if n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
            eval_loss += loss.detach().cpu()
        return eval_loss / (step + 1)

    eval_examples = read_squad_examples(
        input_file=args.predict_file, is_training=False,
        version_2_with_negative=args.version_2_with_negative)
    eval_dataloader = prepare_data(eval_examples, tokenizer, args,
                                   task_name='eval')

    if args.do_train:
        # Exclude the (unused) pooler; no weight decay on biases/LayerNorm.
        param_optimizer = list(model.named_parameters())
        param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [p for n, p in param_optimizer
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.01
        }, {
            'params': [p for n, p in param_optimizer
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        }]
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

        model.train()
        train_loss = 0
        min_loss = 100000
        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Training",
                         disable=args.local_rank not in [-1, 0])):
                if n_gpu == 1:
                    # multi-gpu (DataParallel) does the scattering itself.
                    batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, start_positions, end_positions = batch
                loss = model(input_ids, segment_ids, input_mask,
                             start_positions, end_positions)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                train_loss += loss.detach().cpu()
            eval_loss = eval_model(model, eval_dataloader)
            print(f'epoch {epoch} train loss {train_loss / (step + 1)} eval_loss {eval_loss}')
            # Checkpoint only when the dev loss improves.
            if eval_loss < min_loss:
                min_loss = eval_loss
                # Only save the model itself (unwrap DataParallel).
                model_to_save = model.module if hasattr(model, 'module') else model
                # If we save using the predefined names, we can load using
                # `from_pretrained`.
                output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
                output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
                torch.save(model_to_save.state_dict(), output_model_file)
                model_to_save.config.to_json_file(output_config_file)
                tokenizer.save_vocabulary(args.output_dir)
    elif args.do_predict:
        predict_dataloader, predict_features = prepare_data(
            eval_examples, tokenizer, args, task_name='predict')
        all_results = []
        logger.info("Start Predicting")
        for input_ids, input_mask, segment_ids, example_indices in tqdm(
                predict_dataloader, desc="Predicting",
                disable=args.local_rank not in [-1, 0]):
            if len(all_results) % 1000 == 0:
                logger.info("Processing example: %d" % (len(all_results)))
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            with torch.no_grad():
                batch_start_logits, batch_end_logits = model(
                    input_ids, segment_ids, input_mask)
            for i, example_index in enumerate(example_indices):
                # Answer start/end logits for one example.
                start_logits = batch_start_logits[i].detach().cpu().tolist()
                end_logits = batch_end_logits[i].detach().cpu().tolist()
                # Fix: the original rebound `predict_features` itself to a
                # single feature here, clobbering the list so every later
                # lookup (and write_predictions below) operated on the wrong
                # object. Use a separate local instead.
                eval_feature = predict_features[example_index.item()]
                unique_id = int(eval_feature.unique_id)
                all_results.append(
                    RawResult(unique_id=unique_id,
                              start_logits=start_logits,
                              end_logits=end_logits))
        output_prediction_file = os.path.join(args.output_dir,
                                              "predictions.json")
        output_nbest_file = os.path.join(args.output_dir,
                                         "nbest_predictions.json")
        output_null_log_odds_file = os.path.join(args.output_dir,
                                                 "null_odds.json")
        write_predictions(eval_examples, predict_features, all_results,
                          args.n_best_size, args.max_answer_length,
                          args.do_lower_case, output_prediction_file,
                          output_nbest_file, output_null_log_odds_file,
                          args.verbose_logging, args.version_2_with_negative,
                          args.null_score_diff_threshold)
    else:
        # Unreachable after the earlier do_train/do_predict check; kept as a
        # guard. Fix: the original message used mismatched fullwidth quotes.
        raise ValueError(
            "please confirm at least one task mode, such as 'train' or 'predict'.")
def main():
    """Train / evaluate a BERT token-classification model for mask generation.

    Command-line driven: trains ``BertForTokenClassification`` on the
    ``maskgen`` task (saving a checkpoint per epoch under
    ``output_dir/all_models``), then evaluates every epoch's checkpoint on
    the dev set and promotes the best-F1 checkpoint to
    ``output_dir/best_model``.
    """
    import shutil  # local import: the file's top-level import block is kept untouched

    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--data_dir", default=None, type=str, required=True,
        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument(
        "--bert_model", default=None, type=str, required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name", default=None, type=str, required=True,
                        help="The name of the task to train.")
    parser.add_argument(
        "--output_dir", default=None, type=str, required=True,
        help="The output directory where the model predictions and checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--ckpt", default="", type=str)
    parser.add_argument("--vocab_file", default="", type=str)
    parser.add_argument(
        "--cache_dir", default="", type=str,
        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length", default=128, type=int,
        help="The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size", default=32, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=8, type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion", default=0.1, type=float,
        help="Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps', type=int, default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        "--fp16_opt_level", type=str, default="O1",
        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument(
        '--loss_scale', type=float, default=0,
        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--sample_weight', type=float, default=1)
    parser.add_argument("--save_all", action="store_true")
    args = parser.parse_args()

    processors = {"maskgen": MaskGenProcessor}

    # Device / distributed setup: local_rank == -1 means single-process mode.
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of
        # synchronizing nodes/GPUs.
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    # Seed every RNG for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    # Fix: the CUDA RNG was never seeded; kept consistent with the sibling
    # QA training script.
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        logger.warning(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    processor = processors[task_name]()
    label_list = processor.get_labels()
    num_labels = len(label_list)

    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will
        # download model & vocab.
        torch.distributed.barrier()
    if args.vocab_file:
        tokenizer = BertTokenizer(args.vocab_file, args.do_lower_case)
    else:
        tokenizer = BertTokenizer.from_pretrained(
            args.bert_model, do_lower_case=args.do_lower_case)

    # Prepare model; optionally warm-start from a checkpoint's 'model' dict,
    # keeping only keys that exist in the fresh model.
    model = BertForTokenClassification.from_pretrained(args.bert_model,
                                                       num_labels=num_labels)
    if args.ckpt:
        print("load from", args.ckpt)
        model_dict = model.state_dict()
        ckpt = torch.load(args.ckpt)
        pretrained_dict = ckpt['model']
        new_dict = {
            k: v
            for k, v in pretrained_dict.items() if k in model_dict.keys()
        }
        model_dict.update(new_dict)
        print('Total : {}, update: {}'.format(len(pretrained_dict),
                                              len(new_dict)))
        model.load_state_dict(model_dict)
    if args.local_rank == 0:
        torch.distributed.barrier()

    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model,
                    device_ids=[args.local_rank],
                    output_device=args.local_rank,
                    find_unused_parameters=True)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        # Per-class loss weight: [negative, positive].
        if args.fp16:
            sample_weight = torch.HalfTensor([1.0, args.sample_weight]).cuda()
        else:
            sample_weight = torch.FloatTensor([1.0, args.sample_weight]).cuda()

        # Cache converted features next to the data, keyed by model/length/task.
        cached_train_features_file = os.path.join(
            args.data_dir, 'train_{}_{}_{}'.format(
                list(filter(None, args.bert_model.split('/'))).pop(),
                str(args.max_seq_length), str(task_name)))
        try:
            with open(cached_train_features_file, "rb") as reader:
                logger.info("Load from cache dir: {}".format(
                    cached_train_features_file))
                train_features = pickle.load(reader)
        except Exception:
            # Fix: was a bare `except:`; cache misses/corruption still fall
            # through to a fresh conversion, but KeyboardInterrupt/SystemExit
            # are no longer swallowed.
            train_features = convert_examples_to_features(
                train_examples, label_list, args.max_seq_length, tokenizer)
            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
                logger.info("Saving train features into cached file {}".format(
                    cached_train_features_file))
                with open(cached_train_features_file, "wb") as writer:
                    pickle.dump(train_features, writer)

        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features],
                                     dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)
        num_train_optimization_steps = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

        # No weight decay on biases/LayerNorm parameters.
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [p for n, p in param_optimizer
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.01
        }, {
            'params': [p for n, p in param_optimizer
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        }]
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)
        if args.fp16:
            try:
                from apex import amp
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
                )
            model, optimizer = amp.initialize(model,
                                              optimizer,
                                              opt_level=args.fp16_opt_level)

        label_map = {i: label for i, label in enumerate(label_list, 1)}
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        os.makedirs(os.path.join(args.output_dir, "all_models"), exist_ok=True)
        model.train()
        for e in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_steps = 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss = model(input_ids, segment_ids, input_mask, label_ids,
                             weight=sample_weight)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()
                tr_loss += loss.item()
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1
            # Save each epoch (evaluation below picks the best one).
            model_to_save = model.module if hasattr(model, 'module') else model
            output_model_file = os.path.join(args.output_dir, "all_models",
                                             "e{}_{}".format(e, WEIGHTS_NAME))
            torch.save(model_to_save.state_dict(), output_model_file)

    if args.do_train and (args.local_rank == -1
                          or torch.distributed.get_rank() == 0):
        output_args_file = os.path.join(args.output_dir, 'training_args.bin')
        torch.save(args, output_args_file)
    else:
        model = BertForTokenClassification.from_pretrained(
            args.bert_model, num_labels=num_labels)

    ### Evaluation: score every per-epoch checkpoint on the dev set and keep
    ### the one with the best token-level F1.
    if args.do_eval and (args.local_rank == -1
                         or torch.distributed.get_rank() == 0):
        best_f1 = 0
        best_epoch = 0
        val_res_file = os.path.join(args.output_dir, "valid_results.txt")
        val_f = open(val_res_file, "w")
        logger.info("***** Dev Eval results *****")
        for e in range(int(args.num_train_epochs)):
            weight_path = os.path.join(args.output_dir, "all_models",
                                       "e{}_{}".format(e, WEIGHTS_NAME))
            model.load_state_dict(torch.load(weight_path))
            model.to(device)
            eval_examples = processor.get_dev_examples(args.data_dir)
            cached_eval_features_file = os.path.join(
                args.data_dir, 'dev_{0}_{1}_{2}'.format(
                    list(filter(None, args.bert_model.split('/'))).pop(),
                    str(args.max_seq_length), str(task_name)))
            try:
                with open(cached_eval_features_file, "rb") as reader:
                    eval_features = pickle.load(reader)
            except Exception:
                # Fix: was a bare `except:` (see the training-cache note above).
                eval_features = convert_examples_to_features(
                    eval_examples, label_list, args.max_seq_length, tokenizer)
                if args.local_rank == -1 or torch.distributed.get_rank() == 0:
                    logger.info("  Saving eval features into cached file %s",
                                cached_eval_features_file)
                    with open(cached_eval_features_file, "wb") as writer:
                        pickle.dump(eval_features, writer)
            logger.info("***** Running evaluation *****")
            logger.info("  Num examples = %d", len(eval_examples))
            logger.info("  Batch size = %d", args.eval_batch_size)
            all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                         dtype=torch.long)
            all_input_mask = torch.tensor(
                [f.input_mask for f in eval_features], dtype=torch.long)
            all_segment_ids = torch.tensor(
                [f.segment_ids for f in eval_features], dtype=torch.long)
            all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                         dtype=torch.long)
            eval_data = TensorDataset(all_input_ids, all_input_mask,
                                      all_segment_ids, all_label_ids)
            # Run prediction for full data.
            if args.local_rank == -1:
                eval_sampler = SequentialSampler(eval_data)
            else:
                eval_sampler = DistributedSampler(
                    eval_data)  # Note that this sampler samples randomly
            eval_dataloader = DataLoader(eval_data,
                                         sampler=eval_sampler,
                                         batch_size=args.eval_batch_size)
            model.eval()
            y_true_L = []
            y_pred_L = []
            for input_ids, input_mask, segment_ids, label_ids in tqdm(
                    eval_dataloader, desc="Evaluating"):
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)
                with torch.no_grad():
                    logits = model(input_ids, segment_ids, input_mask)
                logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2)
                logits = logits.detach().cpu().numpy()
                label_ids = label_ids.to('cpu').numpy()
                input_mask = input_mask.to('cpu').numpy()
                y_true = [[str(x) for x in L] for L in label_ids]
                y_pred = [[str(x) for x in L] for L in logits]
                # Only score positions where the attention mask is 1
                # (i.e. skip padding).
                for (m, t, p) in zip(input_mask, y_true, y_pred):
                    for mm, tt, pp in zip(m, t, p):
                        if mm == 1:
                            y_true_L.append(int(tt))
                            y_pred_L.append(int(pp))
            acc = accuracy_score(y_true_L, y_pred_L)
            f1 = f1_score(y_true_L, y_pred_L)
            recall = recall_score(y_true_L, y_pred_L)
            prec = precision_score(y_true_L, y_pred_L)
            if f1 > best_f1:
                best_f1 = f1
                best_epoch = e
            result = {"acc": acc, "f1": f1, "recall": recall, "prec": prec}
            logger.info("Epoch {}".format(e))
            val_f.write("Epoch {}\n".format(e))
            for key in sorted(result.keys()):
                logger.info("{} = {}".format(key, str(result[key])))
                val_f.write("{} = {}\n".format(key, str(result[key])))
            val_f.write("\n")
        logger.info("\nBest epoch: {}. Best val f1: {}".format(
            best_epoch, best_f1))
        val_f.write("Best epoch: {}. Best val f1: {}\n".format(
            best_epoch, best_f1))
        val_f.close()

        # Promote the best checkpoint to output_dir/best_model.
        best_weight_path = os.path.join(
            args.output_dir, "all_models",
            "e{}_{}".format(best_epoch, WEIGHTS_NAME))
        best_model_dir = os.path.join(args.output_dir, "best_model")
        os.makedirs(best_model_dir, exist_ok=True)
        # Fix: shell `cp` replaced with shutil (no shell, works without /bin/cp).
        shutil.copyfile(best_weight_path,
                        os.path.join(best_model_dir, WEIGHTS_NAME))
        # Fix: `model_to_save` was only defined inside the training branch,
        # so `--do_eval` without `--do_train` crashed with NameError here.
        model_to_save = model.module if hasattr(model, 'module') else model
        with open(os.path.join(best_model_dir, CONFIG_NAME), 'w') as f:
            f.write(model_to_save.config.to_json_string())
        # NOTE(review): stock BertTokenizer exposes `save_vocabulary`, not
        # `save_vocab` — confirm this is a project-specific method.
        tokenizer.save_vocab(os.path.join(best_model_dir, VOCAB_NAME))
        if not args.save_all:
            # Fix: shell `rm -r` replaced with shutil.rmtree.
            shutil.rmtree(os.path.join(args.output_dir, "all_models"))
def main():
    """Fine-tune BERT for sequence classification from the command line.

    Workflow: parse CLI arguments, set up the device / (optional) distributed
    backend, seed all RNGs, build the task processor + tokenizer + model
    (optionally warm-started from --ckpt), then:
      * --do_train: train for --num_train_epochs epochs, saving one weight
        file per epoch under <output_dir>/all_models/;
      * --do_eval: evaluate every saved epoch on the dev set, pick the best
        by accuracy, report test results for it, and copy that checkpoint to
        <output_dir>/best_model/ (deleting all_models/ unless --save_all).
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument("--vocab_file", default="", type=str)
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--overwrite_output_dir',
                        action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument("--ckpt", type=str, help="ckpt position")
    parser.add_argument("--save_all", action="store_true")
    parser.add_argument("--output_dev_detail", action="store_true")
    args = parser.parse_args()

    # ---- Device / distributed setup -------------------------------------
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of
        # sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    args.device = device

    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)

    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    # The effective per-step batch is train_batch_size / accumulation steps.
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    # Seed every RNG so runs are reproducible.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    # ---- Task / tokenizer / model setup ---------------------------------
    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    output_mode = output_modes[task_name]

    label_list = processor.get_labels()
    num_labels = len(label_list)

    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    if args.vocab_file:
        tokenizer = BertTokenizer(args.vocab_file, args.do_lower_case)
    else:
        tokenizer = BertTokenizer.from_pretrained(
            args.bert_model, do_lower_case=args.do_lower_case)
    model = BertForSequenceClassification.from_pretrained(
        args.bert_model, num_labels=num_labels)
    if args.ckpt:
        # Warm-start from an external checkpoint, keeping only parameters
        # that exist in this model and skipping the task-specific classifier
        # head (its label count may differ).
        print("load from", args.ckpt)
        model_dict = model.state_dict()
        ckpt = torch.load(args.ckpt)
        if "model" in ckpt:
            pretrained_dict = ckpt['model']
        else:
            pretrained_dict = ckpt
        new_dict = {
            k: v
            for k, v in pretrained_dict.items() if k in model_dict.keys()
            and k not in ["classifier.weight", "classifier.bias"]
        }
        model_dict.update(new_dict)
        print('Total : {}, update: {}'.format(len(pretrained_dict),
                                              len(new_dict)))
        model.load_state_dict(model_dict)

    if args.local_rank == 0:
        torch.distributed.barrier()

    model.to(device)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0

    # ---- Training -------------------------------------------------------
    if args.do_train:
        # Prepare data loader
        train_examples = processor.get_train_examples(args.data_dir)
        print(len(train_examples))
        cached_train_features_file = os.path.join(
            args.data_dir, 'train_{0}_{1}_{2}'.format(
                list(filter(None, args.bert_model.split('/'))).pop(),
                str(args.max_seq_length), str(task_name)))
        try:
            with open(cached_train_features_file, "rb") as reader:
                train_features = pickle.load(reader)
        # BUGFIX: was a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit. Only a missing or unreadable cache
        # file should trigger re-featurization.
        except (OSError, EOFError, pickle.UnpicklingError):
            train_features = convert_examples_to_features(
                train_examples, label_list, args.max_seq_length, tokenizer,
                output_mode)
            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
                logger.info(" Saving train features into cached file %s",
                            cached_train_features_file)
                with open(cached_train_features_file, "wb") as writer:
                    pickle.dump(train_features, writer)

        all_input_ids = [f.input_ids for f in train_features]
        all_input_mask = [f.input_mask for f in train_features]
        all_segment_ids = [f.segment_ids for f in train_features]

        # NOTE(review): both branches are currently identical; presumably the
        # regression path once cast labels to float — confirm against
        # InputDataset.collate before simplifying.
        if output_mode == "classification":
            all_label_ids = [f.label_id for f in train_features]
        elif output_mode == "regression":
            all_label_ids = [f.label_id for f in train_features]

        train_data = InputDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size,
                                      collate_fn=train_data.collate)

        num_train_optimization_steps = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

        # Prepare optimizer: no weight decay on biases and LayerNorm params.
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

        if args.fp16:
            try:
                from apex import amp
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
                )
            model, optimizer = amp.initialize(model,
                                              optimizer,
                                              opt_level=args.fp16_opt_level)

        logger.info("***** Running training *****")
        logger.info(" Num examples = %d", len(train_examples))
        logger.info(" Batch size = %d", args.train_batch_size)
        logger.info(" Num steps = %d", num_train_optimization_steps)

        os.makedirs(os.path.join(args.output_dir, "all_models"),
                    exist_ok=True)
        model.train()
        for e in trange(int(args.num_train_epochs),
                        desc="Epoch",
                        disable=args.local_rank not in [-1, 0]):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader,
                         desc="Iteration",
                         disable=args.local_rank not in [-1, 0])):
                inputs, labels = batch
                for key in inputs.keys():
                    inputs[key] = inputs[key].to(args.device)
                for key in labels.keys():
                    labels[key] = labels[key].to(args.device)

                # define a new function to compute loss values for both
                # output_modes
                label_ids = labels["labels"]
                logits = model(**inputs)

                if output_mode == "classification":
                    loss_fct = CrossEntropyLoss()
                    loss = loss_fct(logits.view(-1, num_labels),
                                    label_ids.view(-1))
                elif output_mode == "regression":
                    loss_fct = MSELoss()
                    loss = loss_fct(logits.view(-1), label_ids.view(-1))

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

                tr_loss += loss.item()
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1
            # save each epoch (unwrap DataParallel/DDP before saving)
            model_to_save = model.module if hasattr(model,
                                                    'module') else model
            output_model_file = os.path.join(args.output_dir, "all_models",
                                             "e{}_{}".format(e, WEIGHTS_NAME))
            torch.save(model_to_save.state_dict(), output_model_file)

    ### Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1
                          or torch.distributed.get_rank() == 0):
        output_args_file = os.path.join(args.output_dir, 'training_args.bin')
        torch.save(args, output_args_file)
    else:
        model = BertForSequenceClassification.from_pretrained(
            args.bert_model, num_labels=num_labels)

    ### Evaluation
    if args.do_eval and (args.local_rank == -1
                         or torch.distributed.get_rank() == 0):
        # Evaluate every per-epoch checkpoint on dev, track the best accuracy.
        best_acc = 0
        best_epoch = 0
        val_res_file = os.path.join(args.output_dir, "valid_results.txt")
        val_f = open(val_res_file, "w")
        if args.output_dev_detail:
            logger.info("***** Dev Eval results *****")
        for e in tqdm(range(int(args.num_train_epochs)), desc="Epoch on dev"):
            weight_path = os.path.join(args.output_dir, "all_models",
                                       "e{}_{}".format(e, WEIGHTS_NAME))
            result = evaluate(args,
                              model,
                              weight_path,
                              processor,
                              device,
                              task_name,
                              "dev",
                              label_list,
                              tokenizer,
                              output_mode,
                              num_labels,
                              show_detail=False)
            if result["acc"] > best_acc:
                best_acc = result["acc"]
                best_epoch = e
            if args.output_dev_detail:
                logger.info("Epoch {}".format(e))
            val_f.write("Epoch {}\n".format(e))
            for key in sorted(result.keys()):
                if args.output_dev_detail:
                    logger.info("{} = {}".format(key, str(result[key])))
                val_f.write("{} = {}\n".format(key, str(result[key])))
            val_f.write("\n")
        logger.info("\nBest epoch: {}. Best val acc: {}".format(
            best_epoch, best_acc))
        val_f.write("Best epoch: {}. Best val acc: {}\n".format(
            best_epoch, best_acc))
        val_f.close()

        # Report test results for the best dev epoch.
        test_weight_path = os.path.join(
            args.output_dir, "all_models",
            "e{}_{}".format(best_epoch, WEIGHTS_NAME))
        test_result = evaluate(args, model, test_weight_path, processor,
                               device, task_name, "test", label_list,
                               tokenizer, output_mode, num_labels)
        test_res_file = os.path.join(args.output_dir, "test_results.txt")
        logger.info("***** Test Eval results *****")
        with open(test_res_file, "w") as test_f:
            for key in sorted(test_result.keys()):
                logger.info("{} = {}".format(key, str(test_result[key])))
                test_f.write("{} = {}\n".format(key, str(test_result[key])))

        # Persist the best checkpoint + config + vocab as "best_model".
        best_model_dir = os.path.join(args.output_dir, "best_model")
        os.makedirs(best_model_dir, exist_ok=True)
        os.system("cp {} {}/{}".format(test_weight_path, best_model_dir,
                                       WEIGHTS_NAME))
        with open(os.path.join(best_model_dir, CONFIG_NAME), 'w') as f:
            # BUGFIX: `model_to_save` was only bound inside the --do_train
            # epoch loop, so --do_eval without --do_train raised NameError
            # here. Derive it from `model` at the point of use instead.
            model_to_save = model.module if hasattr(model, 'module') else model
            f.write(model_to_save.config.to_json_string())
        tokenizer.save_vocab(os.path.join(best_model_dir, VOCAB_NAME))
        if not args.save_all:
            os.system("rm -r {}".format(
                os.path.join(args.output_dir, "all_models")))
zipfile_path = ptr_dir / "etri.zip" if not zipfile_path.exists(): url = "https://drive.google.com/uc?id=1qVY-zZc2O2OliGNUwWClhcqJkLG_6uoD" gdown.download(url, output=str(zipfile_path)) with zipfile.ZipFile(str(zipfile_path)) as unzip: unzip.extractall(str(ptr_dir)) from model.tokenization import BertTokenizer as ETRITokenizer # loading BertTokenizer ptr_config_path = ptr_dir / 'bert_config_etri.json' ptr_tokenizer_path = ptr_dir / "vocab.korean.rawtext.list" ptr_bert_path = ptr_dir / "pytorch_model_etri.bin" ptr_tokenizer = ETRITokenizer.from_pretrained(ptr_tokenizer_path, do_lower_case=False) # generate vocab idx_to_token = list(ptr_tokenizer.vocab.keys()) token_to_idx = { token: idx for idx, token in enumerate(idx_to_token) } vocab = Vocab( idx_to_token, padding_token="[PAD]", unknown_token="[UNK]", bos_token=None, eos_token=None, reserved_tokens=["[CLS]", "[SEP]", "[MASK]"], token_to_idx=token_to_idx,