def __init__(self, **kwargs):
    super(WordPieceVectorizer1D, self).__init__(kwargs.get('transform_fn'))
    global BERT_TOKENIZER
    self.max_seen = 128
    handle = kwargs.get('embed_file')
    if BERT_TOKENIZER is None:
        BERT_TOKENIZER = BertTokenizer.from_pretrained(handle)
    self.tokenizer = BERT_TOKENIZER
    self.mxlen = kwargs.get('mxlen', -1)
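# --- Editor's sketch (not from the original repo): the module-level tokenizer
# cache used above, shown standalone. Assumes pytorch-pretrained-bert's
# BertTokenizer; `handle` may be a model name or a vocab file path.
from pytorch_pretrained_bert import BertTokenizer

BERT_TOKENIZER = None

def get_bert_tokenizer(handle):
    """Build the tokenizer once per process and reuse it afterwards."""
    global BERT_TOKENIZER
    if BERT_TOKENIZER is None:
        BERT_TOKENIZER = BertTokenizer.from_pretrained(handle)
    return BERT_TOKENIZER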
def __init__(self,
             pretrained_model: str,
             use_starting_offsets: bool = False,
             do_lowercase: bool = True,
             max_pieces: int = 512) -> None:
    # Pass do_lowercase by keyword: positionally it would land in the
    # `cache_dir` slot of BertTokenizer.from_pretrained.
    bert_tokenizer = BertTokenizer.from_pretrained(pretrained_model,
                                                   do_lower_case=do_lowercase)
    super().__init__(vocab=bert_tokenizer.vocab,
                     wordpiece_tokenizer=bert_tokenizer.wordpiece_tokenizer.tokenize,
                     namespace="bert",
                     use_starting_offsets=use_starting_offsets,
                     max_pieces=max_pieces)
def __init__(self, name, **kwargs):
    super(BERTBaseEmbeddings, self).__init__(name=name, **kwargs)
    global BERT_TOKENIZER
    self.dsz = kwargs.get('dsz')
    if BERT_TOKENIZER is None:
        BERT_TOKENIZER = BertTokenizer.from_pretrained(kwargs.get('embed_file'))
    self.model = BertModel.from_pretrained(kwargs.get('embed_file'))
    self.vocab = BERT_TOKENIZER.vocab
    # 30522, i.e. self.model.embeddings.word_embeddings.num_embeddings
    self.vsz = len(BERT_TOKENIZER.vocab)
    self.layer_indices = kwargs.get('layers', [-1, -2, -3, -4])
    self.operator = kwargs.get('operator', 'concat')
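# --- Editor's sketch (an assumption, not the class's actual method): how
# `layer_indices` and `operator` above are typically applied to the list of
# encoded layers returned by BertModel.
import torch

def combine_layers(encoded_layers, layer_indices=(-1, -2, -3, -4), operator='concat'):
    picked = [encoded_layers[i] for i in layer_indices]
    if operator == 'concat':
        # feature size becomes dsz * len(layer_indices)
        return torch.cat(picked, dim=-1)
    # any other operator: elementwise sum keeps the feature size at dsz
    return torch.stack(picked, dim=0).sum(dim=0)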
def __init__(self,
             pretrained_model: str,
             use_starting_offsets: bool = False,
             do_lowercase: bool = True,
             never_lowercase: List[str] = None,
             max_pieces: int = 512) -> None:
    if pretrained_model.endswith("-cased") and do_lowercase:
        logger.warning("Your BERT model appears to be cased, "
                       "but your indexer is lowercasing tokens.")
    elif pretrained_model.endswith("-uncased") and not do_lowercase:
        logger.warning("Your BERT model appears to be uncased, "
                       "but your indexer is not lowercasing tokens.")
    bert_tokenizer = BertTokenizer.from_pretrained(pretrained_model,
                                                   do_lower_case=do_lowercase)
    super().__init__(vocab=bert_tokenizer.vocab,
                     wordpiece_tokenizer=bert_tokenizer.wordpiece_tokenizer.tokenize,
                     namespace="bert",
                     use_starting_offsets=use_starting_offsets,
                     max_pieces=max_pieces,
                     do_lowercase=do_lowercase,
                     never_lowercase=never_lowercase,
                     start_tokens=["[CLS]"],
                     end_tokens=["[SEP]"])
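# --- Editor's sketch: the endswith-based check above, exercised standalone
# with the two misconfigurations it is meant to catch (hypothetical helper).
def case_mismatch(pretrained_model: str, do_lowercase: bool) -> bool:
    return ((pretrained_model.endswith("-cased") and do_lowercase) or
            (pretrained_model.endswith("-uncased") and not do_lowercase))

assert case_mismatch("bert-base-cased", True)
assert case_mismatch("bert-base-uncased", False)
assert not case_mismatch("bert-base-uncased", True)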
from flask import Flask  # used below; assumed missing from the flattened snippet
from gevent.pywsgi import WSGIServer
from datetime import datetime
from finbert_utils import *  # assumed to provide BertForSequenceClassification and BertTokenizer
import pandas as pd
import json
import os

###################################################################################################

DIR = os.path.realpath(os.path.dirname(__file__))
CHUNK_SIZE = 25
model = BertForSequenceClassification.from_pretrained(f"{DIR}/sentiment_model",
                                                      num_labels=3,
                                                      cache_dir=None)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
app = Flask(__name__)

###################################################################################################

def predict(sentences):
    """
    Not my code. See
    https://github.com/ProsusAI/finBERT/blob/fcec6c5db7604606ae3ca1cb0db5f60bf8546cbb/predict.py
    for reference.

    Predict sentiments of sentences in a given text. The function first
    tokenizes sentences, makes predictions and writes results.

    Parameters
    ----------
    """
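# --- Editor's sketch: one plausible use of CHUNK_SIZE above is batching
# sentences before handing them to the model; `chunks` is a hypothetical
# helper, not part of the finBERT source.
def chunks(items, size=25):
    """Yield successive fixed-size slices from a list."""
    for i in range(0, len(items), size):
        yield items[i:i + size]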
label_list = conllProcessor.get_labels()
label_map = conllProcessor.get_label_map()
train_examples = conllProcessor.get_train_examples(data_dir)
dev_examples = conllProcessor.get_dev_examples(data_dir)
test_examples = conllProcessor.get_test_examples(data_dir)

total_train_steps = int(len(train_examples) / batch_size /
                        gradient_accumulation_steps * total_train_epochs)

print("***** Running training *****")
print("  Num examples = %d" % len(train_examples))
print("  Batch size = %d" % batch_size)
print("  Num steps = %d" % total_train_steps)

tokenizer = BertTokenizer.from_pretrained(bert_model_scale,
                                          do_lower_case=do_lower_case)
train_dataset = NerDataset(train_examples, tokenizer, label_map, max_seq_length)
dev_dataset = NerDataset(dev_examples, tokenizer, label_map, max_seq_length)
test_dataset = NerDataset(test_examples, tokenizer, label_map, max_seq_length)

train_dataloader = data.DataLoader(dataset=train_dataset,
                                   batch_size=batch_size,
                                   shuffle=True,
                                   num_workers=4,
                                   collate_fn=NerDataset.pad)
dev_dataloader = data.DataLoader(dataset=dev_dataset,
                                 # snippet truncated here in the source; the
                                 # remaining kwargs presumably mirror
                                 # train_dataloader with shuffle=False
                                 batch_size=batch_size,
                                 shuffle=False,
                                 num_workers=4,
                                 collate_fn=NerDataset.pad)
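# --- Editor's worked example of the step arithmetic above, with hypothetical
# numbers: 10000 examples, batch size 32, no gradient accumulation, 3 epochs.
assert int(10000 / 32 / 1 * 3) == 937  # truncation happens after multiplying by epochs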
def main():
    # def main(args):
    parser = setup_parser()
    args = parser.parse_args()

    # specifies the path where the biobert or clinical bert model is saved
    if args.bert_model == 'biobert' or args.bert_model == 'clinical_bert' or args.bert_model == 'stroke_bert':
        args.bert_model = args.model_loc
    print(args.bert_model)

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
        "mednli": MedNLIProcessor,
        "carotid": CaroditProcessor
    }

    num_labels_task = {
        "cola": 2,
        "mnli": 3,
        "mrpc": 2,
        "mednli": 3,
        "carotid": 17
    }

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    num_labels = num_labels_task[task_name]
    label_list = processor.get_labels()

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    print('TRAIN')
    train = processor.get_train_examples(args.data_dir)
    print([(train[i].text_a, train[i].text_b, train[i].label) for i in range(3)])
    print('DEV')
    dev = processor.get_dev_examples(args.data_dir)
    print([(dev[i].text_a, dev[i].text_b, dev[i].label) for i in range(3)])
    print('TEST')
    test = processor.get_test_examples(args.data_dir)
    print([(test[i].text_a, test[i].text_b, test[i].label) for i in range(3)])

    train_examples = None
    num_train_optimization_steps = -1
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    # Prepare model
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(
        PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(args.local_rank))
    if task_name == 'carotid':
        model = BertForMultiLabelSequenceClassification.from_pretrained(
            args.bert_model, cache_dir=cache_dir, num_labels=num_labels)
    else:
        model = BertForSequenceClassification.from_pretrained(
            args.bert_model, cache_dir=cache_dir, num_labels=num_labels)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    if args.do_train:
        train_features = convert_examples_to_features(train_examples, label_list,
                                                      args.max_seq_length,
                                                      tokenizer, task_name)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        if task_name == 'carotid':
            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)
        else:
            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss = model(input_ids, segment_ids, input_mask, label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        # NOTE: the flattened source called WarmupLinearSchedule(...)
                        # as if it were a function; using its get_lr() here so the
                        # expression actually evaluates.
                        lr_this_step = args.learning_rate * WarmupLinearSchedule(
                            warmup=args.warmup_proportion,
                            t_total=num_train_optimization_steps).get_lr(global_step)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

    if args.do_train:
        # Save a trained model and the associated configuration
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        with open(output_config_file, 'w') as f:
            f.write(model_to_save.config.to_json_string())

        # Load a trained model and config that you have fine-tuned
        config = BertConfig(output_config_file)
        if task_name == 'carotid':
            model = BertForMultiLabelSequenceClassification(config, num_labels=num_labels)
        else:
            model = BertForSequenceClassification(config, num_labels=num_labels)
        model.load_state_dict(torch.load(output_model_file))
    else:
        if task_name == 'carotid':
            model = BertForMultiLabelSequenceClassification.from_pretrained(
                args.bert_model, num_labels=num_labels)
        else:
            model = BertForSequenceClassification.from_pretrained(
                args.bert_model, num_labels=num_labels)
    model.to(device)

    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        eval_examples = processor.get_dev_examples(args.data_dir)
        eval_features = convert_examples_to_features(eval_examples, label_list,
                                                     args.max_seq_length,
                                                     tokenizer, task_name)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        if task_name == 'carotid':
            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float)
        else:
            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_label_ids)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        all_logits = None
        all_labels = None
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for input_ids, input_mask, segment_ids, label_ids in tqdm(
                eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids)
                logits = model(input_ids, segment_ids, input_mask)

            if task_name == 'carotid':
                if all_logits is None:
                    all_logits = logits.detach().cpu().numpy()
                else:
                    all_logits = np.concatenate(
                        (all_logits, logits.detach().cpu().numpy()), axis=0)
                if all_labels is None:
                    all_labels = label_ids.detach().cpu().numpy()
                else:
                    all_labels = np.concatenate(
                        (all_labels, label_ids.detach().cpu().numpy()), axis=0)
            else:
                logits = logits.detach().cpu().numpy()
                label_ids = label_ids.to('cpu').numpy()
                tmp_eval_accuracy = accuracy(logits, label_ids)
                eval_loss += tmp_eval_loss.mean().item()
                eval_accuracy += tmp_eval_accuracy
            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        if task_name == 'carotid':
            fpr = dict()
            tpr = dict()
            roc_auc = dict()
            for i in range(num_labels):
                fpr[i], tpr[i], _ = roc_curve(all_labels[:, i], all_logits[:, i])
                roc_auc[i] = auc(fpr[i], tpr[i])
            # Compute micro-average ROC curve and ROC area
            fpr["micro"], tpr["micro"], _ = roc_curve(all_labels.ravel(),
                                                      all_logits.ravel())
            roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
            save_path = os.path.join(args.output_dir, "eval_prediction.pickle")
            predic_result = {'all_logits': all_logits, 'all_labels': all_labels}
            with open(save_path, 'wb') as file_pi:
                pickle.dump(predic_result, file_pi)
            result = {'eval_loss': eval_loss, 'roc_auc': roc_auc}
        else:
            eval_loss = eval_loss / nb_eval_steps
            eval_accuracy = eval_accuracy / nb_eval_examples
            loss = tr_loss / nb_tr_steps if args.do_train else None
            result = {
                'eval_loss': eval_loss,
                'eval_accuracy': eval_accuracy,
                'global_step': global_step,
                'loss': loss
            }

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    if args.do_test and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        test_examples = processor.get_test_examples(args.data_dir)
        test_features = convert_examples_to_features(test_examples, label_list,
                                                     args.max_seq_length,
                                                     tokenizer, task_name)
        logger.info("***** Running testing *****")
        logger.info("  Num examples = %d", len(test_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in test_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in test_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in test_features], dtype=torch.long)
        if task_name == 'carotid':
            all_label_ids = torch.tensor([f.label_id for f in test_features], dtype=torch.float)
        else:
            all_label_ids = torch.tensor([f.label_id for f in test_features], dtype=torch.long)
        test_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_label_ids)
        # Run prediction for full data
        test_sampler = SequentialSampler(test_data)
        test_dataloader = DataLoader(test_data,
                                     sampler=test_sampler,
                                     batch_size=args.eval_batch_size)

        all_logits = None
        all_labels = None
        model.eval()
        test_loss, test_accuracy = 0, 0
        nb_test_steps, nb_test_examples = 0, 0
        for input_ids, input_mask, segment_ids, label_ids in tqdm(
                test_dataloader, desc="Testing"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                tmp_test_loss = model(input_ids, segment_ids, input_mask, label_ids)
                logits = model(input_ids, segment_ids, input_mask)

            if task_name == 'carotid':
                if all_logits is None:
                    all_logits = logits.detach().cpu().numpy()
                else:
                    all_logits = np.concatenate(
                        (all_logits, logits.detach().cpu().numpy()), axis=0)
                if all_labels is None:
                    all_labels = label_ids.detach().cpu().numpy()
                else:
                    all_labels = np.concatenate(
                        (all_labels, label_ids.detach().cpu().numpy()),
                        axis=0)
            else:
                logits = logits.detach().cpu().numpy()
                label_ids = label_ids.to('cpu').numpy()
                tmp_test_accuracy = accuracy(logits, label_ids)
                test_loss += tmp_test_loss.mean().item()
                test_accuracy += tmp_test_accuracy
            nb_test_examples += input_ids.size(0)
            nb_test_steps += 1

        if task_name == 'carotid':
            fpr = dict()
            tpr = dict()
            roc_auc = dict()
            for i in range(num_labels):
                fpr[i], tpr[i], _ = roc_curve(all_labels[:, i], all_logits[:, i])
                roc_auc[i] = auc(fpr[i], tpr[i])
            # Compute micro-average ROC curve and ROC area
            fpr["micro"], tpr["micro"], _ = roc_curve(all_labels.ravel(),
                                                      all_logits.ravel())
            roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
            save_path = os.path.join(args.output_dir, "test_prediction.pickle")
            predic_result = {'all_logits': all_logits, 'all_labels': all_labels}
            with open(save_path, 'wb') as file_pi:
                pickle.dump(predic_result, file_pi)
            result = {'test_loss': test_loss, 'roc_auc': roc_auc}
        else:
            test_loss = test_loss / nb_test_steps
            test_accuracy = test_accuracy / nb_test_examples
            loss = tr_loss / nb_tr_steps if args.do_train else None
            result = {
                'test_loss': test_loss,
                'test_accuracy': test_accuracy,
                'global_step': global_step,
                'loss': loss
            }

        output_test_file = os.path.join(args.output_dir, "test_results.txt")
        with open(output_test_file, "w") as writer:
            logger.info("***** Test results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
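# --- Editor's standalone sketch of the weight-decay grouping used in the
# script above: bias and LayerNorm parameters are excluded from L2 decay.
NO_DECAY = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

def group_parameters(named_parameters, weight_decay=0.01):
    named_parameters = list(named_parameters)
    return [
        {'params': [p for n, p in named_parameters
                    if not any(nd in n for nd in NO_DECAY)],
         'weight_decay': weight_decay},
        {'params': [p for n, p in named_parameters
                    if any(nd in n for nd in NO_DECAY)],
         'weight_decay': 0.0},
    ]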
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help="The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument("--trained_model_dir",
                        default="",
                        type=str,
                        help="Where is the fine-tuned BERT model?")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help="The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--learning_rate",
                        default=3e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help="Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Whether to lower case the input text. True for uncased models, False for cased models."
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help="Number of update steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")

    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train:
        raise ValueError(
            "Training is currently the only implemented execution option. Please set `do_train`."
        )

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        # raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
        print("WARNING: Output directory already exists and is not empty.")
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    # train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_dataset = BERTDataset(args.data_dir,
                                    tokenizer,
                                    seq_len=args.max_seq_length)
        num_train_optimization_steps = int(
            len(train_dataset) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    # Prepare model
    if args.trained_model_dir:
        if os.path.exists(os.path.join(args.output_dir, WEIGHTS_NAME)):
            previous_state_dict = torch.load(
                os.path.join(args.output_dir, WEIGHTS_NAME))
        else:
            from collections import OrderedDict
            previous_state_dict = OrderedDict()
        distant_state_dict = torch.load(
            os.path.join(args.trained_model_dir, WEIGHTS_NAME))
        # note that the final layers of previous model and distant model must have different attribute names!
        previous_state_dict.update(distant_state_dict)
        model = MyBertForMaskedLM.from_pretrained(
            args.trained_model_dir, state_dict=previous_state_dict)
    else:
        model = MyBertForMaskedLM.from_pretrained(args.bert_model)
    model.to(device)

    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    if args.fp16:
        try:
            from apex.optimizers import FusedAdam
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False)
        model, optimizer = amp.initialize(model, optimizer, opt_level="O2")
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

    global_step = 0
    if args.do_train:
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_dataset))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        if args.local_rank == -1:
            train_sampler = RandomSampler(train_dataset)
        else:
            # TODO: check if this works with current data generator from disk that relies on next(file)
            # (it doesn't return item back by index)
            train_sampler = DistributedSampler(train_dataset)
        train_dataloader = DataLoader(train_dataset,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, lm_label_ids = batch
                loss = model(input_ids, segment_ids, input_mask, lm_label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear(
                            global_step / num_train_optimization_steps,
                            args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

        # Save a trained model
        logger.info("** ** * Saving fine-tuned model ** ** *")
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        with open(output_config_file, 'w') as f:
            f.write(model_to_save.config.to_json_string())
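# --- Editor's sketch of the warmup_linear schedule the loop above multiplies
# into the learning rate. This matches the early pytorch-pretrained-bert
# definition to the best of our knowledge; treat it as illustrative.
def warmup_linear_sketch(x, warmup=0.002):
    """x is training progress in [0, 1]; linear ramp-up, then linear decay."""
    if x < warmup:
        return x / warmup
    return 1.0 - x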
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help="The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model predictions and checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help="The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help="Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help="Number of update steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mnli-mm": MnliMismatchedProcessor,
        "mrpc": MrpcProcessor,
        "sst-2": Sst2Processor,
        "sts-b": StsbProcessor,
        "qqp": QqpProcessor,
        "qnli": QnliProcessor,
        "rte": RteProcessor,
        "wnli": WnliProcessor,
    }

    output_modes = {
        "cola": "classification",
        "mnli": "classification",
        "mrpc": "classification",
        "sst-2": "classification",
        "sts-b": "regression",
        "qqp": "classification",
        "qnli": "classification",
        "rte": "classification",
        "wnli": "classification",
    }

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')

    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()  # the processor handles the preprocessing
    output_mode = output_modes[task_name]

    label_list = processor.get_labels()
    num_labels = len(label_list)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        # returns the list of InputExamples here
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    # Prepare model
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(
        str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))
    model = BertForSequenceClassification.from_pretrained(
        args.bert_model, cache_dir=cache_dir,
        num_labels=num_labels)  # so a class like this exists too
    if args.fp16:
        model.half()  # cast to 16-bit precision
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model)  # what is DDP?
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    if args.do_train:
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.01
        }, {
            'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        }]
        if args.fp16:
            try:
                from apex.optimizers import FP16_Optimizer
                from apex.optimizers import FusedAdam
            except ImportError:
                # there is a library called apex, "A Python EXtension"
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
                )
            optimizer = FusedAdam(optimizer_grouped_parameters,
                                  lr=args.learning_rate,
                                  bias_correction=False,
                                  max_grad_norm=1.0)
            if args.loss_scale == 0:
                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
            else:
                optimizer = FP16_Optimizer(optimizer,
                                           static_loss_scale=args.loss_scale)
            warmup_linear = WarmupLinearSchedule(
                warmup=args.warmup_proportion,
                t_total=num_train_optimization_steps)
        else:
            # even without apex, an optimizer is available
            optimizer = BertAdam(optimizer_grouped_parameters,
                                 lr=args.learning_rate,
                                 warmup=args.warmup_proportion,
                                 t_total=num_train_optimization_steps)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    if args.do_train:  # this is probably the main fine-tuning logic
        train_features = convert_examples_to_features(train_examples, label_list,
                                                      args.max_seq_length,
                                                      tokenizer, output_mode)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)

        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()  # set to train mode
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch

                # define a new function to compute loss values for both output_modes
                logits = model(input_ids, segment_ids, input_mask, labels=None)

                if output_mode == "classification":
                    loss_fct = CrossEntropyLoss()
                    loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
                elif output_mode == "regression":
                    loss_fct = MSELoss()
                    loss = loss_fct(logits.view(-1), label_ids.view(-1))

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear.get_lr(
                            global_step, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Save a trained model, configuration and tokenizer
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)

        # Load a trained model and vocabulary that you have fine-tuned
        model = BertForSequenceClassification.from_pretrained(
            args.output_dir, num_labels=num_labels)
        tokenizer = BertTokenizer.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
    else:
        model = BertForSequenceClassification.from_pretrained(
            args.bert_model, num_labels=num_labels)
    model.to(device)

    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        eval_examples = processor.get_dev_examples(args.data_dir)
        eval_features = convert_examples_to_features(eval_examples, label_list,
                                                     args.max_seq_length,
                                                     tokenizer, output_mode)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float)

        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_label_ids)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        model.eval()
        eval_loss = 0
        nb_eval_steps = 0
        preds = []

        for input_ids, input_mask, segment_ids, label_ids in tqdm(
                eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                logits = model(input_ids, segment_ids, input_mask, labels=None)

            # create eval loss and other metric required by the task
            if output_mode == "classification":
                loss_fct = CrossEntropyLoss()
                tmp_eval_loss = loss_fct(logits.view(-1, num_labels),
                                         label_ids.view(-1))
            elif output_mode == "regression":
                loss_fct = MSELoss()
                tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1))

            eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            if len(preds) == 0:
                preds.append(logits.detach().cpu().numpy())
            else:
                preds[0] = np.append(preds[0],
                                     logits.detach().cpu().numpy(),
                                     axis=0)

        eval_loss = eval_loss / nb_eval_steps
        preds = preds[0]
        if output_mode == "classification":
            preds = np.argmax(preds, axis=1)
        elif output_mode == "regression":
            preds = np.squeeze(preds)
        result = compute_metrics(task_name, preds, all_label_ids.numpy())
        loss = tr_loss / global_step if args.do_train else None

        result['eval_loss'] = eval_loss
        result['global_step'] = global_step
        result['loss'] = loss

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

        # hack for MNLI-MM
        if task_name == "mnli":
            task_name = "mnli-mm"
            processor = processors[task_name]()

            if os.path.exists(args.output_dir + '-MM') and os.listdir(
                    args.output_dir + '-MM') and args.do_train:
                raise ValueError(
                    "Output directory ({}) already exists and is not empty.".
                    format(args.output_dir))
            if not os.path.exists(args.output_dir + '-MM'):
                os.makedirs(args.output_dir + '-MM')

            eval_examples = processor.get_dev_examples(args.data_dir)
            eval_features = convert_examples_to_features(
                eval_examples, label_list, args.max_seq_length, tokenizer,
                output_mode)
            logger.info("***** Running evaluation *****")
            logger.info("  Num examples = %d", len(eval_examples))
            logger.info("  Batch size = %d", args.eval_batch_size)
            all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
            all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
            all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)

            eval_data = TensorDataset(all_input_ids, all_input_mask,
                                      all_segment_ids, all_label_ids)
            # Run prediction for full data
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data,
                                         sampler=eval_sampler,
                                         batch_size=args.eval_batch_size)

            model.eval()
            eval_loss = 0
            nb_eval_steps = 0
            preds = []

            for input_ids, input_mask, segment_ids, label_ids in tqdm(
                    eval_dataloader, desc="Evaluating"):
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)

                with torch.no_grad():
                    logits = model(input_ids, segment_ids, input_mask, labels=None)

                loss_fct = CrossEntropyLoss()
                tmp_eval_loss = loss_fct(logits.view(-1, num_labels),
                                         label_ids.view(-1))

                eval_loss += tmp_eval_loss.mean().item()
                nb_eval_steps += 1
                if len(preds) == 0:
                    preds.append(logits.detach().cpu().numpy())
                else:
                    preds[0] = np.append(preds[0],
                                         logits.detach().cpu().numpy(),
                                         axis=0)

            eval_loss = eval_loss / nb_eval_steps
            preds = preds[0]
            preds = np.argmax(preds, axis=1)
            result = compute_metrics(task_name, preds, all_label_ids.numpy())
            loss = tr_loss / global_step if args.do_train else None

            result['eval_loss'] = eval_loss
            result['global_step'] = global_step
            result['loss'] = loss

            output_eval_file = os.path.join(args.output_dir + '-MM',
                                            "eval_results.txt")
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))
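# --- Editor's sketch: for most classification tasks, compute_metrics above
# reduces to simple accuracy over argmax predictions (an assumption; the real
# function also covers F1, Matthews corr. and Pearson/Spearman per task).
import numpy as np

def simple_accuracy(preds, labels):
    return (preds == labels).mean()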
def main():
    args = process_args()

    os.makedirs(args.output_dir, exist_ok=True)

    if args.enable_butd:
        if args.visdial_v == '1.0' and not args.no_vision:
            assert (args.len_vis_input == 36)
        elif args.visdial_v == '0.9':
            assert (args.len_vis_input == 100)
        args.region_bbox_file = os.path.join(args.image_root, args.region_bbox_file)
        args.region_det_file_prefix = os.path.join(
            args.image_root, args.region_det_file_prefix) if args.dataset in (
                'cc', 'coco') and args.region_det_file_prefix != '' else ''

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()

    # fix random seed
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)
    args.max_seq_length = args.len_vis_input + 2 + args.max_len_hist_ques + 2 + args.max_len_ans + 1
    tokenizer.max_len = args.max_seq_length

    bi_uni_pipeline = [
        Preprocess4TestVisdialDisc(
            list(tokenizer.vocab.keys()),
            tokenizer.convert_tokens_to_ids,
            args.max_seq_length,
            new_segment_ids=args.new_segment_ids,
            truncate_config={
                'len_vis_input': args.len_vis_input,
                'max_len_hist_ques': args.max_len_hist_ques,
                'max_len_ans': args.max_len_ans
            },
            mode="bi",
            region_bbox_file=args.region_bbox_file,
            region_det_file_prefix=args.region_det_file_prefix,
            image_features_hdfpath=args.image_features_hdfpath,
            visdial_v=args.visdial_v,
            pad_hist=args.pad_hist,
            inc_full_hist=args.inc_full_hist,
            only_qa=args.only_qa)
    ]

    amp_handle = None
    if args.fp16 and args.amp:
        from apex import amp
        amp_handle = amp.init(enable_caching=True)
        logger.info("enable fp16 with amp")

    # Prepare model
    cls_num_labels = 2
    type_vocab_size = 6 if args.new_segment_ids else 2

    logger.info('Attempting to recover models from: {}'.format(args.model_recover_path))
    if 0 == len(glob.glob(args.model_recover_path.strip())):
        logger.error('There are no models to recover. The program will exit.')
        sys.exit(1)

    for model_recover_path in glob.glob(args.model_recover_path.strip()):
        logger.info("***** Recover model: %s *****", model_recover_path)
        model_recover = torch.load(model_recover_path)
        model = BertForPreTrainingLossMask.from_pretrained(
            args.bert_model,
            state_dict=model_recover,
            num_labels=cls_num_labels,
            type_vocab_size=type_vocab_size,
            task_idx=0,
            max_position_embeddings=512,
            cache_dir=args.output_dir + '/.pretrained_model_{}'.format(-1),
            drop_prob=args.drop_prob,
            enable_butd=args.enable_butd,
            len_vis_input=args.len_vis_input,
            visdial_v=args.visdial_v,
            loss_type=args.loss_type,
            eval_disc=True,
            add_attn_fuse=args.add_attn_fuse,
            no_vision=args.no_vision)
        del model_recover

    if args.fp16:
        model.half()
        # cnn.half()
    model.to(device)
    # cnn.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)
        # cnn = torch.nn.DataParallel(cnn)

    torch.cuda.empty_cache()
    model.eval()

    def read_data(src_file):
        eval_lst = []
        with open(src_file, "r", encoding='utf-8') as f_src:
            data = json.load(f_src)['data']
            dialogs = data['dialogs']
            questions = data['questions']
            answers = data['answers']
            img_idx = 0
            for dialog in tqdm(dialogs):
                if img_idx < args.use_num_imgs or args.use_num_imgs == -1:
                    img_id = dialog['image_id']
                    cap_tokens = tokenizer.tokenize(dialog['caption'])
                    ques_id = [item['question'] for item in dialog['dialog']]
                    ques_tokens = [
                        tokenizer.tokenize(questions[id] + '?') for id in ques_id
                    ]
                    ans_id = [item['answer'] for item in dialog['dialog']]
                    ans_tokens = [tokenizer.tokenize(answers[id]) for id in ans_id]
                    gt_id = [item['gt_index'] for item in dialog['dialog']]
                    ans_opts = [item['answer_options'] for item in dialog['dialog']]
                    ans_opts_tokens = [[
                        tokenizer.tokenize(answers[id]) for id in ans
                    ] for ans in ans_opts]
                    assert len(ques_tokens) == len(ans_tokens) == len(ans_opts_tokens) == 10, \
                        "ques num: %d, ans num: %d, ans opt num: %d" % (
                            len(ques_tokens), len(ans_tokens), len(ans_opts_tokens))
                    assert all([len(ans_opt) == 100 for ans_opt in ans_opts_tokens]), \
                        "all the answers have 100 options"
                    eval_lst.append((img_id, cap_tokens, ques_tokens, ans_tokens,
                                     ans_opts_tokens, gt_id))
                    img_idx += 1
        return eval_lst

    def get_gt_rel_dict(fname):
        gt_rel_dict = {}
        gt_rel_data = json.load(open(fname))
        for item in gt_rel_data:
            image_id = item['image_id']
            round_id = item['round_id']
            gt_relevance = item['gt_relevance']
            # each image only at most has one turn having dense annotation
            if image_id not in gt_rel_dict:
                gt_rel_dict[image_id] = (round_id, gt_relevance)
        return gt_rel_dict

    if args.gt_rel_file != '':
        gt_rel_dict = get_gt_rel_dict(args.gt_rel_file)

    input_lines = read_data(args.src_file)
    next_i = 0
    total_batch = math.ceil(len(input_lines) / args.batch_size)

    print('start the visdial decode evaluation...')
    t0 = time.time()
    ranks_json = []
    sparse_metrics = SparseGTMetrics()
    ndcg = NDCG()

    with tqdm(total=total_batch) as pbar:
        while next_i < len(input_lines):
            _chunk = input_lines[next_i:next_i + args.batch_size]
            buf_id = [x[0] for x in _chunk]
            buf = [x[:-1] for x in _chunk]
            buf_gt_id = [x[-1] for x in _chunk]
            next_i += args.batch_size
            instances = []
            for instance in buf:
                instances.append(bi_uni_pipeline[0](instance))
            with torch.no_grad():
                buf_gt_id = torch.tensor(buf_gt_id).long().to(device)
                batch_data = list(zip(*instances))
                task_idx = torch.tensor(batch_data[-3], dtype=torch.long).to(device)
                if args.no_vision:
                    conv_feats = []
                    vis_pe = []
                else:
                    img, vis_pe = (torch.stack(x).to(device) for x in batch_data[-2:])
                    conv_feats = img.data  # Bx100x2048
                    vis_pe = vis_pe.data

                output_scores_turn = []
                input_ids_turns = [[x[turn_i] for x in batch_data[0]]
                                   for turn_i in range(10)]
                segment_ids_turns = [[x[turn_i] for x in batch_data[1]]
                                     for turn_i in range(10)]
                input_mask_turns = [[x[turn_i] for x in batch_data[2]]
                                    for turn_i in range(10)]
                for turn_i in range(10):
                    input_ids = torch.tensor(input_ids_turns[turn_i],
                                             dtype=torch.long).to(device)
                    segment_ids = torch.tensor(segment_ids_turns[turn_i],
                                               dtype=torch.long).to(device)
                    input_mask = torch.stack(input_mask_turns[turn_i]).to(device)
                    output_scores = model(conv_feats,
                                          vis_pe,
                                          input_ids,
                                          segment_ids,
                                          input_mask,
                                          task_idx=task_idx)
                    output_scores = output_scores[:, :, 1]  # [batch_size, num_options]
                    output_scores_turn.append(output_scores)
                output_scores_turn = torch.stack(
                    output_scores_turn, 1)  # [batch_size, num_rounds, num_options]
                ranks = scores_to_ranks(output_scores_turn)
                # output_scores_turn_cheat = output_scores_turn.scatter_(2, buf_gt_id.unsqueeze(2), 100.0)
                sparse_metrics.observe(output_scores_turn, buf_gt_id)

                for i in range(len(buf_id)):
                    # Cast into types explicitly to ensure no errors in schema.
                    # Round ids are 1-10, not 0-9
                    if args.split == "val":
                        for j in range(10):
                            ranks_json.append({
                                "image_id": buf_id[i],
                                "round_id": int(j + 1),
                                "ranks": [rank.item() for rank in ranks[i][j]],
                            })

                if args.gt_rel_file:
                    scores = []
                    gt_rels = []
                    for i in range(len(buf_id)):
                        if buf_id[i] in gt_rel_dict:
                            turn_idx, gt_rel = gt_rel_dict[buf_id[i]]
                            scores.append(output_scores_turn[i, turn_idx - 1, :])
                            gt_rels.append(
                                torch.tensor(gt_rel, dtype=torch.float32).to(device))
                    scores = torch.stack(scores)
                    gt_rels = torch.stack(gt_rels)
                    ndcg.observe(scores, gt_rels)

            pbar.update(1)

    json.dump(ranks_json, open(args.save_ranks_path, "w"))
    logger.info("Finish writing rankings into %s" % (args.save_ranks_path))

    if args.split == "val":
        fw = open(args.save_ranks_path.replace('.json', '_results.txt'), "w")
        all_metrics = {}
        all_metrics.update(sparse_metrics.retrieve(reset=True))
        if args.gt_rel_file:
            all_metrics.update(ndcg.retrieve(reset=True))
        for metric_name, metric_value in all_metrics.items():
            print(f"{metric_name}: {metric_value}")
            fw.write("%s: %.6f\n" % (metric_name, metric_value))
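# --- Editor's sketch of the contract assumed for scores_to_ranks above
# (an assumption based on the VisDial evaluation convention): convert option
# scores to 1-based ranks, where the highest score gets rank 1.
import torch

def scores_to_ranks_sketch(scores: torch.Tensor) -> torch.Tensor:
    # scores: [batch, rounds, options]; double argsort yields each option's rank
    return scores.argsort(dim=-1, descending=True).argsort(dim=-1) + 1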
def main(config, model_times, myProcessor):
    if not os.path.exists(config.output_dir + model_times):
        os.makedirs(config.output_dir + model_times)

    if not os.path.exists(config.cache_dir + model_times):
        os.makedirs(config.cache_dir + model_times)

    # model output files
    output_model_file = os.path.join(config.output_dir, model_times, WEIGHTS_NAME)
    output_config_file = os.path.join(config.output_dir, model_times, CONFIG_NAME)

    # device setup
    gpu_ids = [int(device_id) for device_id in config.gpu_ids.split()]
    device, n_gpu = get_device(gpu_ids[0])
    if n_gpu > 1:
        n_gpu = len(gpu_ids)

    config.train_batch_size = config.train_batch_size // config.gradient_accumulation_steps

    """ Set random seeds """
    random.seed(config.seed)
    np.random.seed(config.seed)
    torch.manual_seed(config.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(config.seed)

    """ Prepare the data """
    processor = myProcessor()  # the only place in this file that needs changing per task
    tokenizer = BertTokenizer.from_pretrained(
        config.bert_vocab_file, do_lower_case=config.do_lower_case)  # tokenizer choice

    label_list = processor.get_labels()
    num_labels = len(label_list)

    if config.do_train:
        train_dataloader, train_examples_len = load_data(
            config.data_dir, tokenizer, processor, config.max_seq_length,
            config.train_batch_size, "train")
        dev_dataloader, _ = load_data(config.data_dir, tokenizer, processor,
                                      config.max_seq_length,
                                      config.dev_batch_size, "dev")

        num_train_optimization_steps = int(
            train_examples_len / config.train_batch_size /
            config.gradient_accumulation_steps) * config.num_train_epochs

        """ Prepare the model """
        print("model name is {}".format(config.model_name))
        if config.model_name == "BertOrigin":
            from BertOrigin.BertOrigin import BertOrigin
            model = BertOrigin.from_pretrained(config.bert_model_dir,
                                               cache_dir=config.cache_dir,
                                               num_labels=num_labels)
        elif config.model_name == "BertCNN":
            from BertCNN.BertCNN import BertCNN
            filter_sizes = [int(val) for val in config.filter_sizes.split()]
            model = BertCNN.from_pretrained(config.bert_model_dir,
                                            cache_dir=config.cache_dir,
                                            num_labels=num_labels,
                                            n_filters=config.filter_num,
                                            filter_sizes=filter_sizes)
        elif config.model_name == "BertATT":
            from BertATT.BertATT import BertATT
            model = BertATT.from_pretrained(config.bert_model_dir,
                                            cache_dir=config.cache_dir,
                                            num_labels=num_labels)
        elif config.model_name == "BertRCNN":
            from BertRCNN.BertRCNN import BertRCNN
            model = BertRCNN.from_pretrained(config.bert_model_dir,
                                             cache_dir=config.cache_dir,
                                             num_labels=num_labels)
        elif config.model_name == "BertCNNPlus":
            from BertCNNPlus.BertCNNPlus import BertCNNPlus
            filter_sizes = [int(val) for val in config.filter_sizes.split()]
            model = BertCNNPlus.from_pretrained(config.bert_model_dir,
                                                cache_dir=config.cache_dir,
                                                num_labels=num_labels,
                                                n_filters=config.filter_num,
                                                filter_sizes=filter_sizes)

        model.to(device)
        if n_gpu > 1:
            model = torch.nn.DataParallel(model, device_ids=gpu_ids)

        """ Prepare the optimizer """
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.01
        }, {
            'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        }]
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=config.learning_rate,
                             warmup=config.warmup_proportion,
                             t_total=num_train_optimization_steps)

        """ Prepare the loss function """
        criterion = nn.CrossEntropyLoss()
        criterion = criterion.to(device)

        train(config.num_train_epochs, n_gpu, model, train_dataloader,
              dev_dataloader, optimizer, criterion,
              config.gradient_accumulation_steps, device, label_list,
              output_model_file, output_config_file, config.log_dir,
              config.print_step, config.early_stop)

    """ Test """
    test_dataloader, _ = load_data(config.data_dir, tokenizer, processor,
                                   config.max_seq_length,
                                   config.test_batch_size, "test")

    bert_config = BertConfig(output_config_file)

    if config.model_name == "BertOrigin":
        from BertOrigin.BertOrigin import BertOrigin
        model = BertOrigin(bert_config, num_labels=num_labels)
    elif config.model_name == "BertCNN":
        from BertCNN.BertCNN import BertCNN
        filter_sizes = [int(val) for val in config.filter_sizes.split()]
        model = BertCNN(bert_config,
                        num_labels=num_labels,
                        n_filters=config.filter_num,
                        filter_sizes=filter_sizes)
    elif config.model_name == "BertATT":
        from BertATT.BertATT import BertATT
        model = BertATT(bert_config, num_labels=num_labels)
    elif config.model_name == "BertRCNN":
        from BertRCNN.BertRCNN import BertRCNN
        model = BertRCNN(bert_config, num_labels=num_labels)
    elif config.model_name == "BertCNNPlus":
        from BertCNNPlus.BertCNNPlus import BertCNNPlus
        filter_sizes = [int(val) for val in config.filter_sizes.split()]
        model = BertCNNPlus(bert_config,
                            num_labels=num_labels,
                            n_filters=config.filter_num,
                            filter_sizes=filter_sizes)

    model.load_state_dict(torch.load(output_model_file))
    model.to(device)

    """ Prepare the loss function """
    criterion = nn.CrossEntropyLoss()
    criterion = criterion.to(device)

    # test the model
    test_loss, test_acc, test_report, test_auc = evaluate(
        model, test_dataloader, criterion, device, label_list)
    print("-------------- Test -------------")
    print(f'\t Loss: {test_loss: .3f} | Acc: {test_acc*100: .3f} % | AUC:{test_auc}')

    for label in label_list:
        print('\t {}: Precision: {} | recall: {} | f1 score: {}'.format(
            label, test_report[label]['precision'],
            test_report[label]['recall'], test_report[label]['f1-score']))
    print_list = ['macro avg', 'weighted avg']
    for label in print_list:
        print('\t {}: Precision: {} | recall: {} | f1 score: {}'.format(
            label, test_report[label]['precision'],
            test_report[label]['recall'], test_report[label]['f1-score']))
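# --- Editor's minimal sketch of the load-for-test pattern used above,
# assuming pytorch-pretrained-bert's BertConfig accepts a JSON config path
# (as the snippet itself does with BertConfig(output_config_file)):
import torch
from pytorch_pretrained_bert import BertConfig

def load_for_test(model_cls, config_file, weights_file, **model_kwargs):
    config = BertConfig(config_file)           # JSON config written at train time
    model = model_cls(config, **model_kwargs)  # rebuild the architecture
    model.load_state_dict(torch.load(weights_file, map_location='cpu'))
    return model.eval()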
def LoadDatasets(args, task_cfg, ids, split='trainval'): tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=True) task_feature_reader1 = {} task_feature_reader2 = {} for i, task_id in enumerate(ids): task = 'TASK' + task_id if task_cfg[task]['features_h5path1'] not in task_feature_reader1: task_feature_reader1[task_cfg[task]['features_h5path1']] = None if task_cfg[task]['features_h5path2'] not in task_feature_reader2: task_feature_reader2[task_cfg[task]['features_h5path2']] = None # initialize the feature readers for features_h5path in task_feature_reader1.keys(): if features_h5path != '': task_feature_reader1[features_h5path] = ImageFeaturesH5Reader( features_h5path, args.in_memory) for features_h5path in task_feature_reader2.keys(): if features_h5path != '': task_feature_reader2[features_h5path] = ImageFeaturesH5Reader( features_h5path, args.in_memory) task_datasets_train = {} task_datasets_val = {} task_dataloader_train = {} task_dataloader_val = {} task_ids = [] task_batch_size = {} task_num_iters = {} for i, task_id in enumerate(ids): task = 'TASK' + task_id task_ids.append(task) batch_size = task_cfg[task][ 'batch_size'] // args.gradient_accumulation_steps num_workers = args.num_workers if args.local_rank != -1: batch_size = int(batch_size / dist.get_world_size()) num_workers = int(num_workers / dist.get_world_size()) # num_workers = int(num_workers / len(ids)) logger.info("Loading %s Dataset with batch size %d" % (task_cfg[task]['name'], batch_size)) task_datasets_train[task] = None if 'train' in split: task_datasets_train[task] = DatasetMapTrain[task]( task=task_cfg[task]['name'], dataroot=task_cfg[task]['dataroot'], annotations_jsonpath=task_cfg[task] ['train_annotations_jsonpath'], split=task_cfg[task]['train_split'], image_features_reader=task_feature_reader1[ task_cfg[task]['features_h5path1']], gt_image_features_reader=task_feature_reader2[ task_cfg[task]['features_h5path2']], tokenizer=tokenizer, padding_index=0, max_seq_length=task_cfg[task]['max_seq_length'], max_region_num=task_cfg[task]['max_region_num'], ) task_datasets_val[task] = None if 'val' in split: task_datasets_val[task] = DatasetMapTrain[task]( task=task_cfg[task]['name'], dataroot=task_cfg[task]['dataroot'], annotations_jsonpath=task_cfg[task] ['val_annotations_jsonpath'], split=task_cfg[task]['val_split'], image_features_reader=task_feature_reader1[ task_cfg[task]['features_h5path1']], gt_image_features_reader=task_feature_reader2[ task_cfg[task]['features_h5path2']], tokenizer=tokenizer, padding_index=0, max_seq_length=task_cfg[task]['max_seq_length'], max_region_num=task_cfg[task]['max_region_num']) task_num_iters[task] = 0 task_batch_size[task] = 0 if 'train' in split: if args.local_rank == -1: train_sampler = RandomSampler(task_datasets_train[task]) else: #TODO: check if this works with current data generator from disk that relies on next(file) # (it doesn't return item back by index) train_sampler = DistributedSampler(task_datasets_train[task]) # num_workers = 1 task_dataloader_train[task] = DataLoader( task_datasets_train[task], sampler=train_sampler, # shuffle=False, batch_size=batch_size, num_workers=num_workers, pin_memory=True, ) task_num_iters[task] = len(task_dataloader_train[task]) task_batch_size[task] = batch_size if 'val' in split: task_dataloader_val[task] = DataLoader( task_datasets_val[task], shuffle=False, batch_size=batch_size, num_workers=num_workers, pin_memory=True, ) return task_batch_size, task_num_iters, task_ids, task_datasets_train, task_datasets_val, task_dataloader_train, task_dataloader_val
def LoadDatasetEval(args, task_cfg, ids): tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=True) task_feature_reader1 = {} task_feature_reader2 = {} for i, task_id in enumerate(ids): task = 'TASK' + task_id if task_cfg[task]['features_h5path1'] not in task_feature_reader1: task_feature_reader1[task_cfg[task]['features_h5path1']] = None if task_cfg[task]['features_h5path2'] not in task_feature_reader2: task_feature_reader2[task_cfg[task]['features_h5path2']] = None # initialize the feature readers for features_h5path in task_feature_reader1.keys(): if features_h5path != '': task_feature_reader1[features_h5path] = ImageFeaturesH5Reader( features_h5path, args.in_memory) for features_h5path in task_feature_reader2.keys(): if features_h5path != '': task_feature_reader2[features_h5path] = ImageFeaturesH5Reader( features_h5path, args.in_memory) task_datasets_val = {} task_dataloader_val = {} task_ids = [] task_batch_size = {} task_num_iters = {} for i, task_id in enumerate(ids): task = 'TASK' + task_id task_ids.append(task) batch_size = args.batch_size if args.local_rank != -1: batch_size = int(batch_size / dist.get_world_size()) num_workers = int(args.num_workers / len(ids)) logger.info("Loading %s Dataset with batch size %d" % (task_cfg[task]['name'], batch_size)) if args.split: eval_split = args.split else: eval_split = task_cfg[task]['val_split'] task_datasets_val[task] = DatasetMapEval[task]( task=task_cfg[task]['name'], dataroot=task_cfg[task]['dataroot'], annotations_jsonpath=task_cfg[task]['val_annotations_jsonpath'], split=eval_split, image_features_reader=task_feature_reader1[task_cfg[task] ['features_h5path1']], gt_image_features_reader=task_feature_reader2[ task_cfg[task]['features_h5path2']], tokenizer=tokenizer, padding_index=0, max_seq_length=task_cfg[task]['max_seq_length'], max_region_num=task_cfg[task]['max_region_num']) task_dataloader_val[task] = DataLoader( task_datasets_val[task], shuffle=False, batch_size=batch_size, num_workers=num_workers, pin_memory=True, ) task_num_iters[task] = len(task_dataloader_val[task]) task_batch_size[task] = batch_size return task_batch_size, task_num_iters, task_ids, task_datasets_val, task_dataloader_val
def main(model_path): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--bert_model", default="bert-base-uncased", type=str, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default="MRPC", type=str, help="The name of the task to train.") parser.add_argument("--testing_file", type=str) parser.add_argument("--predict_file", type=str) ## Other parameters parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_eval", default=True, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", default=True, action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=2, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=4, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=2e-4, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=20.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") args = parser.parse_args() processors = {"mrpc": MrpcProcessor} if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() output_mode = "classification" label_list = processor.get_labels() num_labels = len(label_list) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) # Prepare model cache_dir = args.cache_dir if args.cache_dir else os.path.join( str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format( args.local_rank)) model = BertForSequenceClassification.from_pretrained( args.bert_model, cache_dir=cache_dir, num_labels=num_labels) model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] state_dict = torch.load(model_path, map_location=device) model = BertForSequenceClassification.from_pretrained( 'bert-base-uncased', state_dict=state_dict, cache_dir=cache_dir, num_labels=5) model.to(device)
def _truncate_seq_pair(tokens_a, tokens_b, max_length): # This is a simple heuristic which will always truncate the longer sequence # one token at a time. This makes more sense than truncating an equal percent # of tokens from each, since if one sequence is very short then each token # that's truncated likely contains more information than a longer sequence. while True: total_length = len(tokens_a) + len(tokens_b) if total_length <= max_length: break if len(tokens_a) > len(tokens_b): tokens_a.pop() else: tokens_b.pop() tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True) train_examples = getTrainData("GoogleDrive/My Drive/Data/Data_Train.xlsx", args['train_size']) train_features = convert_examples_to_features(train_examples, 512, tokenizer) # reuse the tokenizer built above instead of constructing a second identical one def get_model(): if model_state_dict: model = BertForSequenceClassification.from_pretrained(args['bert_model'], num_labels=num_labels, state_dict=model_state_dict) else: model = BertForSequenceClassification.from_pretrained(args['bert_model'], num_labels=num_labels) return model
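A quick sanity check of the truncation heuristic above; the token lists reuse the wordpiece example from the docstrings elsewhere in this collection and are purely illustrative:

# Illustrative only: the longer of the two sequences loses tokens from its end
# until the combined length fits the budget.
tokens_a = ["is", "this", "jack", "##son", "##ville", "?"]
tokens_b = ["no", "it", "is", "not", "."]
_truncate_seq_pair(tokens_a, tokens_b, max_length=8)
assert len(tokens_a) + len(tokens_b) == 8
print(tokens_a, tokens_b)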
def reset(self): # load the vocabulary that ships with the pretrained Bert model self.tokenizer = BertTokenizer(vocab_file=self.vocab_path) # build the examples self.build_examples()
class CreateDataset(Dataset): def __init__(self, data_path, max_seq_len, vocab_path, example_type, seed): self.seed = seed self.max_seq_len = max_seq_len self.example_type = example_type self.data_path = data_path self.vocab_path = vocab_path self.reset() # initialization def reset(self): # load the vocabulary that ships with the pretrained Bert model self.tokenizer = BertTokenizer(vocab_file=self.vocab_path) # build the examples self.build_examples() # read the dataset def read_data(self, quotechar=None): ''' Data is tab-separated by default :param quotechar: :return: ''' lines = [] with open(self.data_path, 'r', encoding='utf-8') as fr: reader = csv.reader(fr, delimiter='\t', quotechar=quotechar) for line in reader: lines.append(line) return lines # build the data examples def build_examples(self): lines = self.read_data() self.examples = [] for i, line in enumerate(lines): guid = '%s-%d' % (self.example_type, i) label = line[0] text_a = line[1] example = InputExample(guid=guid, text_a=text_a, label=label) self.examples.append(example) del lines # convert an example into a feature def build_features(self, example): ''' # For a sentence pair: # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 # For a single sentence: # tokens: [CLS] the dog is hairy . [SEP] # type_ids: 0 0 0 0 0 0 0 # type_ids: marks whether a token belongs to the first or the second sentence ''' # convert to wordpiece tokens tokens_a = self.tokenizer.tokenize(example.text_a) # Account for [CLS] and [SEP] with "- 2" if len(tokens_a) > self.max_seq_len - 2: tokens_a = tokens_a[:(self.max_seq_len - 2)] # add the special tokens at the start and end of the sentence tokens = ['[CLS]'] + tokens_a + ['[SEP]'] segment_ids = [0] * len(tokens) # the corresponding type_ids # map tokens to their vocabulary ids input_ids = self.tokenizer.convert_tokens_to_ids(tokens) # input mask input_mask = [1] * len(input_ids) # pad with zeros padding = [0] * (self.max_seq_len - len(input_ids)) input_ids += padding input_mask += padding segment_ids += padding # label label_id = int(example.label) feature = InputFeature(input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, label_id=label_id) return feature def _preprocess(self, index): example = self.examples[index] feature = self.build_features(example) return np.array(feature.input_ids), np.array(feature.input_mask), np.array(feature.segment_ids), np.array(feature.label_id) def __getitem__(self, index): return self._preprocess(index) def __len__(self): return len(self.examples)
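A usage sketch for CreateDataset above; the paths, batch size and sequence length below are placeholders, not values from the original project:

from torch.utils.data import DataLoader

# CreateDataset returns numpy arrays from __getitem__; the default collate
# function stacks them into tensors of shape (batch, max_seq_len).
dataset = CreateDataset(data_path="data/train.tsv",          # label<TAB>text per line
                        max_seq_len=128,
                        vocab_path="bert-base-chinese-vocab.txt",
                        example_type="train",
                        seed=42)
loader = DataLoader(dataset, batch_size=32, shuffle=True)
for input_ids, input_mask, segment_ids, label_ids in loader:
    print(input_ids.shape, label_ids.shape)                  # (32, 128) (32,)
    break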
saved_model_path = os.path.join(output_dir, "saved_models", job_name) os.makedirs(saved_model_path, exist_ok=True) else: saved_model_path = args.output_dir summary_writer = None # Prepare Summary Writer and saved_models path if check_write_log(): #azureml.tensorboard only streams from /logs directory, therefore hardcoded summary_writer = get_sample_writer(name=job_name, base='./logs') # Loading Tokenizer (vocabulary from blob storage, if exists) logger.info("Extracting the vocabulary") if args.tokenizer_path: logger.info(f'Loading tokenizer from {args.tokenizer_path}') tokenizer = BertTokenizer.from_pretrained(args.tokenizer_path, cache_dir=args.output_dir) else: tokenizer = BertTokenizer.from_pretrained( job_config.get_token_file_type(), cache_dir=args.output_dir) logger.info("Vocabulary contains {} tokens".format( len(list(tokenizer.vocab.keys())))) # Loading Model logger.info("Initializing BertMultiTask model") model = BertMultiTask(job_config=job_config, use_pretrain=use_pretrain, tokenizer=tokenizer, cache_dir=args.output_dir, device=device, write_log=check_write_log(), summary_writer=summary_writer)
def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument("--data_dir", default=None, type=str, required=True, help="The input data dir. Should contain the .tsv files (or other data files) for the task.") parser.add_argument("--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output directory where the model predictions and checkpoints will be written.") parser.add_argument("--model_recover_path", default=None, type=str, help="The file of fine-tuned pretraining model.") parser.add_argument("--optim_recover_path", default=None, type=str, help="The file of pretraining optimizer.") # Other parameters parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", default=False, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--do_lower_case", default=False, action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=64, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.") parser.add_argument('--fp16', default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--loss_scale', type=float, default=0, help="Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") args = parser.parse_args() args.output_dir = args.output_dir.replace( '[PT_OUTPUT_DIR]', os.getenv('PT_OUTPUT_DIR', '')) if args.local_rank == -1 or args.no_cuda: device = torch.device( "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( args.gradient_accumulation_steps)) args.train_batch_size = int( args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") os.makedirs(args.output_dir, exist_ok=True) json.dump(args.__dict__, open(os.path.join( args.output_dir, 'opt.json'), 'w'), sort_keys=True, indent=2) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() num_labels = num_labels_task[task_name] label_list = processor.get_labels() tokenizer = BertTokenizer.from_pretrained( args.bert_model, do_lower_case=args.do_lower_case) train_examples = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) amp_handle = None if args.fp16: from apex import amp amp_handle = amp.init(enable_caching=True) # Prepare model if (args.model_recover_path is None) or len(args.model_recover_path) == 0: model = BertForSequenceClassification.from_pretrained( args.bert_model, num_labels=num_labels) else: if not Path(args.model_recover_path).exists(): logger.info("Path does not exist: {0}".format( args.model_recover_path)) sys.exit(0) logger.info( "***** Recover model: {0} *****".format(args.model_recover_path)) model = BertForSequenceClassification.from_pretrained( args.bert_model, state_dict=torch.load(args.model_recover_path), num_labels=num_labels) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any( nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any( nd in n for nd in no_decay)], 'weight_decay': 0.0} ] # note: args.train_batch_size has been changed to (/= args.gradient_accumulation_steps) if args.do_train: t_total = int(len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) else: t_total = 1 optimizer = BertAdam(optimizer_grouped_parameters, 
lr=args.learning_rate, warmup=args.warmup_proportion, t_total=t_total) if args.local_rank != -1: t_total = t_total // torch.distributed.get_world_size() if args.fp16: try: from apex.fp16_utils.fp16_optimizer import FP16_Optimizer except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer( optimizer, static_loss_scale=args.loss_scale) logger.info("***** CUDA.empty_cache() *****") torch.cuda.empty_cache() if args.task_name == 'sts-b': if args.fp16: lbl_type = torch.half else: lbl_type = torch.float else: lbl_type = torch.long # if all epoch checkpoints exist, skip the whole training process all_exist = True for i_epoch in range(1, int(args.num_train_epochs)+1): output_model_file = os.path.join( args.output_dir, "model.{0}.bin".format(i_epoch)) if not Path(output_model_file).exists(): all_exist = False break global_step = 0 if args.do_train and (not all_exist): train_features = convert_examples_to_features( train_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", t_total) all_input_ids = torch.tensor( [f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor( [f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor( [f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor( [f.label_id for f in train_features], dtype=lbl_type) train_data = TensorDataset( all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader( train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for i_epoch in trange(1, int(args.num_train_epochs)+1, desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 iter_bar = tqdm(train_dataloader, desc='Iter (loss=X.XXX)') for step, batch in enumerate(iter_bar): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) if amp_handle: amp_handle._clear_cache() else: loss.backward() tr_loss += loss.item() iter_bar.set_description('Iter (loss=%5.3f)' % loss.item()) nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() optimizer.zero_grad() global_step += 1 # Save a trained model model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join( args.output_dir, "model.{0}.bin".format(i_epoch)) torch.save(model_to_save.state_dict(), output_model_file) # delete unused variables del optimizer #del model del param_optimizer del optimizer_grouped_parameters # Load a trained model that you have fine-tuned if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): seg_result_dict = {} for i_epoch in trange(1, int(args.num_train_epochs)+1, desc="Epoch"): logger.info("***** CUDA.empty_cache() *****") torch.cuda.empty_cache() del model output_model_file = os.path.join( args.output_dir, "model.{0}.bin".format(i_epoch)) model_state_dict = torch.load(output_model_file) model = BertForSequenceClassification.from_pretrained( args.bert_model, state_dict=model_state_dict, num_labels=num_labels) model.to(device) if n_gpu > 1: model = torch.nn.DataParallel(model) eval_set_list = [] for eval_segment in processor.get_dev_segments(): eval_examples = processor.get_dev_examples( args.data_dir, segment=eval_segment) eval_set_list.append((eval_segment, eval_examples)) break for eval_segment, eval_examples in eval_set_list: eval_features = convert_examples_to_features( eval_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running evaluation: %s *****", eval_segment) logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor( [f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor( [f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor( [f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor( [f.label_id for f in eval_features], dtype=lbl_type) eval_data = TensorDataset( all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader( eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_result = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 all_logits, all_label_ids = [], [] for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss = model( input_ids, segment_ids, input_mask, label_ids) logits = model(input_ids, segment_ids, input_mask) if amp_handle: amp_handle._clear_cache() logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() all_logits.append(logits) all_label_ids.append(label_ids) eval_loss += tmp_eval_loss.mean().item() nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps # compute evaluation metric all_logits = np.concatenate(all_logits, axis=0) all_label_ids = np.concatenate(all_label_ids, axis=0) metric_func = processor.get_metric_func() eval_result = 
metric_func(all_logits, all_label_ids) result = {'eval_loss': eval_loss, 'eval_result': eval_result, 'model': output_model_file, 'model_recover_path': args.model_recover_path, 'task_name': args.task_name, 'epoch': i_epoch, 'eval_segment': eval_segment} if eval_segment not in seg_result_dict: seg_result_dict[eval_segment] = [] seg_result_dict[eval_segment].append(result) # logging the results logger.info( "***** Eval results ({0}: {1}) *****".format(eval_segment, i_epoch)) for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) # dump predictions with open(os.path.join(args.output_dir, "{0}.{1}.pred".format(eval_segment, i_epoch)), "w") as f_out: for pred_it in processor.get_pred(all_logits): f_out.write(str(pred_it)) f_out.write('\n') for eval_segment, result_list in seg_result_dict.items(): with open(os.path.join(args.output_dir, eval_segment+".txt"), "w") as f_out: f_out.write(json.dumps(result_list, indent=2, sort_keys=True)) f_out.write('\n')
def train(args): args.train_batch_size=int(args.train_batch_size / args.gradient_accumulation_steps) tokenizer = BertTokenizer.from_pretrained(modelconfig.MODEL_ARCHIVE_MAP[args.bert_model] ) train_examples = data_utils.read_squad_examples(os.path.join(args.data_dir,"train.json"), is_training=True) num_train_steps = int(len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs train_features = data_utils.convert_examples_to_features( train_examples, tokenizer, args.max_seq_length, args.doc_stride, args.max_query_length, is_training=True) logger.info("***** Running training *****") logger.info(" Num orig examples = %d", len(train_examples)) logger.info(" Num split examples = %d", len(train_features)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long) all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_segment_ids, all_input_mask, all_start_positions, all_end_positions) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) #>>>>> validation if args.do_valid: valid_examples=data_utils.read_squad_examples(os.path.join(args.data_dir,"dev.json"), is_training=True) valid_features = data_utils.convert_examples_to_features( valid_examples, tokenizer, args.max_seq_length, args.doc_stride, args.max_query_length, is_training=True) valid_all_input_ids = torch.tensor([f.input_ids for f in valid_features], dtype=torch.long) valid_all_segment_ids = torch.tensor([f.segment_ids for f in valid_features], dtype=torch.long) valid_all_input_mask = torch.tensor([f.input_mask for f in valid_features], dtype=torch.long) valid_all_start_positions = torch.tensor([f.start_position for f in valid_features], dtype=torch.long) valid_all_end_positions = torch.tensor([f.end_position for f in valid_features], dtype=torch.long) valid_data = TensorDataset(valid_all_input_ids, valid_all_segment_ids, valid_all_input_mask, valid_all_start_positions, valid_all_end_positions) logger.info("***** Running validations *****") logger.info(" Num orig examples = %d", len(valid_examples)) logger.info(" Num split examples = %d", len(valid_features)) logger.info(" Batch size = %d", args.train_batch_size) valid_sampler = SequentialSampler(valid_data) valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=args.train_batch_size) best_valid_loss=float('inf') valid_losses=[] #<<<<< end of validation declaration if not args.bert_model.endswith(".pt"): model = BertForQuestionAnswering.from_pretrained(modelconfig.MODEL_ARCHIVE_MAP[args.bert_model] ) else: model = torch.load(args.bert_model) if args.fp16: model.half() model.cuda() # Prepare optimizer param_optimizer = [(k, v) for k, v in model.named_parameters() if v.requires_grad==True] param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, 
{'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] t_total = num_train_steps if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=t_total) global_step = 0 model.train() for _ in range(args.num_train_epochs): for step, batch in enumerate(train_dataloader): batch = tuple(t.cuda() for t in batch) input_ids, segment_ids, input_mask, start_positions, end_positions = batch loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions) if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: # modify learning rate with special warm up BERT uses lr_this_step = args.learning_rate * warmup_linear(global_step/t_total, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 #>>>> perform validation at the end of each epoch . if args.do_valid: model.eval() with torch.no_grad(): losses=[] valid_size=0 for step, batch in enumerate(valid_dataloader): batch = tuple(t.cuda() for t in batch) # multi-gpu does scattering it-self input_ids, segment_ids, input_mask, start_positions, end_positions = batch loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions) losses.append(loss.data.item()*input_ids.size(0) ) valid_size+=input_ids.size(0) valid_loss=sum(losses)/valid_size logger.info("validation loss: %f", valid_loss) valid_losses.append(valid_loss) if valid_loss<best_valid_loss: torch.save(model, os.path.join(args.output_dir, "model.pt") ) best_valid_loss=valid_loss model.train() if args.do_valid: with open(os.path.join(args.output_dir, "valid.json"), "w") as fw: json.dump({"valid_losses": valid_losses}, fw) else: torch.save(model, os.path.join(args.output_dir, "model.pt") )
def _truncate_seq_pair(tokens_a, tokens_b, max_length): while True: total_length = len(tokens_a) + len(tokens_b) if total_length <= max_length: break if len(tokens_a) > len(tokens_b): tokens_a.pop() else: tokens_b.pop() def accuracy(out, labels): outputs = np.argmax(out, axis=1) return np.sum(outputs == labels) def warmup_linear(x, warmup=0.002): if x < warmup: return x/warmup return 1.0 - x device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") processor = SnliProcessor() label_list = processor.get_labels() tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False) model = BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=3) train_examples = processor.get_train_examples('') model.to(device) train_features = convert_examples_to_features( train_examples, label_list, 128, tokenizer) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long).to(device) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long).to(device) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long).to(device) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long).to(device) res = model(all_input_ids, all_segment_ids, all_input_mask, all_label_ids) print(res)
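Note that the final model(...) call above pushes the entire SNLI training set through BERT in one forward pass, which will exhaust GPU memory on any realistically sized split; a minimal mini-batching sketch using the same TensorDataset/DataLoader pattern as the other training scripts in this collection (batch size is illustrative, optimizer steps omitted):

from torch.utils.data import TensorDataset, DataLoader, RandomSampler

# Batch the precomputed feature tensors instead of one giant forward pass.
train_data = TensorDataset(all_input_ids, all_input_mask,
                           all_segment_ids, all_label_ids)
train_loader = DataLoader(train_data, sampler=RandomSampler(train_data),
                          batch_size=32)
for input_ids, input_mask, segment_ids, label_ids in train_loader:
    # pytorch-pretrained-bert returns the loss when labels are supplied
    loss = model(input_ids, segment_ids, input_mask, label_ids)
    loss.backward()  # optimizer.step()/zero_grad() would follow in real training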
def __init__(self): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help= "Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default="None", type=str, required=True, help="The name of the task to train.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) ## Other parameters parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=32, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") args = parser.parse_args() self.args = args self.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") self.processor = PersonanliProcessor() self.num_labels = 3 self.label_list = self.processor.get_labels() self.tokenizer = BertTokenizer.from_pretrained( args.bert_model, do_lower_case=args.do_lower_case) # Load a trained model that you have fine-tuned output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") model_state_dict = torch.load(output_model_file) model = BertForSequenceClassification.from_pretrained( args.bert_model, state_dict=model_state_dict, num_labels=self.num_labels) model.to(self.device) self.model = model
def run_aug(args, save_every_epoch=False): processors = { # you can your processor here "TREC": AugProcessor, "stsa.fine": AugProcessor, "stsa.binary": AugProcessor, "mpqa": AugProcessor, "rt-polarity": AugProcessor, "subj": AugProcessor, } task_name = args.task_name if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) args.data_dir = os.path.join(args.data_dir, task_name) parent_output_dir = args.output_dir args.output_dir = os.path.join(args.output_dir, task_name) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) os.makedirs(args.output_dir, exist_ok=True) processor = processors[task_name]() label_list = processor.get_labels(task_name) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_steps = None train_examples = processor.get_train_examples(args.data_dir) #dev_examples = processor.get_dev_examples(args.data_dir) #train_examples.extend(dev_examples) num_train_steps = int( len(train_examples) / args.train_batch_size * args.num_train_epochs) # Prepare model def load_model(model_name): weights_path = os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, model_name) model = torch.load(weights_path) return model cbert_name = "{}/BertForMaskedLM_{}_epoch_{}".format( task_name.lower(), task_name.lower(), args.finetuned_epoch) model = load_model(cbert_name) model.cuda() # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'gamma', 'beta'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0 }] t_total = num_train_steps optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=t_total) global_step = 0 train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_init_ids = torch.tensor([f.init_ids for f in train_features], dtype=torch.long) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_masked_lm_labels = torch.tensor( [f.masked_lm_labels for f in train_features], dtype=torch.long) train_data = TensorDataset(all_init_ids, all_input_ids, all_input_mask, all_segment_ids, all_masked_lm_labels) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) save_model_dir = os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, task_name) if not os.path.exists(save_model_dir): os.mkdir(save_model_dir) MASK_id = tokenizer.convert_tokens_to_ids(['[MASK]'])[0] origin_train_path = os.path.join(args.output_dir, "train_origin.tsv") save_train_path = os.path.join(args.output_dir, "train.tsv") shutil.copy(origin_train_path, save_train_path) #best_test_acc = train_text_classifier.train("aug_data") #print("before augment best acc:{}".format(best_test_acc)) for e in trange(int(args.num_train_epochs), desc="Epoch"): '''avg_loss = 0. 
for step, batch in enumerate(train_dataloader): model.train() batch = tuple(t.cuda() for t in batch) _, input_ids, input_mask, segment_ids, masked_ids = batch loss = model(input_ids, segment_ids, input_mask, masked_ids) loss.backward() avg_loss += loss.item() optimizer.step() model.zero_grad() if (step + 1) % 50 == 0: print("avg_loss: {}".format(avg_loss / 50)) avg_loss = 0''' torch.cuda.empty_cache() shutil.copy(origin_train_path, save_train_path) save_train_file = open(save_train_path, 'a') tsv_writer = csv.writer(save_train_file, delimiter='\t') #tsv_writer.writerow(['sentence', 'label']) for step, batch in enumerate(train_dataloader): model.eval() batch = tuple(t.cuda() for t in batch) init_ids, _, input_mask, segment_ids, _ = batch input_lens = [sum(mask).item() for mask in input_mask] #masked_idx = np.squeeze([np.random.randint(1, l-1, 1) for l in input_lens]) masked_idx = np.squeeze( [np.random.randint(0, l, max(l // 7, 2)) for l in input_lens]) original_ids = init_ids.clone() for ids, idx in zip(init_ids, masked_idx): ids[idx] = MASK_id predictions = model(init_ids, segment_ids, input_mask) for ids, idx, preds, seg, original_ids in zip( init_ids, masked_idx, predictions, segment_ids, original_ids): #pred = torch.argsort(pred)[:,-e-1][idx] original_str = tokenizer.convert_ids_to_tokens( original_ids.cpu().numpy()) original_str = rev_wordpiece(original_str) pred = torch.argsort(preds)[:, -1][idx] ids[idx] = pred new_str = tokenizer.convert_ids_to_tokens(ids.cpu().numpy()) new_str = rev_wordpiece(new_str) tsv_writer.writerow([new_str, seg[0].item(), original_str]) pred = torch.argsort(preds)[:, -2][idx] ids[idx] = pred new_str = tokenizer.convert_ids_to_tokens(ids.cpu().numpy()) new_str = rev_wordpiece(new_str) tsv_writer.writerow([new_str, seg[0].item(), original_str]) torch.cuda.empty_cache() predictions = predictions.detach().cpu() torch.cuda.empty_cache() bak_train_path = os.path.join(args.output_dir, "train_epoch_{}.tsv".format(e)) shutil.copy(save_train_path, bak_train_path) best_test_acc = train_text_classifier.train_with_default_args( parent_output_dir, args.task_name) print("epoch {} augment best acc:{}".format(e, best_test_acc)) if save_every_epoch: save_model_name = "BertForMaskedLM_" + task_name + "_epoch_" + str( e + 1) save_model_path = os.path.join(save_model_dir, save_model_name) torch.save(model, save_model_path) else: if (e + 1) % 10 == 0: save_model_name = "BertForMaskedLM_" + task_name + "_epoch_" + str( e + 1) save_model_path = os.path.join(save_model_dir, save_model_name) torch.save(model, save_model_path)
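The augmentation loop above masks roughly one seventh of each sentence and writes the top-1 and top-2 wordpiece substitutions back to train.tsv; a simplified single-sentence sketch of that idea (the helper name and the mask-position rule are hypothetical, and it assumes a BertForMaskedLM-style model that returns vocabulary scores when called without masked_lm_labels):

import torch

def augment_once(sentence, tokenizer, model, device, topk=2):
    # Tokenize, mask one inner position, and take the top-k predicted tokens.
    tokens = ["[CLS]"] + tokenizer.tokenize(sentence) + ["[SEP]"]
    ids = tokenizer.convert_tokens_to_ids(tokens)
    masked_idx = len(ids) // 2                    # hypothetical choice of position
    ids[masked_idx] = tokenizer.convert_tokens_to_ids(["[MASK]"])[0]
    input_ids = torch.tensor([ids], device=device)
    with torch.no_grad():
        scores = model(input_ids)                 # (1, seq_len, vocab_size)
    candidates = torch.argsort(scores[0, masked_idx])[-topk:]
    variants = []
    for token_id in candidates.tolist():
        new_ids = list(ids)
        new_ids[masked_idx] = token_id
        variants.append(tokenizer.convert_ids_to_tokens(new_ids))
    return variants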
def main(): parser = ArgumentParser() parser.add_argument('--pregenerated_data', type=Path, required=True) parser.add_argument('--output_dir', type=Path, required=True) parser.add_argument( "--bert_model", type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument("--do_lower_case", action="store_true") parser.add_argument( "--reduce_memory", action="store_true", help= "Store training data as on-disc memmaps to massively reduce memory usage" ) parser.add_argument("--epochs", type=int, default=3, help="Number of epochs to train for") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument('--amp', type=str, default="", help="Apex AMP") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--learning_rate", default=3e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") args = parser.parse_args() assert args.pregenerated_data.is_dir(), \ "--pregenerated_data should point to the folder of files made by pregenerate_training_data.py!" samples_per_epoch = [] for i in range(args.epochs): epoch_file = args.pregenerated_data / f"epoch_{i}.json" metrics_file = args.pregenerated_data / f"epoch_{i}_metrics.json" if epoch_file.is_file() and metrics_file.is_file(): metrics = json.loads(metrics_file.read_text()) samples_per_epoch.append(metrics['num_training_examples']) else: if i == 0: exit("No training data was found!") print( f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs})." ) print( "This script will loop over the available data, but training diversity may be negatively impacted." ) num_data_epochs = i break else: num_data_epochs = args.epochs if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if args.output_dir.is_dir() and list(args.output_dir.iterdir()): logging.warning( f"Output directory ({args.output_dir}) already exists and is not empty!" 
) args.output_dir.mkdir(parents=True, exist_ok=True) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) total_train_examples = 0 for i in range(args.epochs): # The modulo takes into account the fact that we may loop over limited epochs of data total_train_examples += samples_per_epoch[i % len(samples_per_epoch)] num_train_optimization_steps = int(total_train_examples / args.train_batch_size / args.gradient_accumulation_steps) if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model model = BertForPreTraining.from_pretrained(args.bert_model) model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) if args.amp: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.amp) global_step = 0 logging.info("***** Running training *****") logging.info(f" Num examples = {total_train_examples}") logging.info(" Batch size = %d", args.train_batch_size) logging.info(" Num steps = %d", num_train_optimization_steps) model.train() for epoch in range(args.epochs): epoch_dataset = PregeneratedDataset( epoch=epoch, training_path=args.pregenerated_data, tokenizer=tokenizer, num_data_epochs=num_data_epochs, reduce_memory=args.reduce_memory) if args.local_rank == -1: train_sampler = RandomSampler(epoch_dataset) else: train_sampler = DistributedSampler(epoch_dataset) train_dataloader = DataLoader(epoch_dataset, sampler=train_sampler, batch_size=args.train_batch_size) tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch}") as pbar: for step, batch in enumerate(train_dataloader): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch loss = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.amp: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 pbar.update(1) mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps pbar.set_postfix_str(f"Loss: {mean_loss:.5f}") if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() optimizer.zero_grad() global_step += 1 # Save a trained model logging.info("** ** * Saving fine-tuned model ** ** * ") model_to_save = model.module if hasattr( model, 'module') else model # Only save the model itself output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(args.output_dir)
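Since the block above writes WEIGHTS_NAME, CONFIG_NAME and the vocabulary into args.output_dir, the checkpoint can later be reloaded straight from that directory; a minimal sketch (the head class is an assumption, use whichever Bert* head matches the saved weights):

# Reload sketch: pytorch-pretrained-bert's from_pretrained accepts a directory
# containing pytorch_model.bin, bert_config.json and vocab.txt, which is
# exactly what the saving code above produces.
model = BertForPreTraining.from_pretrained(str(args.output_dir))
tokenizer = BertTokenizer.from_pretrained(str(args.output_dir),
                                          do_lower_case=args.do_lower_case)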
def main(): """Main method of this module.""" parser = argparse.ArgumentParser() parser.add_argument("-c", "--inputFile", default=None, type=str, required=True, help="The input data dir") parser.add_argument("-o", "--outputFile", default=None, type=str, help="Output file for predictions") parser.add_argument( "--bert_model", default="bert-base-uncased", type=str, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument("--task_name", default="emw", type=str, help="The name of the task to train.") parser.add_argument("--model_load", default="", type=str, required=True, help="The path of model state.") parser.add_argument( "--max_seq_length", default=256, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--batch_size", default=16, type=int, help="Batch size.") args = parser.parse_args() processors = { "hyperpartisan": HyperProcessor, "emw": EmwProcessor, "emw2": EmwProcessor2, } bert_model = args.bert_model max_seq_length = args.max_seq_length model_path = args.model_load batch_size = args.batch_size task_name = args.task_name.lower() processor = processors[task_name]() label_list = processor.get_labels() inputFile = args.inputFile outputFile = args.outputFile num_labels = len(label_list) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") tokenizer = BertTokenizer.from_pretrained(bert_model) model = BertForSequenceClassification.from_pretrained( bert_model, PYTORCH_PRETRAINED_BERT_CACHE, num_labels=num_labels) try: model.load_state_dict( torch.load(model_path)) # , map_location='cpu' for only cpu except: #When model is parallel model = torch.nn.DataParallel(model) model.load_state_dict( torch.load(model_path)) # , map_location='cpu' for only cpu logger.info("Model state has been loaded.") model.to(device) test_examples = processor.get_test_examples(inputFile) random.shuffle(test_examples) test_dataloader = DataLoader(dataset=HyperpartisanData( test_examples, label_list, max_seq_length, tokenizer), batch_size=batch_size) df = pd.read_csv(inputFile) df["prediction"] = 0 model.eval() for input_ids, input_mask, segment_ids, label_ids, doc_ids in test_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) with torch.no_grad(): logits = model(input_ids, segment_ids, input_mask) logits = logits.detach().cpu().numpy() labels = np.argmax(logits, axis=1) for i in range(len(labels)): df.iloc[int(doc_ids[i].item()), df.columns.get_loc("prediction")] = int(labels[i]) df.to_csv(outputFile, index=False) logger.info("The predictions have been written to the output folder.")
def __init__(self, word2id, gram2id, labelmap, hpara, args): super().__init__() self.spec = locals() self.spec.pop("self") self.spec.pop("__class__") self.spec.pop('args') self.word2id = word2id self.gram2id = gram2id self.labelmap = labelmap self.hpara = hpara self.num_labels = len(self.labelmap) + 1 self.max_seq_length = self.hpara['max_seq_length'] self.max_ngram_size = self.hpara['max_ngram_size'] self.max_ngram_length = self.hpara['max_ngram_length'] self.bert_tokenizer = None self.bert = None self.zen_tokenizer = None self.zen = None self.zen_ngram_dict = None if self.hpara['use_bert']: if args.do_train: cache_dir = args.cache_dir if args.cache_dir else os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank)) self.bert_tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=self.hpara['do_lower_case']) self.bert = BertModel.from_pretrained(args.bert_model, cache_dir=cache_dir) self.hpara['bert_tokenizer'] = self.bert_tokenizer self.hpara['config'] = self.bert.config else: self.bert_tokenizer = self.hpara['bert_tokenizer'] self.bert = BertModel(self.hpara['config']) hidden_size = self.bert.config.hidden_size self.dropout = nn.Dropout(self.bert.config.hidden_dropout_prob) elif self.hpara['use_zen']: if args.do_train: cache_dir = args.cache_dir if args.cache_dir else os.path.join(str(zen.PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank)) self.zen_tokenizer = zen.BertTokenizer.from_pretrained(args.bert_model, do_lower_case=self.hpara['do_lower_case']) self.zen_ngram_dict = zen.ZenNgramDict(args.bert_model, tokenizer=self.zen_tokenizer) self.zen = zen.modeling.ZenModel.from_pretrained(args.bert_model, cache_dir=cache_dir) self.hpara['zen_tokenizer'] = self.zen_tokenizer self.hpara['zen_ngram_dict'] = self.zen_ngram_dict self.hpara['config'] = self.zen.config else: self.zen_tokenizer = self.hpara['zen_tokenizer'] self.zen_ngram_dict = self.hpara['zen_ngram_dict'] self.zen = zen.modeling.ZenModel(self.hpara['config']) hidden_size = self.zen.config.hidden_size self.dropout = nn.Dropout(self.zen.config.hidden_dropout_prob) else: raise ValueError() if self.hpara['use_memory']: self.kv_memory = WordKVMN(hidden_size, len(gram2id)) else: self.kv_memory = None self.classifier = nn.Linear(hidden_size, self.num_labels, bias=False) if self.hpara['decoder'] == 'crf': self.crf = CRF(tagset_size=self.num_labels - 3, gpu=True) else: self.crf = None if args.do_train: self.spec['hpara'] = self.hpara
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default="", type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--model_file", default="", type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese or any pretrained model directory with model.bin and config file" ) parser.add_argument( "--bert_model", default="", type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default="", type=str, required=True, help="The name of the task to train.") parser.add_argument( "--output_dir", default="", type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) parser.add_argument("--num_parts_start", default=-1, type=int, required=True, help="Number of partitions to run train and test on") parser.add_argument("--num_parts_end", default=-1, type=int, required=True, help="Number of partitions to run train and test on") parser.add_argument("--task_num", default=-1, type=int, required=True, help="Number of partitions to run train and test on") ## Other parameters parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=512, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=32, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() processors = { "clinicalhedges": InputProcessor, } num_labels_task = { "clinicalhedges": [2, 2, 2, 2, 2], } if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") task_name = args.task_name.lower() task_num = args.task_num if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() print(processor) num_labels = num_labels_task[task_name][task_num - 1] print(num_labels) label_list = processor.get_labels(task_num - 1) print(label_list) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) file = open( os.path.join(args.output_dir, "Classification_Reports_Task_{}.txt".format(task_num)), 'w') for part_index in range(args.num_parts_start, args.num_parts_end): train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = processor.get_train_examples( args.data_dir, part_index, task_num) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model cache_dir = args.cache_dir if args.cache_dir else os.path.join( str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format( args.local_rank)) model = BertForSequenceClassification.from_pretrained( args.model_file, cache_dir=cache_dir, num_labels=num_labels) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 nb_tr_steps = 0 tr_loss = 0 if args.do_train: train_features = convert_examples_to_features( train_examples, label_list, args.max_seq_length, tokenizer) logger.info( "***** Running training on Part {} Task {}*****".format( part_index, task_num)) logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor( [f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor( [f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) for ep in trange(int(args.num_train_epochs), desc="Epoch"): model.train() tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate(train_dataloader): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear( global_step / num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 eval_examples = processor.get_dev_examples( args.data_dir, part_index, task_num) eval_features = convert_examples_to_features( eval_examples, label_list, args.max_seq_length, tokenizer) print("\n") print("Running evaluation for epoch: {}".format(ep)) all_input_ids = torch.tensor( [f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor( [f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor( [f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor( [f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) logits = model(input_ids, segment_ids, input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples loss = tr_loss / nb_tr_steps if args.do_train else None result = { 'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'global_step': global_step, 'loss': loss } for key in sorted(result.keys()): print(key, str(result[key])) print() if args.do_train: # Save a trained model and the associated configuration model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self if (os.path.exists( os.path.join( args.output_dir, "Model_Part_{}_Task_{}".format(part_index, task_num)))): shutil.rmtree( os.path.join( args.output_dir, "Model_Part_{}_Task_{}".format(part_index, task_num))) os.mkdir( os.path.join( args.output_dir, "Model_Part_{}_Task_{}".format(part_index, task_num))) output_model_file = os.path.join( args.output_dir, "Model_Part_{}_Task_{}".format(part_index, task_num), WEIGHTS_NAME) torch.save(model_to_save.state_dict(), output_model_file) output_config_file = os.path.join( args.output_dir, "Model_Part_{}_Task_{}".format(part_index, task_num), CONFIG_NAME) with open(output_config_file, 'w') as f: f.write(model_to_save.config.to_json_string()) if args.do_eval: # Load a trained model and config that you have fine-tuned output_model_file = os.path.join( args.output_dir, 
"Model_Part_{}_Task_{}".format(part_index, task_num), WEIGHTS_NAME) output_config_file = os.path.join( args.output_dir, "Model_Part_{}_Task_{}".format(part_index, task_num), CONFIG_NAME) config = BertConfig(output_config_file) model = BertForSequenceClassification(config, num_labels=num_labels) model.load_state_dict( torch.load(output_model_file, map_location='cpu')) model.to(device) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = processor.get_test_examples( args.data_dir, part_index, task_num) eval_features = convert_examples_to_features( eval_examples, label_list, args.max_seq_length, tokenizer) complete_user_ids = list() for example in eval_examples: complete_user_ids.append(example.guid) logger.info("***** Running Test for Part {} Task {}*****".format( part_index, task_num)) logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor( [f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor( [f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 complete_label_ids = list() complete_outputs = list() complete_probs = list() for input_ids, input_mask, segment_ids, label_ids in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) logits = model(input_ids, segment_ids, input_mask) last_layer_op = copy.deepcopy(logits) logits = logits.detach().cpu().numpy() sm = torch.nn.Softmax() probabilities = sm(last_layer_op) probabilities = probabilities.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) outputs = np.argmax(logits, axis=1) complete_outputs.extend(outputs) complete_label_ids.extend(label_ids) complete_probs.extend(probabilities[:, 1]) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 outcsv = open(os.path.join( args.output_dir, "Reqd_Labels_Part_{}_Task_{}.csv".format(part_index, task_num)), 'w', encoding='utf8', newline='') writer = csv.writer(outcsv, quotechar='"') writer.writerow(["ID", "True", "Pred", "Prob"]) for user, true, pred, prob in zip(complete_user_ids, complete_label_ids, complete_outputs, complete_probs): writer.writerow([user, true, pred, prob]) outcsv.close() eval_loss = eval_loss / nb_eval_steps eval_loss = eval_loss / nb_eval_steps eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples loss = tr_loss / nb_tr_steps if args.do_train else None result = { 'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'global_step': global_step, 'loss': loss } output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in 
sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) file.write( "\nClassification Report Part- {}\n\n".format(part_index) + classification_report(complete_label_ids, complete_outputs) + "\n\n\n") file.close()
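# NOTE (editor): accuracy() used in the evaluation loops above is assumed to be
# the usual helper from the pytorch-pretrained-bert examples; a minimal sketch:
import numpy as np

def accuracy(out, labels):
    outputs = np.argmax(out, axis=1)
    # number of correct predictions; the caller divides by nb_eval_examples
    return np.sum(outputs == labels)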
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) ## Other parameters parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() # if args.server_ip and args.server_port: # # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script # import ptvsd # print("Waiting for debugger attach") # ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) # ptvsd.wait_for_attach() processors = { "cola": ColaProcessor, "mnli": MnliProcessor, "mnli-mm": MnliMismatchedProcessor, "mrpc": MrpcProcessor, "sst-2": Sst2Processor, "sts-b": StsbProcessor, "qqp": QqpProcessor, "qnli": QnliProcessor, "rte": RteProcessor, "wnli": WnliProcessor, } output_modes = { "cola": "classification", "mnli": "classification", "mrpc": "classification", "sst-2": "classification", "sts-b": "regression", "qqp": "classification", "qnli": "classification", "rte": "classification", "wnli": "classification", } if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() output_mode = output_modes[task_name] label_list = processor.get_labels() #[0,1] num_labels = len(label_list) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_optimization_steps = None if args.do_train: # train_examples = processor.get_train_examples_wenpeng('/home/wyin3/Datasets/glue_data/RTE/train.tsv') train_examples = processor.get_combined_train_examples_wenpeng( '/home/wyin3/Datasets/MNLI-SNLI-SciTail-RTE-SICK/all.5.train.txt', 'SNLI') num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model cache_dir = args.cache_dir if args.cache_dir else os.path.join( str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format( args.local_rank)) model = BertForSequenceClassification.from_pretrained( args.bert_model, cache_dir=cache_dir, num_labels=num_labels) if args.fp16: model.half() model.to(device) # Prepare optimizer param_optimizer = 
list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 nb_tr_steps = 0 tr_loss = 0 max_test_acc = 0.0 if args.do_train: train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer, output_mode) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) if output_mode == "classification": all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) elif output_mode == "regression": all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) iter_co = 0 for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): model.train() batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch # define a new function to compute loss values for both output_modes logits = model(input_ids, segment_ids, input_mask, labels=None) if output_mode == "classification": loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) elif output_mode == "regression": loss_fct = MSELoss() loss = loss_fct(logits.view(-1), label_ids.view(-1)) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear( global_step / num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 iter_co += 1 if iter_co % 1000 == 0: ''' start evaluate on test set after this epoch ''' model.eval() eval_examples = processor.get_test_examples_wenpeng( '/home/wyin3/Datasets/RTE/test_RTE_1235.txt') eval_features = convert_examples_to_features( eval_examples, label_list, args.max_seq_length, tokenizer, output_mode) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor( [f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor( [f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor( [f.segment_ids for f in eval_features], dtype=torch.long) if output_mode == "classification": all_label_ids = torch.tensor( [f.label_id for f in eval_features], dtype=torch.long) elif output_mode == "regression": all_label_ids = torch.tensor( [f.label_id for f in eval_features], dtype=torch.float) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader( eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) eval_loss = 0 nb_eval_steps = 0 preds = [] for input_ids, input_mask, segment_ids, label_ids in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model(input_ids, segment_ids, input_mask, labels=None) # create eval loss and other metric required by the task if output_mode == "classification": loss_fct = CrossEntropyLoss() tmp_eval_loss = loss_fct( logits.view(-1, num_labels), label_ids.view(-1)) elif output_mode == "regression": loss_fct = MSELoss() tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1)) eval_loss += tmp_eval_loss.mean().item() nb_eval_steps += 1 if len(preds) == 0: preds.append(logits.detach().cpu().numpy()) else: preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps preds = preds[0] if output_mode == "classification": preds = np.argmax(preds, axis=1) elif output_mode == "regression": preds = np.squeeze(preds) result = compute_metrics(task_name, preds, all_label_ids.numpy()) loss = tr_loss / nb_tr_steps if args.do_train else None test_acc = result.get("acc") if test_acc > max_test_acc: max_test_acc = test_acc print('test acc:', test_acc, ' max_test_acc:', max_test_acc)
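# NOTE (editor): warmup_linear() used in the fp16 branches above is assumed to
# be the linear warmup/decay schedule from pytorch-pretrained-bert's
# optimization module (BertAdam applies it internally in the non-fp16 path);
# a minimal sketch:
def warmup_linear(x, warmup=0.002):
    # x is the fraction of training completed: ramp up linearly to the peak
    # learning rate, then decay linearly toward zero
    if x < warmup:
        return x / warmup
    return 1.0 - x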
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--input_file", default=None, type=str, required=True) parser.add_argument("--output_file", default=None, type=str, required=True) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) ## Other parameters parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--layers", default="-1,-2,-3,-4", type=str) parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. Sequences longer " "than this will be truncated, and sequences shorter than this will be padded." ) parser.add_argument("--batch_size", default=32, type=int, help="Batch size for predictions.") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info("device: {} n_gpu: {} distributed training: {}".format( device, n_gpu, bool(args.local_rank != -1))) layer_indexes = [int(x) for x in args.layers.split(",")] tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) examples = read_examples(args.input_file) features = convert_examples_to_features(examples=examples, seq_length=args.max_seq_length, tokenizer=tokenizer) unique_id_to_feature = {} for feature in features: unique_id_to_feature[feature.unique_id] = feature model = BertModel.from_pretrained(args.bert_model) model.to(device) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank) elif n_gpu > 1: model = torch.nn.DataParallel(model) all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long) all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_example_index) if args.local_rank == -1: eval_sampler = SequentialSampler(eval_data) else: eval_sampler = DistributedSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size) model.eval() with open(args.output_file, "w", encoding='utf-8') as writer: for input_ids, input_mask, example_indices in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) all_encoder_layers, _ = model(input_ids, token_type_ids=None, attention_mask=input_mask) all_encoder_layers = all_encoder_layers for b, example_index in enumerate(example_indices): feature = features[example_index.item()] unique_id = int(feature.unique_id) # feature = unique_id_to_feature[unique_id] output_json = collections.OrderedDict() output_json["linex_index"] = unique_id all_out_features = [] for (i, token) in 
enumerate(feature.tokens): all_layers = [] for (j, layer_index) in enumerate(layer_indexes): layer_output = all_encoder_layers[int( layer_index)].detach().cpu().numpy() layer_output = layer_output[b] layers = collections.OrderedDict() layers["index"] = layer_index layers["values"] = [ round(x.item(), 6) for x in layer_output[i] ] all_layers.append(layers) out_features = collections.OrderedDict() out_features["token"] = token out_features["layers"] = all_layers all_out_features.append(out_features) output_json["features"] = all_out_features writer.write(json.dumps(output_json) + "\n")
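# NOTE (editor): each line the extractor writes above is one self-contained
# JSON record; a minimal sketch of reading the per-token vectors back (the
# file name is illustrative):
import json

with open("output.jsonl", encoding="utf-8") as f:
    for line in f:
        record = json.loads(line)
        for token_feature in record["features"]:
            token = token_feature["token"]
            # values for the first requested layer, e.g. layer index -1
            top_layer_values = token_feature["layers"][0]["values"]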
# Create an outputs/ folder in the blob storage parent_dir = os.path.join(path, 'outputs', str(run.experiment.name)) output_dir = os.path.join(parent_dir, str(run.id)) os.makedirs(output_dir, exist_ok=True) saved_model_path = os.path.join(output_dir, "saved_models", job_name) summary_writer = None # Prepare Summary Writer and saved_models path if check_write_log(): #azureml.tensorboard only streams from /logs directory, therefore hardcoded summary_writer = get_sample_writer(name=job_name, base='./logs') os.makedirs(saved_model_path, exist_ok=True) # Loading Tokenizer (vocabulary from blob storage, if exists) logger.info("Extracting the vocabulary") tokenizer = BertTokenizer.from_pretrained(job_config.get_token_file_type(), cache_dir=path) logger.info("Vocabulary contains {} tokens".format( len(list(tokenizer.vocab.keys())))) # Loading Model logger.info("Initializing BertMultiTask model") model = BertMultiTask(job_config=job_config, use_pretrain=use_pretrain, tokenizer=tokenizer, cache_dir=path, device=device, write_log=check_write_log(), summary_writer=summary_writer) logger.info("Converting the input parameters") if fp16:
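# NOTE (editor): get_sample_writer() is project-specific; judging from the
# hardcoded-./logs comment above it is presumably a TensorBoard writer rooted
# at ./logs, the only directory azureml.tensorboard streams from. A hedged
# sketch of an equivalent helper:
import os
from torch.utils.tensorboard import SummaryWriter

def get_sample_writer(name, base='./logs'):
    return SummaryWriter(log_dir=os.path.join(base, name))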
def main(): parser = ArgumentParser() parser.add_argument('--train_corpus', type=Path, required=True) parser.add_argument("--output_dir", type=Path, required=True) parser.add_argument("--bert_model", type=str, required=True) # , # choices=["bert-base-uncased", "bert-large-uncased", "bert-base-cased", # "bert-base-multilingual", "bert-base-chinese"]) parser.add_argument("--do_lower_case", action="store_true") parser.add_argument( "--reduce_memory", action="store_true", help= "Reduce memory usage for large datasets by keeping data on disc rather than in memory" ) parser.add_argument("--epochs_to_generate", type=int, default=3, help="Number of epochs of data to pregenerate") parser.add_argument("--max_seq_len", type=int, default=128) parser.add_argument( "--short_seq_prob", type=float, default=0.1, help="Probability of making a short sentence as a training example") parser.add_argument( "--masked_lm_prob", type=float, default=0.15, help="Probability of masking each token for the LM task") parser.add_argument( "--max_predictions_per_seq", type=int, default=20, help="Maximum number of tokens to mask in each sequence") args = parser.parse_args() tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) vocab_list = list(tokenizer.vocab.keys()) with DocumentDatabase(reduce_memory=args.reduce_memory) as docs: with args.train_corpus.open() as f: doc = [] for line in tqdm(f, desc="Loading Dataset", unit=" lines"): line = line.strip() if line == "": docs.add_document(doc) doc = [] else: tokens = tokenizer.tokenize(line) doc.append(tokens) if doc: docs.add_document( doc ) # If the last doc didn't end on a newline, make sure it still gets added if len(docs) <= 1: exit( "ERROR: No document breaks were found in the input file! These are necessary to allow the script to " "ensure that random NextSentences are not sampled from the same document. Please add blank lines to " "indicate breaks between documents in your input file. If your dataset does not contain multiple " "documents, blank lines can be inserted at any natural boundary, such as the ends of chapters, " "sections or paragraphs.") args.output_dir.mkdir(exist_ok=True) for epoch in trange(args.epochs_to_generate, desc="Epoch"): epoch_filename = args.output_dir / f"epoch_{epoch}.json" num_instances = 0 with epoch_filename.open('w') as epoch_file: for doc_idx in trange(len(docs), desc="Document"): doc_instances = create_instances_from_document( docs, doc_idx, max_seq_length=args.max_seq_len, short_seq_prob=args.short_seq_prob, masked_lm_prob=args.masked_lm_prob, max_predictions_per_seq=args.max_predictions_per_seq, vocab_list=vocab_list) doc_instances = [ json.dumps(instance) for instance in doc_instances ] for instance in doc_instances: epoch_file.write(instance + '\n') num_instances += 1 metrics_file = args.output_dir / f"epoch_{epoch}_metrics.json" with metrics_file.open('w') as metrics_file: metrics = { "num_training_examples": num_instances, "max_seq_len": args.max_seq_len } metrics_file.write(json.dumps(metrics))
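# NOTE (editor): the loader above treats a blank line as a document boundary
# and exits if none are found, because random NextSentence negatives must come
# from a different document. A minimal sketch of a valid --train_corpus file
# (contents illustrative):
from pathlib import Path

Path("corpus.txt").write_text(
    "First sentence of document one.\n"
    "Second sentence of document one.\n"
    "\n"                                  # blank line = document break
    "First sentence of document two.\n")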
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument( "--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written." ) ## Other parameters parser.add_argument("--train_file", default=None, type=str, help="SQuAD json for training. E.g., train-v1.1.json") parser.add_argument( "--predict_file", default=None, type=str, help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json" ) parser.add_argument( "--max_seq_length", default=384, type=int, help= "The maximum total input sequence length after WordPiece tokenization. Sequences " "longer than this will be truncated, and sequences shorter than this will be padded." ) parser.add_argument( "--doc_stride", default=128, type=int, help= "When splitting up a long document into chunks, how much stride to take between chunks." ) parser.add_argument( "--max_query_length", default=64, type=int, help= "The maximum number of tokens for the question. Questions longer than this will " "be truncated to this length.") parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.") parser.add_argument("--do_predict", default=False, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10% " "of training.") parser.add_argument( "--n_best_size", default=20, type=int, help= "The total number of n-best predictions to generate in the nbest_predictions.json " "output file.") parser.add_argument( "--max_answer_length", default=30, type=int, help= "The maximum length of an answer that can be generated. This is needed because the start " "and end predictions are not conditioned on one another.") parser.add_argument( "--verbose_logging", default=False, action='store_true', help= "If true, all of the warnings related to data processing will be printed. " "A number of warnings are expected for a normal SQuAD evaluation.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." 
) parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument( '--optimize_on_cpu', default=False, action='store_true', help= "Whether to perform optimization and keep the optimizer averages on CPU" ) parser.add_argument( '--fp16', default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=128, help= 'Loss scaling, positive power of 2 values can improve fp16 convergence.' ) args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') if args.fp16: logger.info( "16-bits training currently not supported in distributed training" ) args.fp16 = False # (see https://github.com/pytorch/pytorch/pull/13496) logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits trainiing: {}" .format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_predict: raise ValueError( "At least one of `do_train` or `do_predict` must be True.") if args.do_train: if not args.train_file: raise ValueError( "If `do_train` is True, then `train_file` must be specified.") if args.do_predict: if not args.predict_file: raise ValueError( "If `do_predict` is True, then `predict_file` must be specified." 
)
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model)

    train_examples = None
    num_train_steps = None
    if args.do_train:
        train_examples = read_squad_examples(input_file=args.train_file, is_training=True)
        num_train_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps * args.num_train_epochs)

    # Prepare model
    model = BertForQuestionAnswering.from_pretrained(
        args.bert_model,
        cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank))
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    if args.fp16:
        param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_())
                           for n, param in model.named_parameters()]
    elif args.optimize_on_cpu:
        param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_())
                           for n, param in model.named_parameters()]
    else:
        param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay_rate': 0.01
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay_rate': 0.0
    }]
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=num_train_steps)
    global_step = 0
    if args.do_train:
        train_features = convert_examples_to_features(
            examples=train_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=True)
        logger.info("***** Running training *****")
        logger.info("  Num orig examples = %d", len(train_examples))
        logger.info("  Num split examples = %d", len(train_features))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_start_positions = torch.tensor([f.start_position for f in train_features],
                                           dtype=torch.long)
        all_end_positions = torch.tensor([f.end_position for f in train_features],
                                         dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                   all_start_positions, all_end_positions)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                if n_gpu == 1:
                    batch = tuple(t.to(device) for t in batch)  # multi-gpu does scattering itself
                input_ids, input_mask, segment_ids, start_positions, end_positions = batch
                loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
if args.fp16 and args.loss_scale != 1.0: # rescale loss for fp16 training # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html loss = loss * args.loss_scale if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16 or args.optimize_on_cpu: if args.fp16 and args.loss_scale != 1.0: # scale down gradients for fp16 training for param in model.parameters(): if param.grad is not None: param.grad.data = param.grad.data / args.loss_scale is_nan = set_optimizer_params_grad( param_optimizer, model.named_parameters(), test_nan=True) if is_nan: logger.info( "FP16 TRAINING: Nan in gradients, reducing loss scaling" ) args.loss_scale = args.loss_scale / 2 model.zero_grad() continue optimizer.step() copy_optimizer_params_to_model( model.named_parameters(), param_optimizer) else: optimizer.step() model.zero_grad() global_step += 1 if args.do_predict: eval_examples = read_squad_examples(input_file=args.predict_file, is_training=False) eval_features = convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=False) logger.info("***** Running predictions *****") logger.info(" Num orig examples = %d", len(eval_examples)) logger.info(" Num split examples = %d", len(eval_features)) logger.info(" Batch size = %d", args.predict_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index) if args.local_rank == -1: eval_sampler = SequentialSampler(eval_data) else: eval_sampler = DistributedSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size) model.eval() all_results = [] logger.info("Start evaluating") for input_ids, input_mask, segment_ids, example_indices in tqdm( eval_dataloader, desc="Evaluating"): if len(all_results) % 1000 == 0: logger.info("Processing example: %d" % (len(all_results))) input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) with torch.no_grad(): batch_start_logits, batch_end_logits = model( input_ids, segment_ids, input_mask) for i, example_index in enumerate(example_indices): start_logits = batch_start_logits[i].detach().cpu().tolist() end_logits = batch_end_logits[i].detach().cpu().tolist() eval_feature = eval_features[example_index.item()] unique_id = int(eval_feature.unique_id) all_results.append( RawResult(unique_id=unique_id, start_logits=start_logits, end_logits=end_logits)) output_prediction_file = os.path.join(args.output_dir, "predictions.json") output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json") write_predictions(eval_examples, eval_features, all_results, args.n_best_size, args.max_answer_length, args.do_lower_case, output_prediction_file, output_nbest_file, args.verbose_logging)
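# NOTE (editor): RawResult collected during prediction above is assumed to be
# the namedtuple from the original run_squad example; a sketch for
# self-containedness:
import collections

RawResult = collections.namedtuple("RawResult",
                                   ["unique_id", "start_logits", "end_logits"])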
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .csv files (or other data files) for the task." ) parser.add_argument( "--output_sentvec_file", default=None, type=str, required=True, help="The output file of extracted embedding files of sentences.") parser.add_argument( "--data_split_to_extract", default=None, type=str, required=True, help="The output file of extracted embedding files of sentences.") parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written." ) parser.add_argument("--epoch_id", default=0, type=int, help="Epoch id to extract.") parser.add_argument( "--save_model_name", default="model", type=str, required=True, help= "The output model name where the model checkpoints will be written.") ## Other parameters parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--with_dev", action='store_true', help="Whether to run training with dev.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--do_test", action='store_true', help="Whether to run test on the test set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--layer_id", default=-1, type=int, help="Output Layer Id") parser.add_argument("--mlp_hidden_dim", default=64, type=int, help="mlp_hidden_dim.") parser.add_argument("--mlp_dropout", default=0.1, type=float, help="hidden drop out") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--patience', type=int, default=5, help="early stop epoch nums on dev") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." 
) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") args = parser.parse_args() print("torch.cuda.is_available()", torch.cuda.is_available()) if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval and not args.do_test: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir(args.output_dir): print( "WARNING: Output directory ({}) already exists and is not empty.". format(args.output_dir)) # raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = read_csqa_examples( os.path.join(args.data_dir, 'train_rand_split.jsonl')) dev_examples = read_csqa_examples( os.path.join(args.data_dir, 'dev_rand_split.jsonl')) print(len(train_examples)) if args.with_dev: train_examples += dev_examples print(len(train_examples)) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model model = BertForMultipleChoiceExtraction.from_pretrained( args.bert_model, cache_dir=os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(args.local_rank)), num_choices=5, mlp_hidden_dim=args.mlp_hidden_dim, mlp_dropout=args.mlp_dropout) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used # thus it produce None grad that break apex param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 model.to(device) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Load a trained model and config that you have fine-tuned output_model_file = os.path.join( args.output_dir, args.save_model_name + ".bin.%d" % (args.epoch_id)) output_config_file = os.path.join(args.output_dir, args.save_model_name + ".config") config = BertConfig(output_config_file) model = BertForMultipleChoiceExtraction( config, num_choices=5, mlp_hidden_dim=args.mlp_hidden_dim, mlp_dropout=args.mlp_dropout) model.load_state_dict(torch.load(output_model_file)) model.to(device) # to extract dev_rand_split.jsonl 'dev_rand_split.jsonl' eval_examples = read_csqa_examples( os.path.join(args.data_dir, args.data_split_to_extract)) eval_features = convert_examples_to_features(eval_examples, tokenizer, args.max_seq_length, True) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long) all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 pooled_sent_vecs = [] for input_ids, input_mask, segment_ids, label_ids in tqdm( eval_dataloader, desc="Iteration"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): # tmp_eval_loss, pooled_output = model(input_ids, segment_ids, input_mask, label_ids) logits, pooled_output = model(input_ids, segment_ids, input_mask, layer_id=args.layer_id) pooled_sent_vecs.append(pooled_output) # print(pooled_output.size()) logits = logits.detach().cpu().numpy() label_ids = 
label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 pooled_sent_vecs = torch.cat(pooled_sent_vecs, dim=0) print(pooled_sent_vecs.size()) output_numpy = pooled_sent_vecs.to('cpu').numpy() print(output_numpy.shape) np.save(args.output_sentvec_file + ".%d" % (args.layer_id), output_numpy) eval_accuracy = eval_accuracy / nb_eval_examples result = {'eval_accuracy': eval_accuracy} logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key]))
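# NOTE (editor): the sentence vectors saved above can be reloaded directly; a
# minimal sketch (np.save() appends the ".npy" suffix automatically):
import numpy as np

vecs = np.load(args.output_sentvec_file + ".%d" % args.layer_id + ".npy")
print(vecs.shape)  # one pooled vector per evaluated input row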
def main(): parser = argparse.ArgumentParser() parser.add_argument("--input_file", default=None, type=str, required=True) parser.add_argument("--output_file", default=None, type=str, required=True) parser.add_argument( "--bert_model", default="/home/ryuto/data/jap_BERT/", type=str, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument( "--vocab", default="/home/ryuto/data/NTC_Matsu_original/wordIndex.txt", type=str) # model parameters parser.add_argument( "--do_lower_case", action='store_true', help= "Set this flag if you are using an uncased model. (If Japanese model, set false)" ) parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. Sequences longer " "than this will be truncated, and sequences shorter than this will be padded." ) # Data Augmentation Option parser.add_argument('--data_ratio', type=float, default=100, help="full size = 100 (default=100)") parser.add_argument("--token_strategy", dest='how_select', default="argmax", type=str, help="Choose from 'argmax' or 'sample'") parser.add_argument( '--predicate', action='store_true', help="If True, target word is replaced even if it is predicate.") # Hyper parameter parser.add_argument('--seed', type=int, default=2020) parser.add_argument('--replace_max', type=int, default=5) parser.add_argument('--replace_min', type=int, default=3) parser.add_argument('--n_sample', type=int, default=3) args = parser.parse_args() # Seed random.seed(args.seed) # vocab & tokenizer vocab = set_vocab(args.vocab) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) # Create MASK instances instances = create_masked_instances(args) # Create dataset features = convert_instances_to_features(instances=instances, seq_length=args.max_seq_length, tokenizer=tokenizer) # model device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model = BertForMaskedLM.from_pretrained(args.bert_model) model.to(device) model.eval() with open(args.output_file, "w", encoding='utf-8') as writer: for feature in tqdm(features): feature.send_to_device(device) instance = prediction(model=model, feature=feature, tokenizer=tokenizer, how_select=args.how_select) instance = convert_bert_predicts_to_ids(instance=instance, vocab=vocab) print(json.dumps(instance), file=writer)
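# NOTE (editor): a hedged sketch of the 'argmax' vs 'sample' strategies that
# --token_strategy chooses between for each masked position; prediction() is
# project code, so this only illustrates the core selection step:
import torch

def select_token_id(logits_at_mask, how_select="argmax"):
    # logits_at_mask: 1-D tensor of vocabulary logits at a [MASK] position
    if how_select == "sample":
        probs = torch.softmax(logits_at_mask, dim=-1)
        return torch.multinomial(probs, num_samples=1).item()
    return torch.argmax(logits_at_mask, dim=-1).item()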