def load_bert(model_path="bert/model/pytorch_model.bin",
              config_file="bert/config_parameters/config.json"):
    print("Loading BERT model...")
    config = BertConfig(config_file)
    model = BertForQuestionAnswering(config)
    model.load_state_dict(
        torch.load(model_path, map_location=torch.device("cpu")))
    print("Model loaded.\n\n")
    return model
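# A minimal usage sketch for load_bert() above; it assumes the default
# checkpoint and config paths exist on disk (repository-specific layout).
# The helper name is hypothetical, not part of the original script.
def _demo_load_bert():
    qa_model = load_bert()
    qa_model.eval()  # inference only, so switch off dropout
    return qa_model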
def test_BertForQuestionAnswering():
    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
    config = BertConfig(vocab_size_or_config_json_file=32000,
                        hidden_size=768,
                        num_hidden_layers=12,
                        num_attention_heads=12,
                        intermediate_size=3072)
    model = BertForQuestionAnswering(config)
    print(model(input_ids, token_type_ids, input_mask))
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
                             "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model checkpoints and predictions will be written.")

    ## Other parameters
    parser.add_argument("--train_file", default=None, type=str,
                        help="SQuAD json for training. E.g., train-v1.1.json")
    parser.add_argument("--predict_file", default=None, type=str,
                        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
    parser.add_argument("--max_seq_length", default=384, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
                             "longer than this will be truncated, and sequences shorter than this will be padded.")
    parser.add_argument("--doc_stride", default=128, type=int,
                        help="When splitting up a long document into chunks, how much stride to take between chunks.")
    parser.add_argument("--max_query_length", default=64, type=int,
                        help="The maximum number of tokens for the question. Questions longer than this will "
                             "be truncated to this length.")
    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
    parser.add_argument("--do_predict", action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.")
    parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.")
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% "
                             "of training.")
    parser.add_argument("--n_best_size", default=20, type=int,
                        help="The total number of n-best predictions to generate in the nbest_predictions.json "
                             "output file.")
    parser.add_argument("--max_answer_length", default=30, type=int,
                        help="The maximum length of an answer that can be generated. This is needed because the start "
                             "and end predictions are not conditioned on one another.")
    parser.add_argument("--verbose_logging", action='store_true',
                        help="If true, all of the warnings related to data processing will be printed. "
                             "A number of warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available")
    parser.add_argument('--seed', type=int, default=42, help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--version_2_with_negative', action='store_true',
                        help='If true, the SQuAD examples contain some that do not have an answer.')
    parser.add_argument('--null_score_diff_threshold', type=float, default=0.0,
                        help="If null_score - best_non_null is greater than the threshold predict null.")
    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_predict:
        raise ValueError("At least one of `do_train` or `do_predict` must be True.")
    if args.do_train and not args.train_file:
        raise ValueError("If `do_train` is True, then `train_file` must be specified.")
    if args.do_predict and not args.predict_file:
        raise ValueError("If `do_predict` is True, then `predict_file` must be specified.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = read_squad_examples(
            input_file=args.train_file, is_training=True,
            version_2_with_negative=args.version_2_with_negative)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps * args.num_train_epochs)
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    # Prepare model
    model = BertForQuestionAnswering.from_pretrained(
        args.bert_model,
        cache_dir=os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(args.local_rank)))
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex "
                              "to use distributed and fp16 training.")
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer.
    # The pooler is not used by the QA head, so its parameters would get None
    # gradients and break apex; they are filtered out here.
    param_optimizer = list(model.named_parameters())
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]

    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex "
                              "to use distributed and fp16 training.")
        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

    global_step = 0
    if args.do_train:
        cached_train_features_file = args.train_file + '_{0}_{1}_{2}_{3}'.format(
            list(filter(None, args.bert_model.split('/'))).pop(),
            str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length))
        train_features = None
        try:
            with open(cached_train_features_file, "rb") as reader:
                train_features = pickle.load(reader)
        except Exception:
            train_features = convert_examples_to_features(
                examples=train_examples,
                tokenizer=tokenizer,
                max_seq_length=args.max_seq_length,
                doc_stride=args.doc_stride,
                max_query_length=args.max_query_length,
                is_training=True)
            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
                logger.info("  Saving train features into cached file %s", cached_train_features_file)
                with open(cached_train_features_file, "wb") as writer:
                    pickle.dump(train_features, writer)
        logger.info("***** Running training *****")
        logger.info("  Num orig examples = %d", len(train_examples))
        logger.info("  Num split examples = %d", len(train_features))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long)
        all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                   all_start_positions, all_end_positions)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                if n_gpu == 1:
                    batch = tuple(t.to(device) for t in batch)  # multi-gpu does scattering itself
                input_ids, input_mask, segment_ids, start_positions, end_positions = batch
                loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # Modify learning rate with the special warmup BERT uses.
                        # If args.fp16 is False, BertAdam is used and handles this automatically.
                        lr_this_step = args.learning_rate * warmup_linear(
                            global_step / num_train_optimization_steps, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

    if args.do_train:
        # Save a trained model and the associated configuration
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        with open(output_config_file, 'w') as f:
            f.write(model_to_save.config.to_json_string())

        # Load the trained model and config that were just fine-tuned
        config = BertConfig(output_config_file)
        model = BertForQuestionAnswering(config)
        model.load_state_dict(torch.load(output_model_file))
    else:
        model = BertForQuestionAnswering.from_pretrained(args.bert_model)

    model.to(device)

    if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        eval_examples = read_squad_examples(
            input_file=args.predict_file, is_training=False,
            version_2_with_negative=args.version_2_with_negative)
        eval_features = convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=False)

        logger.info("***** Running predictions *****")
        logger.info("  Num orig examples = %d", len(eval_examples))
        logger.info("  Num split examples = %d", len(eval_features))
        logger.info("  Batch size = %d", args.predict_batch_size)

        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size)

        model.eval()
        all_results = []
        logger.info("Start evaluating")
        for input_ids, input_mask, segment_ids, example_indices in tqdm(eval_dataloader, desc="Evaluating"):
            if len(all_results) % 1000 == 0:
                logger.info("Processing example: %d" % (len(all_results)))
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            with torch.no_grad():
                batch_start_logits, batch_end_logits = model(input_ids, segment_ids, input_mask)
            for i, example_index in enumerate(example_indices):
                start_logits = batch_start_logits[i].detach().cpu().tolist()
                end_logits = batch_end_logits[i].detach().cpu().tolist()
                eval_feature = eval_features[example_index.item()]
                unique_id = int(eval_feature.unique_id)
                all_results.append(RawResult(unique_id=unique_id,
                                             start_logits=start_logits,
                                             end_logits=end_logits))

        output_prediction_file = os.path.join(args.output_dir, "predictions.json")
        output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json")
        output_null_log_odds_file = os.path.join(args.output_dir, "null_odds.json")
        write_predictions(eval_examples, eval_features, all_results,
                          args.n_best_size, args.max_answer_length,
                          args.do_lower_case, output_prediction_file,
                          output_nbest_file, output_null_log_odds_file,
                          args.verbose_logging, args.version_2_with_negative,
                          args.null_score_diff_threshold)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--paragraph", default=None, type=str)
    parser.add_argument("--model", default=None, type=str)
    parser.add_argument("--max_seq_length", default=384, type=int)
    parser.add_argument("--doc_stride", default=128, type=int)
    parser.add_argument("--max_query_length", default=64, type=int)
    parser.add_argument("--config_file", default=None, type=str)
    parser.add_argument("--max_answer_length", default=30, type=int)
    args = parser.parse_args()

    para_file = args.paragraph
    model_path = args.model

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()

    ### Loading pretrained model for QnA
    print("Loading BERT model...\n\n")
    config = BertConfig(args.config_file)
    model = BertForQuestionAnswering(config)
    model.load_state_dict(
        torch.load(model_path, map_location=torch.device("cpu")))
    model.to(device)

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

    while True:
        print("Please specify paragraph: \n "
              "1: Assisted Time Holdover \n "
              "2: Semcon short version \n "
              "3: Semcon long version")
        choice = input()
        if choice == "1":
            # option 1 keeps the file passed via --paragraph
            break
        elif choice == "2":
            para_file = "bert/input/semcon_short.txt"
            break
        elif choice == "3":
            para_file = "bert/input/semcon.txt"
            break
        else:
            print("I did not understand that, please type in 1, 2 or 3. \n")

    ### Reading paragraph
    f = open(para_file, "r")
    para = f.read()
    f.close()
    print("\nParagraph:\n", para)

    while True:
        ## input_data is a list of dictionaries, each holding a paragraph and its questions
        input_data = []
        paragraphs = {}
        paragraphs["id"] = 1
        paragraphs["text"] = para
        paragraphs["ques"] = [input("\n What is your question?\n")]
        if paragraphs["ques"] == ["exit"]:
            exit()
        start = time.time()
        input_data.append(paragraphs)

        examples = read_squad_examples(input_data)
        eval_features = convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
        )
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)

        pred_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
        # Run prediction for full data
        pred_sampler = SequentialSampler(pred_data)
        pred_dataloader = DataLoader(pred_data, sampler=pred_sampler, batch_size=9)

        predictions = []
        for input_ids, input_mask, segment_ids, example_indices in pred_dataloader:
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            with torch.no_grad():
                batch_start_logits, batch_end_logits = model(input_ids, segment_ids, input_mask)
            features = []
            all_results = []
            for i, example_index in enumerate(example_indices):
                start_logits = batch_start_logits[i].detach().cpu().tolist()
                end_logits = batch_end_logits[i].detach().cpu().tolist()
                feature = eval_features[example_index.item()]
                unique_id = int(feature.unique_id)
                features.append(feature)
                all_results.append(RawResult(unique_id=unique_id,
                                             start_logits=start_logits,
                                             end_logits=end_logits))
            output = predict(examples, features, all_results, args.max_answer_length)
            predictions.append(output)

        prediction = colored(
            predictions[math.floor(examples[0].unique_id / 12)][examples[0]],
            "green",
            attrs=["reverse"],
        )
        print(prediction, "\n")
        print("Time: ", time.time() - start)
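# A hypothetical invocation of the interactive loop above; the script name
# and file paths are placeholders:
#
#   python interactive_qa.py --paragraph bert/input/my_paragraph.txt \
#     --model bert/model/pytorch_model.bin \
#     --config_file bert/config_parameters/config.json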
def answer_prediction(paras, question, model, config_file,
                      max_seq_length=384, doc_stride=128,
                      max_query_length=64, max_answer_length=60):
    model_path = model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()

    ## input_data is a list of dictionaries, each holding a paragraph and the questions
    input_data = []
    for i, para in enumerate(paras):
        paragraphs = {}
        paragraphs['id'] = i
        paragraphs['text'] = para
        paragraphs['ques'] = question
        input_data.append(paragraphs)

    examples = read_paragraphs(input_data, question)

    tokenizer = AutoTokenizer.from_pretrained(
        'bert-large-uncased-whole-word-masking-finetuned-squad', do_lower_case=True)

    eval_features = convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        doc_stride=doc_stride,
        max_query_length=max_query_length)

    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)

    ### Loading pretrained model for QnA
    config = BertConfig(config_file)
    model = BertForQuestionAnswering(config)
    model.load_state_dict(torch.load(model_path, map_location='cpu'))
    model.to(device)

    pred_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
    # Run prediction for full data
    pred_sampler = SequentialSampler(pred_data)
    pred_dataloader = DataLoader(pred_data, sampler=pred_sampler, batch_size=10)

    predictions = []
    for input_ids, input_mask, segment_ids, example_indices in tqdm(pred_dataloader):
        model.eval()
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        with torch.no_grad():
            batch_start_logits, batch_end_logits = model(input_ids, segment_ids, input_mask)
        features = []
        all_results = []
        for i, example_index in enumerate(example_indices):
            start_logits = batch_start_logits[i].detach().cpu().tolist()
            end_logits = batch_end_logits[i].detach().cpu().tolist()
            feature = eval_features[example_index.item()]
            unique_id = int(feature.unique_id)
            features.append(feature)
            all_results.append(RawResult(unique_id=unique_id,
                                         start_logits=start_logits,
                                         end_logits=end_logits))
        output = predict(examples, features, all_results, max_answer_length)
        predictions.append(output)

    ### Collecting the results ###
    final_preds = []
    final_paras = []
    final_probs = []
    final_ques = []
    index = None
    for i, example in enumerate(examples):
        if index != example.example_id:
            index = example.example_id
            ques_text = example.question_text  # fixed: was commented out, leaving ques_text undefined
            prediction = predictions[math.floor(example.unique_id / 12)][example]
            prob = predictions[math.floor(example.unique_id / 12)]['prob' + str(example)]
            final_ques.append(ques_text)
            final_preds.append(prediction)
            final_paras.append(example.para_text)
            final_probs.append(prob)
    return final_ques, final_preds, final_paras, final_probs
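# A minimal usage sketch for answer_prediction(); the paragraph, question,
# and file paths are placeholders, and the helper name is hypothetical.
def _demo_answer_prediction():
    questions, answers, paras, probs = answer_prediction(
        paras=["BERT was released by Google AI in 2018."],
        question=["Who released BERT?"],
        model="output/pytorch_model.bin",
        config_file="output/config.json")
    for q, a, p in zip(questions, answers, probs):
        print("%s -> %s (prob %.3f)" % (q, a, p))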
def start():
    app = Flask(__name__)
    host = "0.0.0.0"
    port = 8000
    debug = True

    parser = argparse.ArgumentParser()
    parser.add_argument("--max_query_length", default=64, type=int,
                        help="The maximum number of tokens for the question. Questions longer than this will "
                             "be truncated to this length.")
    parser.add_argument("--predict_batch_size", default=8, type=int,
                        help="Total batch size for predictions.")
    parser.add_argument("--verbose_logging", action='store_true',
                        help="If true, all of the warnings related to data processing will be printed. "
                             "A number of warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--seed', type=int, default=42, help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--null_score_diff_threshold', type=float, default=0.0,
                        help="If null_score - best_non_null is greater than the threshold predict null.")
    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
    config = BertConfig("./output/config.json")
    model = BertForQuestionAnswering(config)
    model.load_state_dict(
        torch.load("./output/pytorch_model.bin", map_location='cpu'))
    model.to(device)

    @app.route('/', methods=['POST'])
    def filter():
        # Hard-coded sample request; a live deployment would presumably read
        # the payload via request.get_json() instead (see the commented line below).
        dat_in = {
            "index": 2,
            "original_sentence": "existing image captioning models do not generalize well to out-of-domain images containing novel scenes or objects . this limitation severely hinders the use of these models in real world applications dealing with images in the wild . we address this problem using a flexible approach that enables existing deep captioning architectures to take advantage of image taggers at test time , without re-training . our method uses constrained beam search to force the inclusion of selected tag words in the output , and fixed , pretrained word embeddings to facilitate vocabulary expansion to previously unseen tag words . using this approach we achieve state of the art results for out-of-domain captioning on mscoco -LRB- and improved results for in-domain captioning -RRB- . perhaps surprisingly , our results significantly outperform approaches that incorporate the same tag predictions into the learning algorithm . we also show that we can significantly improve the quality of generated imagenet captions by leveraging ground-truth labels . ",
            "tagged_sentence": "existing│O_ANS image│O_ANS captioning│O_ANS models│O_ANS do│O_ANS not│O_ANS generalize│O_ANS well│O_ANS to│O_ANS out-of-domain│O_ANS images│O_ANS containing│O_ANS novel│O_ANS scenes│O_ANS or│O_ANS objects│O_ANS .│O_ANS this│O_ANS limitation│O_ANS severely│O_ANS hinders│O_ANS the│O_ANS use│O_ANS of│O_ANS these│O_ANS models│O_ANS in│O_ANS real│O_ANS world│O_ANS applications│O_ANS dealing│O_ANS with│O_ANS images│O_ANS in│O_ANS the│O_ANS wild│O_ANS .│O_ANS we│O_ANS address│O_ANS this│O_ANS problem│O_ANS using│O_ANS a│O_ANS flexible│O_ANS approach│O_ANS that│O_ANS enables│O_ANS existing│O_ANS deep│O_ANS captioning│O_ANS architectures│O_ANS to│O_ANS take│O_ANS advantage│O_ANS of│O_ANS image│O_ANS taggers│O_ANS at│O_ANS test│O_ANS time│O_ANS ,│O_ANS without│O_ANS re-training│O_ANS .│O_ANS our│O_ANS method│O_ANS uses│O_ANS constrained│O_ANS beam│O_ANS search│O_ANS to│O_ANS force│O_ANS the│O_ANS inclusion│O_ANS of│O_ANS selected│O_ANS tag│O_ANS words│O_ANS in│O_ANS the│O_ANS output│O_ANS ,│O_ANS and│O_ANS fixed│O_ANS ,│O_ANS pretrained│O_ANS word│B_ANS embeddings│I_ANS to│O_ANS facilitate│O_ANS vocabulary│O_ANS expansion│O_ANS to│O_ANS previously│O_ANS unseen│O_ANS tag│O_ANS words│O_ANS .│O_ANS using│O_ANS this│O_ANS approach│O_ANS we│O_ANS achieve│O_ANS state│O_ANS of│O_ANS the│O_ANS art│O_ANS results│O_ANS for│O_ANS out-of-domain│O_ANS captioning│O_ANS on│O_ANS mscoco│O_ANS -LRB-│O_ANS and│O_ANS improved│O_ANS results│O_ANS for│O_ANS in-domain│O_ANS captioning│O_ANS -RRB-│O_ANS .│O_ANS perhaps│O_ANS surprisingly│O_ANS ,│O_ANS our│O_ANS results│O_ANS significantly│O_ANS outperform│O_ANS approaches│O_ANS that│O_ANS incorporate│O_ANS the│O_ANS same│O_ANS tag│O_ANS predictions│O_ANS into│O_ANS the│O_ANS learning│O_ANS algorithm│O_ANS .│O_ANS we│O_ANS also│O_ANS show│O_ANS that│O_ANS we│O_ANS can│O_ANS significantly│O_ANS improve│O_ANS the│O_ANS quality│O_ANS of│O_ANS generated│O_ANS imagenet│O_ANS captions│O_ANS by│O_ANS leveraging│O_ANS ground-truth│O_ANS labels│O_ANS .│O_ANS ",
            "answer": "word embeddings",
            "question": [
                "What does pretrained stand for ?",
                "What is pretrained ?",
                "What does re-training stand for ?",
                "What is the pretrained ?",
                "What is the term for pretrained ?"
            ],
            "score": [
                -2.3564553260803223,
                -3.8269970417022705,
                -4.229936122894287,
                -5.298074722290039,
                -5.689377307891846
            ]
        }

        eval_examples = read_squad_examples(input_data=dat_in,
                                            is_training=False,
                                            version_2_with_negative=True)
        eval_features = convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=384,
            doc_stride=128,
            max_query_length=args.max_query_length,
            is_training=False)

        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size)

        model.eval()
        all_results = []
        logger.info("Start evaluating")
        for input_ids, input_mask, segment_ids, example_indices in tqdm(
                eval_dataloader, desc="Evaluating",
                disable=args.local_rank not in [-1, 0]):
            if len(all_results) % 1000 == 0:
                logger.info("Processing example: %d" % (len(all_results)))
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            with torch.no_grad():
                batch_start_logits, batch_end_logits = model(input_ids, segment_ids, input_mask)
            for i, example_index in enumerate(example_indices):
                start_logits = batch_start_logits[i].detach().cpu().tolist()
                end_logits = batch_end_logits[i].detach().cpu().tolist()
                eval_feature = eval_features[example_index.item()]
                unique_id = int(eval_feature.unique_id)
                all_results.append(RawResult(unique_id=unique_id,
                                             start_logits=start_logits,
                                             end_logits=end_logits))

        # Note: this repo's write_predictions variant apparently returns the
        # result instead of writing prediction files (the output paths are absent).
        result = write_predictions(eval_examples, eval_features, all_results,
                                   20, 30, True,
                                   args.verbose_logging, True,
                                   args.null_score_diff_threshold)
        # inputs = request.get_json(force=True)
        return result

    app.run(debug=debug, host=host, port=port, use_reloader=False, threaded=True)
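# The service above answers POST requests on port 8000. With the hard-coded
# sample payload still in place, any POST returns its predictions, e.g.
# (shell command shown as a comment):
#
#   curl -X POST http://localhost:8000/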
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
                             "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model checkpoints and predictions will be written.")

    ## Other parameters
    parser.add_argument("--model", default=None, type=str)
    parser.add_argument("--train_file", default=None, type=str,
                        help="SQuAD json for training. E.g., train-v1.1.json")
    parser.add_argument("--max_seq_length", default=384, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
                             "longer than this will be truncated, and sequences shorter than this will be padded.")
    parser.add_argument("--doc_stride", default=128, type=int,
                        help="When splitting up a long document into chunks, how much stride to take between chunks.")
    parser.add_argument("--max_query_length", default=64, type=int,
                        help="The maximum number of tokens for the question. Questions longer than this will "
                             "be truncated to this length.")
    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
    parser.add_argument("--do_predict", action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.")
    parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.")
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% "
                             "of training.")
    parser.add_argument("--n_best_size", default=20, type=int,
                        help="The total number of n-best predictions to generate in the nbest_predictions.json "
                             "output file.")
    parser.add_argument("--max_answer_length", default=30, type=int,
                        help="The maximum length of an answer that can be generated. This is needed because the start "
                             "and end predictions are not conditioned on one another.")
    parser.add_argument("--verbose_logging", action='store_true',
                        help="If true, all of the warnings related to data processing will be printed. "
                             "A number of warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available")
    parser.add_argument('--seed', type=int, default=42, help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--version_2_with_negative', action='store_true',
                        help='If true, the SQuAD examples contain some that do not have an answer.')
    parser.add_argument('--null_score_diff_threshold', type=float, default=0.0,
                        help="If null_score - best_non_null is greater than the threshold predict null.")
    parser.add_argument("--config_file", default=None, type=str)
    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    config = BertConfig(args.config_file)
    model = BertForQuestionAnswering(config)
    model.load_state_dict(torch.load(args.model, map_location='cpu'))
    model.to(device)

    if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # NOTE: this repo's read_squad_examples variant takes no input file
        # here; presumably it reads its input from a fixed location.
        eval_examples = read_squad_examples(
            is_training=False,
            version_2_with_negative=args.version_2_with_negative)
        eval_features = convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=False)

        logger.info("***** Running predictions *****")
        logger.info("  Num orig examples = %d", len(eval_examples))
        logger.info("  Num split examples = %d", len(eval_features))
        logger.info("  Batch size = %d", args.predict_batch_size)

        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size)

        model.eval()
        all_results = []
        logger.info("Start evaluating")
        for input_ids, input_mask, segment_ids, example_indices in tqdm(
                eval_dataloader, desc="Evaluating",
                disable=args.local_rank not in [-1, 0]):
            if len(all_results) % 1000 == 0:
                logger.info("Processing example: %d" % (len(all_results)))
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            with torch.no_grad():
                batch_start_logits, batch_end_logits = model(input_ids, segment_ids, input_mask)
            for i, example_index in enumerate(example_indices):
                start_logits = batch_start_logits[i].detach().cpu().tolist()
                end_logits = batch_end_logits[i].detach().cpu().tolist()
                eval_feature = eval_features[example_index.item()]
                unique_id = int(eval_feature.unique_id)
                all_results.append(RawResult(unique_id=unique_id,
                                             start_logits=start_logits,
                                             end_logits=end_logits))

        output_prediction_file = os.path.join(args.output_dir, "predictions.json")
        output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json")
        output_null_log_odds_file = os.path.join(args.output_dir, "null_odds.json")
        write_predictions(eval_examples, eval_features, all_results,
                          args.n_best_size, args.max_answer_length,
                          args.do_lower_case, output_prediction_file,
                          output_nbest_file, output_null_log_odds_file,
                          args.verbose_logging, args.version_2_with_negative,
                          args.null_score_diff_threshold)
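# A hypothetical invocation of the prediction-only script above; the script
# name and paths are placeholders, the flags come from the parser:
#
#   python run_predict.py --bert_model bert-base-uncased --do_predict \
#     --do_lower_case --model output/pytorch_model.bin \
#     --config_file output/config.json --output_dir results/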
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--paragraph", default=None, type=str)
    parser.add_argument("--model", default=None, type=str)
    parser.add_argument("--max_seq_length", default=384, type=int)
    parser.add_argument("--doc_stride", default=128, type=int)
    parser.add_argument("--max_query_length", default=64, type=int)
    parser.add_argument("--config_file", default=None, type=str)
    parser.add_argument("--max_answer_length", default=30, type=int)
    args = parser.parse_args()

    para_file = args.paragraph
    model_path = args.model

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)
    n_gpu = torch.cuda.device_count()

    ### Reading paragraph
    f = open(para_file, 'r')
    para = f.read()
    f.close()

    ## input_data is a list of dictionaries, each holding a paragraph and its questions
    para_list = para.split('\n\n')
    input_data = []
    i = 1
    for para in para_list:
        paragraphs = {}
        splits = para.split('\nQuestions:')
        paragraphs['id'] = i
        paragraphs['text'] = splits[0].replace('Paragraph:', '').strip('\n')
        paragraphs['ques'] = splits[1].lstrip('\n').split('\n')
        input_data.append(paragraphs)
        i += 1

    examples = read_squad_examples(input_data)

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

    eval_features = convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length,
        doc_stride=args.doc_stride,
        max_query_length=args.max_query_length)

    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)

    ### Loading pretrained model for QnA
    config = BertConfig(args.config_file)
    model = BertForQuestionAnswering(config)
    model.load_state_dict(
        torch.load(model_path, map_location=torch.device('cpu')))
    model.to(device)

    pred_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
    # Run prediction for full data
    pred_sampler = SequentialSampler(pred_data)
    pred_dataloader = DataLoader(pred_data, sampler=pred_sampler, batch_size=9)

    predictions = []
    for input_ids, input_mask, segment_ids, example_indices in tqdm(pred_dataloader):
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        with torch.no_grad():
            batch_start_logits, batch_end_logits = model(input_ids, segment_ids, input_mask)
        features = []
        all_results = []
        for i, example_index in enumerate(example_indices):
            start_logits = batch_start_logits[i].detach().cpu().tolist()
            end_logits = batch_end_logits[i].detach().cpu().tolist()
            feature = eval_features[example_index.item()]
            unique_id = int(feature.unique_id)
            features.append(feature)
            all_results.append(RawResult(unique_id=unique_id,
                                         start_logits=start_logits,
                                         end_logits=end_logits))
        output = predict(examples, features, all_results, args.max_answer_length)
        predictions.append(output)

    ### Printing the results ###
    index = None
    for example in examples:
        if index != example.example_id:
            print(example.para_text)
            index = example.example_id
            print('\n')
            print(colored('***********Question and Answers *************', 'red'))
        ques_text = colored(example.question_text, 'blue')
        print(ques_text)
        prediction = colored(predictions[math.floor(example.unique_id / 12)][example],
                             'green', attrs=['reverse', 'blink'])
        print(prediction)
        print('\n')
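# The paragraph file is expected to contain blocks separated by blank lines,
# each with a paragraph followed by its questions; a sketch of the format the
# parsing code above assumes:
#
#   Paragraph:
#   <paragraph text>
#   Questions:
#   <question 1>
#   <question 2>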
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--paragraph", default=None, type=str)
    parser.add_argument("--question", default=None, type=str)
    parser.add_argument("--model", default=None, type=str)
    parser.add_argument("--max_seq_length", default=384, type=int)
    parser.add_argument("--doc_stride", default=128, type=int)
    parser.add_argument("--max_query_length", default=64, type=int)
    parser.add_argument("--config_file", default=None, type=str)
    parser.add_argument("--max_answer_length", default=30, type=int)
    args = parser.parse_args()

    para_file = args.paragraph
    question_file = args.question
    model_path = args.model

    device = torch.device("cpu")

    ### Reading paragraph
    f = open(para_file, "rb")
    para = f.read()
    para = para.decode('windows-1252')
    para = para.strip("\n").replace("\r", " ").replace("\n", "")
    f.close()

    ### Reading questions (one per line; blank lines are dropped)
    f_ = open(question_file, "r")
    question = f_.read()
    f_.close()
    question = question.split("\n")
    while "" in question:
        question.remove("")
    # fixed: the original loop reassigned only the loop variable, so the strip had no effect
    question = [q.strip("\n") for q in question]

    ## input_data is a list of dictionaries, each holding a paragraph and its question;
    ## ParaFinder picks the paragraph closest to each question
    input_data = []
    pfinder = ParaFinder(para)
    for i, q in enumerate(question):
        closest_para = pfinder.closestParagraph(q)
        paragraphs = {}
        paragraphs["id"] = i
        paragraphs["text"] = closest_para
        paragraphs["ques"] = [q]
        input_data.append(paragraphs)

    examples = read_squad_examples(input_data)

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

    eval_features = convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length,
        doc_stride=args.doc_stride,
        max_query_length=args.max_query_length)

    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)

    ### Loading pretrained model for QnA
    config = BertConfig(args.config_file)
    model = BertForQuestionAnswering(config)
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.to(device)

    pred_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
    # Run prediction for full data
    pred_sampler = SequentialSampler(pred_data)
    pred_dataloader = DataLoader(pred_data, sampler=pred_sampler, batch_size=9)

    predictions = []
    for input_ids, input_mask, segment_ids, example_indices in pred_dataloader:
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        with torch.no_grad():
            batch_start_logits, batch_end_logits = model(input_ids, segment_ids, input_mask)
        features = []
        all_results = []
        for i, example_index in enumerate(example_indices):
            start_logits = batch_start_logits[i].detach().cpu().tolist()
            end_logits = batch_end_logits[i].detach().cpu().tolist()
            feature = eval_features[example_index.item()]
            unique_id = int(feature.unique_id)
            features.append(feature)
            all_results.append(RawResult(unique_id=unique_id,
                                         start_logits=start_logits,
                                         end_logits=end_logits))
        output = predict(examples, features, all_results, args.max_answer_length)
        predictions.append(output)

    ### Printing the results ###
    index = None
    for example in examples:
        if index != example.example_id:
            # print(example.para_text)
            index = example.example_id
        ques_text = example.question_text
        print(ques_text)
        prediction, prob = predictions[math.floor(example.unique_id / 12)][example]
        # Only show answers the model is reasonably confident about
        if prob > 0.35:
            print(prediction)
        else:
            print("No result found")
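# A hypothetical invocation of the file-driven QA script above; the script
# name and file paths are placeholders:
#
#   python qa_from_file.py --paragraph document.txt --question questions.txt \
#     --model output/pytorch_model.bin --config_file output/config.json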
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--bert_token_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
                             "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--model_dir", default=None, type=str, required=True,
                        help="Path where the trained model is stored.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model checkpoints and predictions will be written.")

    ## Other parameters
    parser.add_argument("--predict_file", default=None, type=str,
                        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
    parser.add_argument("--max_seq_length", default=384, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
                             "longer than this will be truncated, and sequences shorter than this will be padded.")
    parser.add_argument("--doc_stride", default=128, type=int,
                        help="When splitting up a long document into chunks, how much stride to take between chunks.")
    parser.add_argument("--max_query_length", default=64, type=int,
                        help="The maximum number of tokens for the question. Questions longer than this will "
                             "be truncated to this length.")
    parser.add_argument("--do_predict", action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.")
    parser.add_argument("--n_best_size", default=20, type=int,
                        help="The total number of n-best predictions to generate in the nbest_predictions.json "
                             "output file.")
    parser.add_argument("--max_answer_length", default=30, type=int,
                        help="The maximum length of an answer that can be generated. This is needed because the start "
                             "and end predictions are not conditioned on one another.")
    parser.add_argument("--verbose_logging", action='store_true',
                        help="If true, all of the warnings related to data processing will be printed. "
                             "A number of warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available")
    parser.add_argument('--seed', type=int, default=42, help="random seed for initialization")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--version_2_with_negative', action='store_true',
                        help='If true, the SQuAD examples contain some that do not have an answer.')
    parser.add_argument('--null_score_diff_threshold', type=float, default=0.0,
                        help="If null_score - best_non_null is greater than the threshold predict null.")
    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
    args = parser.parse_args()
    # Example invocation:
    # python run_triviaqa.py --bert_token_model bert-base-uncased --model_dir bert_triviaQA/ \
    #   --output_dir result/ --predict_file dev-wiki-triviaqa_m.json --no_cuda --do_lower_case \
    #   --predict_batch_size 40
    print(args)

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')

    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if args.do_predict and not args.predict_file:
        raise ValueError("If `do_predict` is True, then `predict_file` must be specified.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    tokenizer = BertTokenizer.from_pretrained(args.bert_token_model, do_lower_case=args.do_lower_case)

    # Load the pretrained model
    config_path = os.path.join(args.model_dir, CONFIG_NAME)
    model_path = os.path.join(args.model_dir, WEIGHTS_NAME)
    config = BertConfig(config_path)
    model = BertForQuestionAnswering(config)
    model.load_state_dict(torch.load(model_path, map_location='cpu'))
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex "
                              "to use distributed and fp16 training.")
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        eval_examples = read_squad_examples(
            input_file=args.predict_file, is_training=False,
            version_2_with_negative=args.version_2_with_negative)
        eval_features = convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=False)

        logger.info("***** Running predictions *****")
        logger.info("  Num orig examples = %d", len(eval_examples))
        logger.info("  Num split examples = %d", len(eval_features))
        logger.info("  Batch size = %d", args.predict_batch_size)

        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size)

        model.eval()
        all_results = []
        logger.info("Start evaluating")
        for input_ids, input_mask, segment_ids, example_indices in tqdm(
                eval_dataloader, desc="Evaluating",
                disable=args.local_rank not in [-1, 0]):
            if len(all_results) % 1000 == 0:
                logger.info("Processing example: %d" % (len(all_results)))
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            with torch.no_grad():
                batch_start_logits, batch_end_logits = model(input_ids, segment_ids, input_mask)
            for i, example_index in enumerate(example_indices):
                start_logits = batch_start_logits[i].detach().cpu().tolist()
                end_logits = batch_end_logits[i].detach().cpu().tolist()
                eval_feature = eval_features[example_index.item()]
                unique_id = int(eval_feature.unique_id)
                all_results.append(RawResult(unique_id=unique_id,
                                             start_logits=start_logits,
                                             end_logits=end_logits))

        output_prediction_file = os.path.join(args.output_dir, "predictions.json")
        output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json")
        output_null_log_odds_file = os.path.join(args.output_dir, "null_odds.json")
        write_predictions(eval_examples, eval_features, all_results,
                          args.n_best_size, args.max_answer_length,
                          args.do_lower_case, output_prediction_file,
                          output_nbest_file, output_null_log_odds_file,
                          args.verbose_logging, args.version_2_with_negative,
                          args.null_score_diff_threshold)
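# After a run, the top prediction per question can be loaded back for a quick
# look (a sketch; "result/" stands in for whatever --output_dir was set to,
# and the helper name is hypothetical):
def _peek_predictions(path="result/predictions.json"):
    import json
    with open(path) as f:
        preds = json.load(f)  # maps each question id to its best answer text
    for qid, answer in list(preds.items())[:5]:
        print(qid, "->", answer)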
def __init__(self):
    # Hyperparameters
    self.BERT_MODEL = "bert-base-uncased"
    self.OUTPUT_DIR = "bert-model"
    self.TRAIN_FILE = ""
    self.PREDICT_FILE = "squad/test-pred.json"
    self.MAX_SEQ_LENGTH = 384
    self.DOC_STRIDE = 128
    self.MAX_QUERY_LENGTH = 64
    self.DO_TRAIN = False
    self.DO_PREDICT = True
    self.TRAIN_BATCH_SIZE = 12
    self.PREDICT_BATCH_SIZE = 8
    self.LEARNING_RATE = 3e-5
    self.NUM_TRAIN_EPOCHS = 2.0
    self.WARMUP_PROPORTION = 0.1
    self.N_BEST_SIZE = 20
    self.MAX_ANSWER_LENGTH = 30
    self.VERBOSE_LOGGING = False
    self.NO_CUDA = False
    self.SEED = 42
    self.GRADIENT_ACCUMULATION_STEPS = 1
    self.DO_LOWER_CASE = True
    self.LOCAL_RANK = -1
    self.FP16 = False
    self.LOSS_SCALE = 0
    self.VERSION_2_WITH_NEGATIVE = True
    self.NULL_SCORE_DIFF_THRESHOLD = 0.0

    if self.LOCAL_RANK == -1 or self.NO_CUDA:
        self.device = torch.device("cuda" if torch.cuda.is_available() and not self.NO_CUDA else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(self.LOCAL_RANK)
        self.device = torch.device("cuda", self.LOCAL_RANK)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        self.device, n_gpu, bool(self.LOCAL_RANK != -1), self.FP16))

    if self.GRADIENT_ACCUMULATION_STEPS < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            self.GRADIENT_ACCUMULATION_STEPS))
    self.TRAIN_BATCH_SIZE = self.TRAIN_BATCH_SIZE // self.GRADIENT_ACCUMULATION_STEPS

    random.seed(self.SEED)
    np.random.seed(self.SEED)
    torch.manual_seed(self.SEED)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(self.SEED)

    if not self.DO_TRAIN and not self.DO_PREDICT:
        raise ValueError("At least one of `do_train` or `do_predict` must be True.")
    if self.DO_TRAIN and not self.TRAIN_FILE:
        raise ValueError("If `do_train` is True, then `train_file` must be specified.")
    if self.DO_PREDICT and not self.PREDICT_FILE:
        raise ValueError("If `do_predict` is True, then `predict_file` must be specified.")

    if os.path.exists(self.OUTPUT_DIR) and os.listdir(self.OUTPUT_DIR) and self.DO_TRAIN:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(self.OUTPUT_DIR))
    if not os.path.exists(self.OUTPUT_DIR):
        os.makedirs(self.OUTPUT_DIR)

    self.tokenizer = BertTokenizer.from_pretrained(self.BERT_MODEL, do_lower_case=self.DO_LOWER_CASE)

    train_examples = None
    num_train_optimization_steps = None

    # Prepare model
    self.model = BertForQuestionAnswering.from_pretrained(
        self.BERT_MODEL,
        cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE),
                               'distributed_{}'.format(self.LOCAL_RANK)))
    if self.FP16:
        self.model.half()
    self.model.to(self.device)
    if self.LOCAL_RANK != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex "
                              "to use distributed and fp16 training.")
        self.model = DDP(self.model)
    elif n_gpu > 1:
        self.model = torch.nn.DataParallel(self.model)

    # Prepare optimizer.
    # The pooler is not used by the QA head, so its parameters would get None
    # gradients and break apex; they are filtered out here.
    param_optimizer = list(self.model.named_parameters())
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    if self.FP16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex "
                              "to use distributed and fp16 training.")
        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=self.LEARNING_RATE,
                              bias_correction=False,
                              max_grad_norm=1.0)
        # fixed: FP16_Optimizer comes from apex, not from self
        if self.LOSS_SCALE == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=self.LOSS_SCALE)
    else:
        # Only relevant when training; at pure prediction time this optimizer is never stepped.
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=self.LEARNING_RATE,
                             warmup=self.WARMUP_PROPORTION,
                             t_total=num_train_optimization_steps)

    # Load a trained model and config that you have fine-tuned
    output_model_file = os.path.join(self.OUTPUT_DIR, WEIGHTS_NAME)
    output_config_file = os.path.join(self.OUTPUT_DIR, CONFIG_NAME)
    config = BertConfig(output_config_file)
    self.model = BertForQuestionAnswering(config)
    if torch.cuda.is_available():
        self.model.load_state_dict(torch.load(output_model_file))
    else:
        self.model.load_state_dict(
            torch.load(output_model_file, map_location='cpu'))
    self.model.to(self.device)
    print('\n*** QA MODULE READY [1/3] ***\n')
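# Construction alone loads the fine-tuned weights from OUTPUT_DIR, so a
# usage sketch looks like the following (QAModule is a placeholder for
# whatever class owns the __init__ above):
#
#   qa = QAModule()
#   qa.model.eval()  # ready for prediction with qa.tokenizer / qa.model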
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--input_file", default=None, type=str, required=True)
    parser.add_argument("--output_file", default=None, type=str, required=True)
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")

    ## Other parameters
    parser.add_argument("--pretrained_squad_model", default=None, type=str)
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--layers", default="-1,-2,-3,-4", type=str)
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. Sequences longer "
                             "than this will be truncated, and sequences shorter than this will be padded.")
    parser.add_argument("--batch_size", default=32, type=int, help="Batch size for predictions.")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Whether not to use CUDA when available")
    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {} distributed training: {}".format(
        device, n_gpu, bool(args.local_rank != -1)))

    layer_indexes = [int(x) for x in args.layers.split(",")]

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    examples = read_examples(args.input_file)
    features = convert_examples_to_features(
        examples=examples, seq_length=args.max_seq_length, tokenizer=tokenizer)

    unique_id_to_feature = {}
    for feature in features:
        unique_id_to_feature[feature.unique_id] = feature

    if args.pretrained_squad_model:
        input_config_file = os.path.join(args.pretrained_squad_model, CONFIG_NAME)
        input_model_file = os.path.join(args.pretrained_squad_model, WEIGHTS_NAME)
        config = BertConfig(input_config_file)
        qa_model = BertForQuestionAnswering(config)
        qa_model.load_state_dict(torch.load(input_model_file, map_location=device))
        model = qa_model.bert  # the encoder we will use for extracting features
    else:
        model = BertModel.from_pretrained(args.bert_model)
    model.to(device)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model,
                                                          device_ids=[args.local_rank],
                                                          output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)

    eval_data = TensorDataset(all_input_ids, all_input_mask, all_example_index)
    if args.local_rank == -1:
        eval_sampler = SequentialSampler(eval_data)
    else:
        eval_sampler = DistributedSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size)

    model.eval()
    with open(args.output_file, "w", encoding='utf-8') as writer:
        for input_ids, input_mask, example_indices in eval_dataloader:
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)

            all_encoder_layers, _ = model(input_ids, token_type_ids=None, attention_mask=input_mask)

            for b, example_index in enumerate(example_indices):
                feature = features[example_index.item()]
                unique_id = int(feature.unique_id)
                # feature = unique_id_to_feature[unique_id]
                output_json = collections.OrderedDict()
                output_json["linex_index"] = unique_id
                all_out_features = []
                for (i, token) in enumerate(feature.tokens):
                    all_layers = []
                    for (j, layer_index) in enumerate(layer_indexes):
                        layer_output = all_encoder_layers[int(layer_index)].detach().cpu().numpy()
                        layer_output = layer_output[b]
                        layers = collections.OrderedDict()
                        layers["index"] = layer_index
                        layers["values"] = [round(x.item(), 6) for x in layer_output[i]]
                        all_layers.append(layers)
                    out_features = collections.OrderedDict()
                    out_features["token"] = token
                    out_features["layers"] = all_layers
                    all_out_features.append(out_features)
                output_json["features"] = all_out_features
                writer.write(json.dumps(output_json) + "\n")
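# Each line of the output file is one JSON object with per-token, per-layer
# hidden states; schematically (a sketch, values abbreviated):
#
#   {"linex_index": 0,
#    "features": [{"token": "[CLS]",
#                  "layers": [{"index": -1, "values": [0.123456, ...]},
#                             {"index": -2, "values": [...]}]},
#                 ...]}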
# NOTE: `device` is used below but was not defined in this snippet; the line
# here follows the pattern used throughout the rest of the repository.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

RawResult = collections.namedtuple("RawResult", ["unique_id", "start_logits", "end_logits"])

# para_file = "../Input_file.txt"
para_file = "/content/drive/My Drive/train-v2.0.json"  # TODO: use proper file path
model_path = "/content/drive/My Drive/pytorch_model.bin"  # TODO: use proper file path

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

### Loading pretrained model for QnA
config = BertConfig("../Results/bert_config.json")
model = BertForQuestionAnswering(config)
model.load_state_dict(torch.load(model_path, map_location='cpu'))
# model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')
model.to(device)
print()

### Initializing the autoencoder
hidden_size = 384
encoder1 = EncoderRNN(384, config.hidden_size, hidden_size).to(device)
decoder1 = DecoderRNN(384, config.hidden_size, hidden_size).to(device)
encoder_optimizer = optim.Adam(encoder1.parameters())
decoder_optimizer = optim.Adam(decoder1.parameters())
criterion = nn.MSELoss()

pp = pprint.PrettyPrinter(indent=4)