def load_and_predict(data_dir, model_type, pretrain_model):
    """Load a fine-tuned QA checkpoint and run it over the test set.

    Args:
        data_dir: Directory handed to ``TestData`` and to ``func``.
        model_type: One of ``'bert_japanese'``, ``'bert_multilingual'`` or
            ``'albert'``; selects the architecture and its tokenizer.
        pretrain_model: Path of the state-dict file given to ``torch.load``.

    Returns:
        The prediction dictionary produced by ``predict`` and then
        post-processed by ``func``.

    Raises:
        ValueError: If ``model_type`` is not one of the supported names.
    """
    if model_type == 'bert_japanese':
        model = BertForQuestionAnswering.from_pretrained(
            'cl-tohoku/bert-base-japanese')
        tokenizer = BertJapaneseTokenizer.from_pretrained(
            'cl-tohoku/bert-base-japanese')
    elif model_type == 'bert_multilingual':
        model = BertForQuestionAnswering.from_pretrained(
            'bert-base-multilingual-cased')
        tokenizer = BertTokenizer.from_pretrained(
            'bert-base-multilingual-cased', tokenize_chinese_chars=False)
    elif model_type == 'albert':
        model = AlbertForQuestionAnswering.from_pretrained(
            'ALINEAR/albert-japanese-v2')
        tokenizer = AlbertTokenizer.from_pretrained(
            'ALINEAR/albert-japanese-v2')
    else:
        # Without this branch an unknown type only failed much later, with an
        # unbound-variable error on `tokenizer`/`model`.
        raise ValueError(
            "unsupported model_type {!r}; expected 'bert_japanese', "
            "'bert_multilingual' or 'albert'".format(model_type))

    test_data = TestData(data_dir, TAG)
    testset = QADataset(test_data.examples, "test", tokenizer=tokenizer)
    testloader = DataLoader(testset, batch_size=4, collate_fn=collate_fn)

    model = model.to(device)
    # map_location lets a checkpoint saved on another device be restored here.
    model.load_state_dict(torch.load(pretrain_model, map_location=device))

    prediction = predict(model, testloader, device, tokenizer)
    prediction = func(data_dir, prediction)
    print('finish loading and predicting from {}!'.format(pretrain_model))
    return prediction  # prediction dictionary
def task_3():
    """Task 3: sentence-pair prediction demo (extractive question answering).

    Encodes one question/context pair, runs it through a
    ``BertForQuestionAnswering`` built from the config at ``bert_path`` and
    prints the tokens plus the predicted answer span. The model is built from
    the config only (not ``from_pretrained``) and is not fine-tuned, so the
    printed answer varies between runs.
    """
    question, text = "里昂是谁", "里昂是一个杀手"
    sample = (question, text)
    tokenizer = BertTokenizer.from_pretrained(bert_path)
    # A sentence pair is passed as List[Tuple[str, str]].
    sen_code = tokenizer.batch_encode_plus([sample])
    tokens_tensor = torch.tensor(sen_code["input_ids"])
    segments_tensor = torch.tensor(sen_code["token_type_ids"])

    model_config = BertConfig.from_pretrained(bert_path)
    # model_config.num_labels = 2  # two outputs: start position and end position
    # One way to load:
    #   model = BertForQuestionAnswering.from_pretrained(bert_path)
    # The other way to load (used here):
    model = BertForQuestionAnswering(model_config)
    model.eval()

    # Pass the segment ids by keyword: positionally, the second argument of
    # the model's forward() is the attention mask, not the token type ids.
    with torch.no_grad():
        outputs = model(tokens_tensor, token_type_ids=segments_tensor)
    start_pos, end_pos = outputs.start_logits, outputs.end_logits

    best_starts = start_pos.argmax(axis=1).tolist()
    best_ends = end_pos.argmax(axis=1).tolist()
    for idx, (start, end) in enumerate(zip(best_starts, best_ends)):
        # Decode the ids back into the original tokens.
        all_tokens = tokenizer.convert_ids_to_tokens(
            sen_code["input_ids"][idx])
        # e.g. ['[CLS]', '里', '昂', '是', '谁', '[SEP]', '里', '昂', '是',
        #       '一', '个', '杀', '手', '[SEP]']
        print(all_tokens)
        if start <= end:
            # Decode the predicted answer span.
            answer = " ".join(all_tokens[start:end + 1])
            # The result differs on every run because the model has not been
            # fine-tuned; one possible output is: 一 个 杀 手 [SEP]
            print(answer)
        else:
            print("预测的有问题哦!")
def __init__(self, args):
    """Build the BERT QA model, load its weights and construct the SUT/QSL.

    Args:
        args: Object whose ``max_examples`` attribute is forwarded to
            ``get_squad_QSL``.
    """
    print("Loading BERT configs...")
    with open("bert_config.json") as config_file:
        config_json = json.load(config_file)

    # Only these entries of the JSON file are forwarded to BertConfig.
    forwarded_keys = (
        "attention_probs_dropout_prob",
        "hidden_act",
        "hidden_dropout_prob",
        "hidden_size",
        "initializer_range",
        "intermediate_size",
        "max_position_embeddings",
        "num_attention_heads",
        "num_hidden_layers",
        "type_vocab_size",
        "vocab_size",
    )
    config = BertConfig(**{key: config_json[key] for key in forwarded_keys})

    print("Loading PyTorch model...")
    self.model = BertForQuestionAnswering(config)
    self.model.eval()
    self.model.cuda()
    state_dict = torch.load(
        "build/data/bert_tf_v1_1_large_fp32_384_v2/model.pytorch")
    self.model.load_state_dict(state_dict)

    print("Constructing SUT...")
    self.sut = lg.ConstructSUT(self.issue_queries, self.flush_queries,
                               self.process_latencies)
    print("Finished constructing SUT.")
    self.qsl = get_squad_QSL(args.max_examples)
def load_from_tf(config, tf_path):
    """Build a ``BertForQuestionAnswering`` and fill it from a TF checkpoint.

    The loading loop is adapted from HuggingFace Transformers. The QA head is
    temporarily exposed as ``model.classifier`` so that the ``squad`` scope of
    the checkpoint resolves; the alias is removed again before returning.

    Args:
        config: Configuration passed to ``BertForQuestionAnswering``.
        tf_path: Checkpoint path given to ``tf.train.list_variables`` and
            ``tf.train.load_variable``.

    Returns:
        The model with its parameters replaced by the checkpoint arrays.

    Raises:
        AssertionError: When a checkpoint array and its target parameter have
            different shapes (both shapes are appended to the error args).
    """
    model = BertForQuestionAnswering(config)
    model.classifier = model.qa_outputs

    # Read every variable of the checkpoint before touching the model.
    checkpoint = [(var_name, tf.train.load_variable(tf_path, var_name))
                  for var_name, _shape in tf.train.list_variables(tf_path)]

    # adam_v / adam_m belong to AdamWeightDecayOptimizer and, like
    # global_step, are not needed to use the pretrained model.
    ignored_scopes = ["adam_v", "adam_m", "global_step"]
    weight_scopes = ("kernel", "gamma", "output_weights")
    bias_scopes = ("output_bias", "beta")

    for var_name, array in checkpoint:
        name = var_name.split("/")
        if any(part in ignored_scopes for part in name):
            print("Skipping {}".format("/".join(name)))
            continue

        pointer = model
        for m_name in name:
            # "layer_3" -> ["layer", "3", ""]; anything else stays whole.
            if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
                scope_names = re.split(r"_(\d+)", m_name)
            else:
                scope_names = [m_name]
            scope = scope_names[0]

            if scope in weight_scopes:
                pointer = pointer.weight
            elif scope in bias_scopes:
                pointer = pointer.bias
            elif scope == "squad":
                # Resolved through the temporary `classifier` alias.
                pointer = pointer.classifier
            else:
                try:
                    pointer = getattr(pointer, scope)
                except AttributeError:
                    print("Skipping {}".format("/".join(name)))
                    # NOTE: this resumes with the next path component only;
                    # `pointer` is left where it was and the variable is
                    # still processed below.
                    continue
            if len(scope_names) >= 2:
                pointer = pointer[int(scope_names[1])]

        # `m_name` is the last path component at this point.
        if m_name.endswith("_embeddings"):
            pointer = pointer.weight
        elif m_name == "kernel":
            array = np.transpose(array)

        try:
            assert pointer.shape == array.shape
        except AssertionError as e:
            e.args += (pointer.shape, array.shape)
            raise
        print("Initialize PyTorch weight {}".format(name))
        pointer.data = torch.from_numpy(array)

    # Undo the temporary alias.
    model.qa_outputs = model.classifier
    del model.classifier
    return model
def __init__(self, bert_dir, args):
    """Create the wrapped BERT QA model.

    Args:
        bert_dir: Directory/identifier given to ``from_pretrained``.
        args: Options object; reads ``load_pretrainedBERT`` and, when that is
            falsy, ``bert_dropout`` and ``mrc_dropout``.
    """
    super(BERTPretrainedMRC, self).__init__()
    if not args.load_pretrainedBERT:
        # Build the model from a configuration only, with custom dropout.
        self.bert_config = BertQueryNerConfig.from_pretrained(
            bert_dir,
            hidden_dropout_prob=args.bert_dropout,
            attention_probs_dropout_prob=args.bert_dropout,
            mrc_dropout=args.mrc_dropout,
        )
        self.bert = BertForQuestionAnswering(self.bert_config)
    else:
        self.bert = BertForQuestionAnswering.from_pretrained(bert_dir)
def load(self, fname=None):
    """Create the model, optimizer and scheduler, then restore a checkpoint.

    Args:
        fname: Optional path (``str`` or ``Path``) that replaces
            ``self.load_path`` before loading.

    Raises:
        ConfigError: If neither a pre-trained BERT nor an existing config
            file is configured, or if ``self.load_path`` is a ``Path`` whose
            parent directory does not exist.
    """
    if fname is not None:
        self.load_path = fname

    if self.pretrained_bert and not Path(self.pretrained_bert).is_file():
        self.model = BertForQuestionAnswering.from_pretrained(
            self.pretrained_bert,
            output_attentions=False,
            output_hidden_states=False)
    elif self.bert_config_file and Path(self.bert_config_file).is_file():
        self.bert_config = BertConfig.from_json_file(
            str(expand_path(self.bert_config_file)))

        # Keep-probabilities are converted to the dropout probabilities
        # that BertConfig expects.
        if self.attention_probs_keep_prob is not None:
            self.bert_config.attention_probs_dropout_prob = \
                1.0 - self.attention_probs_keep_prob
        if self.hidden_keep_prob is not None:
            self.bert_config.hidden_dropout_prob = 1.0 - self.hidden_keep_prob
        self.model = BertForQuestionAnswering(config=self.bert_config)
    else:
        raise ConfigError("No pre-trained BERT model is given.")

    self.model.to(self.device)

    self.optimizer = getattr(torch.optim, self.optimizer_name)(
        self.model.parameters(), **self.optimizer_parameters)
    if self.lr_scheduler_name is not None:
        self.lr_scheduler = getattr(torch.optim.lr_scheduler,
                                    self.lr_scheduler_name)(
            self.optimizer, **self.lr_scheduler_parameters)

    if self.load_path:
        logger.info(f"Load path {self.load_path} is given.")
        if isinstance(self.load_path, Path) and not self.load_path.parent.is_dir():
            raise ConfigError("Provided load path is incorrect!")

        # Wrap in Path first so that a plain string `fname` also works;
        # calling .resolve() directly on a str raised AttributeError.
        weights_path = Path(self.load_path).resolve()
        weights_path = weights_path.with_suffix(".pth.tar")
        if weights_path.exists():
            logger.info(f"Load path {weights_path} exists.")
            logger.info(
                f"Initializing `{self.__class__.__name__}` from saved.")

            # Restore the weights and the optimizer state from the file.
            logger.info(f"Loading weights from {weights_path}.")
            checkpoint = torch.load(weights_path, map_location=self.device)
            self.model.load_state_dict(checkpoint["model_state_dict"])
            self.optimizer.load_state_dict(
                checkpoint["optimizer_state_dict"])
            self.epochs_done = checkpoint.get("epochs_done", 0)
        else:
            logger.info(
                f"Init from scratch. Load path {weights_path} does not exist."
            )
def construct_qa_transformer(options: KaggleEvaluationOptions) -> Reranker:
    """Build a QA-based reranker from the model named in ``options``.

    The checkpoint is first loaded as a sequence-classification model (a
    workaround flagged for refactoring in the original); its encoder and
    classifier head are then transplanted into a ``BertForQuestionAnswering``.
    """
    try:
        seq_cls_model = AutoModelForSequenceClassification.from_pretrained(
            options.model_name)
    except OSError:
        # Retry, this time loading the checkpoint as TensorFlow weights.
        seq_cls_model = AutoModelForSequenceClassification.from_pretrained(
            options.model_name, from_tf=True)

    qa_model = BertForQuestionAnswering(seq_cls_model.config)
    qa_model.qa_outputs = seq_cls_model.classifier
    qa_model.bert = seq_cls_model.bert

    target_device = torch.device(options.device)
    qa_model = qa_model.to(target_device).eval()

    tokenizer = AutoTokenizer.from_pretrained(
        options.tokenizer_name, do_lower_case=options.do_lower_case)
    return QuestionAnsweringTransformerReranker(qa_model, tokenizer)
def model_fn(model_dir):
    """Load the BERT QA model stored in ``model_dir``.

    Args:
        model_dir: Directory (as a string) containing ``config_file.json``
            and ``pytorch_model.bin``.

    Returns:
        A ``BertForQuestionAnswering`` with the saved weights loaded, in
        evaluation mode.
    """
    config_path = model_dir + '/config_file.json'
    model_path = model_dir + '/pytorch_model.bin'

    config = BertConfig.from_json_file(config_path)
    model = BertForQuestionAnswering(config)

    # Deserialize onto the GPU when one is available, otherwise onto the CPU.
    load_device = torch.device(
        'cuda' if torch.cuda.is_available() else 'cpu')
    model.load_state_dict(torch.load(model_path, map_location=load_device))

    # A freshly constructed module is in training mode; switch it to
    # evaluation mode so dropout is disabled when the model is used for
    # prediction. NOTE(review): assumes this loader is used for inference.
    model.eval()
    return model
def main():
    """Fine-tune ``bert-base-cased`` for QA, save it, reload it and evaluate.

    Returns:
        The value returned by ``evaluate`` for the reloaded model.
    """
    # Set seed
    seed = 42
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Setup CUDA, GPU & distributed training
    device = torch.device("cuda")

    config = BertConfig.from_pretrained('bert-base-cased')
    # NOTE(review): do_lower_case=True is combined with a *cased* checkpoint
    # here and below; kept as in the original - confirm this is intended.
    tokenizer = BertTokenizer.from_pretrained('bert-base-cased',
                                              do_lower_case=True)
    # BertForQuestionAnswering has a single untrained layer,
    # qa_outputs: Linear(hidden_size, 2), on top of the trained BERT-base.
    model = BertForQuestionAnswering.from_pretrained('bert-base-cased',
                                                     config=config)
    model.to(device)

    train_dataset = load_and_cache_examples(tokenizer, is_training=True)[0]

    # Training
    global_step, ave_loss = train(train_dataset, model, tokenizer)
    # The original referenced an undefined `tr_loss` here and handed the
    # values to print() unformatted. Presumably train() already returns the
    # averaged loss (it is bound to `ave_loss`) - verify against train().
    print(" global_step = %s, average loss = %s" % (global_step, ave_loss))

    # Save the trained model and the tokenizer
    output_dir = 'output/'
    # Create output directory if needed
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    print("Saving model checkpoint to %s" % output_dir)

    # Save a trained model, configuration and tokenizer using
    # `save_pretrained()`; they can be reloaded using `from_pretrained()`.
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    # Load the fine-tuned model and vocabulary back from disk
    model = BertForQuestionAnswering.from_pretrained(output_dir)
    tokenizer = BertTokenizer.from_pretrained(output_dir, do_lower_case=True)
    model.to(device)

    # Evaluate
    results = evaluate(model, tokenizer)
    print("Results: {}".format(results))
    # Was `return result`, an undefined name.
    return results
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path: str,
                                     bert_config_file: str,
                                     pytorch_dump_path: str) -> None:
    """Convert a TensorFlow BERT checkpoint into a PyTorch state dict file.

    Args:
        tf_checkpoint_path: TensorFlow checkpoint to read the weights from.
        bert_config_file: JSON file describing the BERT configuration.
        pytorch_dump_path: Destination file for ``torch.save``.
    """
    # Instantiate an empty QA model from the configuration.
    bert_config = BertConfig.from_json_file(bert_config_file)
    print("Building PyTorch model from configuration: {}".format(
        str(bert_config)))
    qa_model = BertForQuestionAnswering(bert_config)

    # Fill it with the weights of the TF checkpoint.
    load_tf_weights_in_bert(qa_model, bert_config, tf_checkpoint_path)

    # Write the resulting state dict to disk.
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    weights = qa_model.state_dict()
    torch.save(weights, pytorch_dump_path)
def nlpQuestion(question):
    """Answer ``question`` from a fixed COVID-19 reference passage.

    Args:
        question: The question text.

    Returns:
        The predicted answer span as a string, with word pieces merged. The
        string is empty when the predicted end precedes the predicted start.
    """
    print("nlptriggered")
    text = """ Coronaviruses are a large family of viruses that can cause illness in animals or humans. In humans, several coronaviruses are known to cause respiratory infections ranging from the common cold to more severe diseases such as Middle East Respiratory Syndrome (MERS) and Severe Acute Respiratory Syndrome (SARS). COVID-19 is a virus of the same family with a first recorded outbreak in Wuhan, China, in December 2019. The most common symptoms of COVID-19 are fever, tiredness, and dry cough. Other symptoms include aches and pains, nasal congestion, runny nose, sore throat or diarrhea. These symptoms are usually mild and begin gradually. Some people become infected but don’t develop any symptoms and don't feel unwell. Most people (about 80%) recover from the disease without needing special treatment. Around 1 out of every 6 people who gets COVID-19 becomes seriously ill and develops difficulty breathing. Older people, and those with underlying medical problems like high blood pressure, heart problems or diabetes, are more likely to develop serious illness. People with fever, cough and difficulty breathing should seek medical attention. People can catch COVID-19 from others who have the virus. The disease can spread from person to person through small droplets from the nose or mouth which are spread when a person with COVID-19 coughs or exhales. These droplets land on objects and surfaces around the person. Other people then catch COVID-19 by touching these objects or surfaces, then touching their eyes, nose or mouth. People can also catch COVID-19 if they breathe in droplets from a person with COVID-19 who coughs out or exhales droplets. This is why it is important to stay more than 1 meter (3 feet) away from a person who is sick. Studies to date suggest that the virus that causes COVID-19 is mainly transmitted through contact with respiratory droplets rather than through the air. 
There have been 105000 confirmed cases of coronovirus in the world, with 3100 deaths. There are only 32 confirmed cases in Lebanon. If you are experiencing symptoms, call MOPH on 1214 or 76592699. """
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertForQuestionAnswering.from_pretrained(
        'bert-large-uncased-whole-word-masking-finetuned-squad')
    print("nlptriggered")

    input_ids = tokenizer.encode(question, text)
    # Segment 0 runs up to and including the first [SEP]; the rest is
    # segment 1. Look the separator up once via the tokenizer instead of
    # searching for the hard-coded id 102 on every iteration.
    sep_index = input_ids.index(tokenizer.sep_token_id)
    token_type_ids = [0] * (sep_index + 1) + \
        [1] * (len(input_ids) - sep_index - 1)

    with torch.no_grad():
        outputs = model(torch.tensor([input_ids]),
                        token_type_ids=torch.tensor([token_type_ids]))
    # Index access works for both tuple returns and ModelOutput objects;
    # unpacking a ModelOutput directly would yield its field names instead.
    start_scores, end_scores = outputs[0], outputs[1]
    print("nlptriggered")

    all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
    answer_start = int(torch.argmax(start_scores))
    answer_end = int(torch.argmax(end_scores))
    answer = ' '.join(all_tokens[answer_start:answer_end + 1]).replace(
        ' ##', '')
    print(answer)
    return answer
def download_model():
    """Download the two model artifacts unless they are already present.

    Model 1/2 (``usecase_indicator.h5``) is streamed from S3 and model 2/2 is
    fetched through ``from_pretrained`` and saved under ``./BertLSquad``.
    Each download is followed by the creation of an empty marker file
    (``model_downloaded`` / ``modelqna_downloaded``); a download is skipped
    only when both its marker and its artifact exist.

    Raises:
        requests.HTTPError: If the server answers the first download with an
            error status.
    """
    if (not Path("model_downloaded").is_file()
            or not Path("usecase_indicator.h5").is_file()):
        url = "https://b0ykepubbucket.s3-eu-west-1.amazonaws.com/usecase_indicator.h5"
        chunk_progress = 0
        # The context manager releases the connection even on failure.
        with requests.get(url, stream=True) as r:
            # Without this check an HTTP error page would be written to disk
            # and then marked as a successfully downloaded model.
            r.raise_for_status()
            with open("usecase_indicator.h5", "wb") as modelfile:
                # 8388608 bytes = 8 MiB per chunk, hence the "* 8" below.
                for chunk in r.iter_content(chunk_size=8388608):
                    if chunk:
                        modelfile.write(chunk)
                        chunk_progress += 1
                        print(
                            f"Downloading model 1/2 in background: {chunk_progress*8}MB"
                        )
                        sys.stdout.flush()
        # Reached only after the whole stream was written successfully.
        open("model_downloaded", "w").close()

    if (not Path("modelqna_downloaded").is_file()
            or not Path("./BertLSquad/pytorch_model.bin").is_file()):
        print(f"Started model 2/2 download in background")
        sys.stdout.flush()
        model = BertForQuestionAnswering.from_pretrained(
            'bert-large-uncased-whole-word-masking-finetuned-squad')
        model.save_pretrained("./BertLSquad")
        open("modelqna_downloaded", "w").close()
        print("Model 2/2 download completed")
        sys.stdout.flush()
    return
def configure_tokenizer_model_bert(args, logger, is_preprocess=False):
    """Load the tokenizer, the configuration and the model described by ``args``.

    Args:
        args: Options object; reads ``config_name``, ``model_name_or_path``,
            ``do_lower_case`` and ``init_dir``.
        logger: Logger used for progress messages.
        is_preprocess: When true a plain ``AutoModel`` is loaded from
            ``args.model_name_or_path``; otherwise a
            ``BertForQuestionAnswering`` is loaded from ``args.init_dir``.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    # The explicit config name wins over the model name when it is set.
    source_name = args.config_name or args.model_name_or_path

    logger.info("***** Loading tokenizer *****")
    tokenizer = BertTokenizer.from_pretrained(
        source_name, do_lower_case=args.do_lower_case)

    logger.info("***** Loading configuration from {} ******".format(
        args.init_dir))
    config = BertConfig.from_pretrained(source_name, cache_dir=args.init_dir)
    # Keep the configured vocabulary size in step with the tokenizer.
    config.vocab_size = len(tokenizer.vocab)

    logger.info("***** Loading pretrained model from {} *****".format(
        args.init_dir))
    if not is_preprocess:
        model = BertForQuestionAnswering.from_pretrained(
            args.init_dir, config=config, cache_dir=args.init_dir)
    else:
        model = AutoModel.from_pretrained(
            args.model_name_or_path, config=config, cache_dir=args.init_dir)
    return tokenizer, model
def answergen_bert(context, question):
    """Extract the answer to ``question`` from ``context`` with a SQuAD BERT.

    Args:
        context: Passage that is expected to contain the answer.
        question: The question text.

    Returns:
        The predicted answer span decoded back into a string.
    """
    tokenizer = BertTokenizer.from_pretrained(
        'csarron/bert-base-uncased-squad-v1')
    model = BertForQuestionAnswering.from_pretrained(
        'csarron/bert-base-uncased-squad-v1')
    # Alternative checkpoint:
    # 'bert-large-uncased-whole-word-masking-finetuned-squad'

    encoding = tokenizer.encode_plus(question, context)
    input_ids = encoding["input_ids"]
    model_kwargs = {
        "attention_mask": torch.tensor([encoding["attention_mask"]]),
    }
    # Tell the model which tokens are question and which are context, as the
    # other QA helpers in this file do.
    if "token_type_ids" in encoding:
        model_kwargs["token_type_ids"] = torch.tensor(
            [encoding["token_type_ids"]])

    with torch.no_grad():
        outputs = model(torch.tensor([input_ids]), **model_kwargs)
    # Index access works for both tuple returns and ModelOutput objects.
    start_scores, end_scores = outputs[0], outputs[1]

    # Position 0 is excluded from the search, so the argmax is relative to
    # index 1; add the offset back before slicing `input_ids`. Without it
    # the extracted span was shifted one token to the left.
    answer_start = int(torch.argmax(start_scores[0, 1:])) + 1
    answer_end = int(torch.argmax(end_scores[0, 1:])) + 1
    ans_tokens = input_ids[answer_start:answer_end + 1]

    answer_tokens = tokenizer.convert_ids_to_tokens(ans_tokens)
    print("\nQuestion ", question)
    answer_tokens_to_string = tokenizer.convert_tokens_to_string(
        answer_tokens)
    print("\nAnswer : ", answer_tokens_to_string)
    return answer_tokens_to_string
def __init__(self, model_configs):
    """Load the QA model (with attention outputs enabled) and its tokenizer.

    Args:
        model_configs: Mapping providing ``'pretrained_model_name'``,
            ``'cache_dir'`` and ``'tokenizer_name'``.
    """
    self.model_configs = model_configs
    settings = self.model_configs

    self.pretrained_model = BertForQuestionAnswering.from_pretrained(
        settings['pretrained_model_name'],
        cache_dir=settings['cache_dir'],
        output_attentions=True,
    )
    self.tokenizer = BertTokenizer.from_pretrained(
        settings['tokenizer_name'])
def create_graphics(self, url_base, model_card_path):
    """Render the Bokeh plots for the model card.

    Args:
        url_base: Base URL; ``"/images"`` is appended for the density plot.
        model_card_path: Path object; images go to its ``images`` child.

    Returns:
        A dict with a ``"density_info"`` entry and, when the checkpoint
        config lists pruned heads, a ``"pruning_info"`` entry. Each entry
        holds the ``js`` and ``html`` produced by the plotter.
    """
    graphics = {}

    pruned_heads = self.checkpoint_info["config"].get("pruned_heads")
    if pruned_heads is not None:
        heads_plotter = PruningInfoBokehPlotter("pruning_info", self.JS_PATH)
        fig, js, html = heads_plotter.run(layer_count=12,
                                          pruned_heads=pruned_heads,
                                          heads_count=12)
        graphics["pruning_info"] = {"js": js, "html": html}

    density_plotter = DensityBokehPlotter("density", self.JS_PATH)
    model = BertForQuestionAnswering.from_pretrained(self.git_path)
    fig, js, html = density_plotter.run(
        model=model,
        dest_path=model_card_path / "images",
        url_base=url_base + "/images",
    )
    graphics["density_info"] = {"js": js, "html": html}

    # Imported here, as in the original, so bokeh's PNG export is only
    # required once the plots have been produced.
    from bokeh.io import export_png
    export_png(fig, filename="/tmp/plot.png")
    return graphics
def load_model(model_path):
    """Load a QA model from ``model_path`` ready for evaluation.

    The model is moved to the module-level ``device``, switched to
    evaluation mode and has its gradients cleared before being returned.
    """
    qa_model = BertForQuestionAnswering.from_pretrained(model_path)
    for prepare in (lambda: qa_model.to(device), qa_model.eval,
                    qa_model.zero_grad):
        prepare()
    return qa_model
def __init__(self):
    """Pick the compute device and load the DRCD tokenizer and QA model."""
    device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
    self.device = torch.device(device_name)

    self.model_name = "nyust-eb210/braslab-bert-drcd-384"
    self.tokenizer = BertTokenizerFast.from_pretrained(self.model_name)

    qa_model = BertForQuestionAnswering.from_pretrained(self.model_name)
    self.model = qa_model.to(self.device)
def load_qa_model():
    """Return the SQuAD-fine-tuned BERT-large model and its tokenizer.

    Returns:
        A ``(model, tokenizer)`` tuple, both loaded from the same checkpoint.
    """
    checkpoint = 'bert-large-uncased-whole-word-masking-finetuned-squad'
    qa_model = BertForQuestionAnswering.from_pretrained(checkpoint)
    qa_tokenizer = BertTokenizer.from_pretrained(checkpoint)
    return qa_model, qa_tokenizer
def train():
    """Load BERT and the CORD-19 publications, then fit a ``QuestionCovid``.

    Returns:
        The fitted ``QuestionCovid`` instance.
    """
    with msg.loading(" Loading BERT"):
        bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        bert_model = BertForQuestionAnswering.from_pretrained(
            'bert-large-uncased-whole-word-masking-finetuned-squad')
    msg.good(" BERT loaded")

    # Location of the raw dataset, relative to this script.
    articles_dir = os.path.join(SCRIPT_PATH,
                                '../data/raw/CORD-19-research-challenge/')
    articles_folders = [
        'biorxiv_medrxiv/biorxiv_medrxiv/pdf_json/',
        'comm_use_subset/comm_use_subset/pdf_json/',
        'comm_use_subset/comm_use_subset/pmc_json/',
        'noncomm_use_subset/noncomm_use_subset/pdf_json/',
        'noncomm_use_subset/noncomm_use_subset/pmc_json/',
        'custom_license/custom_license/pdf_json/',
        'custom_license/custom_license/pmc_json/'
    ]
    meta_path = articles_dir + 'metadata.csv'

    with msg.loading(" Loading publications"):
        started_at = time.time()
        loaded = get_data_texts(articles_dir, articles_folders, meta_path)
        data_text, index2paperID, index2paperPath = loaded
    elapsed = time.time() - started_at
    msg.good(" Publications loaded - Took {:.2f}s".format(elapsed))

    covid_q = QuestionCovid(bert_tokenizer, bert_model, index2paperID,
                            index2paperPath)
    covid_q.fit(data_text)
    return covid_q
def _get_question_answering(self):
    """
    Create the BertForQuestionAnswering transformer and store it on ``self.qa``.

    NOTE: the bert-large-uncased-whole-word-masking-finetuned-squad
    checkpoint is used for best results. The model is put in evaluation mode.
    """
    checkpoint = 'bert-large-uncased-whole-word-masking-finetuned-squad'
    self.qa = BertForQuestionAnswering.from_pretrained(checkpoint)
    self.qa.eval()
def test_patch_module_ampere(self):
    """Patching every linear layer with one pruning patcher yields 72 patches."""
    bert_config = BertConfig.from_pretrained("bert-base-uncased")
    model = BertForQuestionAnswering(bert_config)

    pruning_args = LinearPruningArgs(
        method="topK",
        submethod="default",
        ampere_method="annealing",
        block_rows=32,
        block_cols=32,
        min_elements=0.005,
    )
    context = PatcherContext()
    shared_patcher = LinearPruningModulePatcher(context, pruning_args,
                                                self.MODEL_STRUCTURE)

    # The same patcher instance handles every kind of linear layer.
    layer_kinds = ("query", "key", "value", "att_dense", "interm_dense",
                   "output_dense")
    module_patchers = dict.fromkeys(layer_kinds, shared_patcher)

    patcher = LinearModelPatcher(module_patchers, self.MODEL_STRUCTURE)
    patcher.patch(model)
    self.assertEqual(patcher.stats["patched"], 72)

    key_sizes = {}
    for kind, modules in context.context_modules.items():
        key_sizes[kind] = len(modules)
    self.assertEqual(key_sizes, {"ampere_mask": 72, "mask": 72})
def test_patch_module_tied_attention(self):
    """Joint patching of query/key/value leaves 48 masks for 72 patches."""
    bert_config = BertConfig.from_pretrained("bert-base-uncased")
    model = BertForQuestionAnswering(bert_config)

    pruning_params = LinearPruningParameters(
        method="topK",
        submethod="default",
        ampere_method="annealing",
        block_rows=32,
        block_cols=32,
    )
    context = PatcherContext()

    # query/key/value share one joint patcher; the dense layers share another.
    attention_patcher = JointPruningModulePatcher(context, pruning_params,
                                                  "attention")
    dense_patcher = LinearPruningModulePatcher(context, pruning_params)
    module_patchers = {
        "query": attention_patcher,
        "key": attention_patcher,
        "value": attention_patcher,
        "att_dense": dense_patcher,
        "interm_dense": dense_patcher,
        "output_dense": dense_patcher,
    }

    patcher = BertLinearModelPatcher(module_patchers)
    patcher.patch(model)
    self.assertEqual(patcher.stats["patched"], 72)

    key_sizes = {}
    for kind, modules in context.context_modules.items():
        key_sizes[kind] = len(modules)
    self.assertEqual(key_sizes, {"ampere_mask": 72, "mask": 48})
def __init__(self,
             qa_path,
             relations_filepath,
             data_directory,
             batch_size,
             must_choose_answer,
             device,
             trained_to_reject,
             calculate_single_error=True):
    """Store the evaluation settings and load the QA model and tokenizer.

    Args:
        qa_path: Path to the QA weights, given to ``from_pretrained``.
        relations_filepath: Path to the relations file.
        data_directory: Path of the data directory.
        batch_size: Stored on ``self.batch_size``.
        must_choose_answer: For datasets where there is always an answer,
            setting this to true will ensure that QA models that can return
            "answer doesn't exist" will always return a span in the context.
        device: Device the model is moved to.
        trained_to_reject: Stored on ``self.trained_to_reject``.
        calculate_single_error: When true ``self.se_list`` starts as an
            empty list, otherwise it is ``None``.
    """
    self.trained_to_reject = trained_to_reject
    self.qa_path = qa_path
    self.relations_filepath = relations_filepath
    self.data_directory = data_directory

    self.tokenizer = BertTokenizer.from_pretrained('bert-large-cased')
    self.model = BertForQuestionAnswering.from_pretrained(qa_path)
    self.model.to(device)

    self.device = device
    self.batch_size = batch_size
    self.must_choose_answer = must_choose_answer
    self.total_samples = 0
    self.se_list = [] if calculate_single_error else None
def load_model(self, model_path: str, do_lower_case=False):
    """Load a QA model and tokenizer from a local model directory.

    Args:
        model_path: Directory containing ``bert_config.json`` together with
            the tokenizer and model files.
        do_lower_case: Forwarded to the tokenizer.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    config_file = model_path + "/bert_config.json"
    bert_config = BertConfig.from_pretrained(config_file)
    bert_tokenizer = BertTokenizer.from_pretrained(
        model_path, do_lower_case=do_lower_case)
    qa_model = BertForQuestionAnswering.from_pretrained(
        model_path, from_tf=False, config=bert_config)
    return qa_model, bert_tokenizer
def get_answer_using_bert(question, reference_text):
    """Return the span of ``reference_text`` that BERT picks as the answer.

    Args:
        question: The question text.
        reference_text: Passage to search for the answer.

    Returns:
        The predicted span as a string with word pieces merged.
    """
    checkpoint = 'bert-large-uncased-whole-word-masking-finetuned-squad'
    # Fine-tuned QA model and the matching vocabulary.
    bert_model = BertForQuestionAnswering.from_pretrained(checkpoint)
    bert_tokenizer = BertTokenizer.from_pretrained(checkpoint)

    # Encode the question/passage pair and keep the readable tokens.
    input_ids = bert_tokenizer.encode(question, reference_text)
    input_tokens = bert_tokenizer.convert_ids_to_tokens(input_ids)

    # Everything up to and including the first [SEP] is segment 0.
    sep_location = input_ids.index(bert_tokenizer.sep_token_id)
    first_seg_len = sep_location + 1
    second_seg_len = len(input_ids) - first_seg_len
    seg_embedding = [0] * first_seg_len + [1] * second_seg_len

    model_scores = bert_model(torch.tensor([input_ids]),
                              token_type_ids=torch.tensor([seg_embedding]))
    ans_start_loc = torch.argmax(model_scores[0])
    ans_end_loc = torch.argmax(model_scores[1])

    answer_tokens = input_tokens[ans_start_loc:ans_end_loc + 1]
    # Glue the "##" continuation pieces back onto their words.
    return ' '.join(answer_tokens).replace(' ##', '')
def load_model(self):
    """Load ``bert-base-uncased`` for QA with the config at ``self.c_path``.

    The model is stored on ``self.model``, moved to ``self.device`` and put
    in evaluation mode.

    Returns:
        The loaded model (the same object as ``self.model``).
    """
    bert_config = BertConfig.from_pretrained(self.c_path)
    qa_model = BertForQuestionAnswering.from_pretrained('bert-base-uncased',
                                                        config=bert_config)
    self.model = qa_model
    self.model.to(self.device)
    self.model.eval()
    return self.model
async def main(message: types.Message):
    """Reply to ``message`` with BERT's answer to a fixed demo question.

    Args:
        message: Incoming message; the answer is sent with ``message.reply``.
    """
    import torch

    model = BertForQuestionAnswering.from_pretrained(
        'bert-large-uncased-whole-word-masking-finetuned-squad')
    tokenizer = BertTokenizer.from_pretrained(
        'bert-large-uncased-whole-word-masking-finetuned-squad')

    question = 'The purpose of the NewsQA dataset'
    paragraph = 'With massive volumes of written text being produced every second, how do we make sure that we have the most recent and relevant information available to us? Microsoft research Montreal is tackling this problem by building AI systems that can read and comprehend large volumes of complex text in real-time. The purpose of the NewsQA dataset is to help the research community build algorithms that are capable of answering questions requiring human-level comprehension and reasoning skills.'

    # The keyword is `add_special_tokens`; the original passed the misspelled
    # `add_special`, which is not a tokenizer option.
    encoding = tokenizer.encode_plus(text=question,
                                     text_pair=paragraph,
                                     add_special_tokens=True)
    # Token ids.
    inputs = encoding['input_ids']
    # Segment ids (question vs. paragraph).
    sentence_embed = encoding['token_type_ids']
    # Readable tokens, used to rebuild the answer.
    tokens = tokenizer.convert_ids_to_tokens(inputs)

    # return_dict=False makes the model return a plain tuple that can be
    # unpacked directly.
    start_scores, end_scores = model(
        input_ids=torch.tensor([inputs]),
        token_type_ids=torch.tensor([sentence_embed]),
        return_dict=False)
    start_index = torch.argmax(start_scores)
    end_index = torch.argmax(end_scores)

    answer = ' '.join(tokens[start_index:end_index + 1])
    await message.reply(text=answer)
def train(args):
    """Fine-tune the pretrained QA model on CMRC2018 and save the result.

    Args:
        args: Options object; reads ``model_path``, ``batch_size``,
            ``warmup_steps``, ``log_path``, ``n_epochs`` and ``num_workers``
            and is also handed to ``CMRC2018``.
    """
    model = BertForQuestionAnswering.from_pretrained(PRETRAINED_MODEL_PATH)
    tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_PATH)
    # Match the embedding matrix to the tokenizer's vocabulary size.
    model.resize_token_embeddings(len(tokenizer))

    datasets = CMRC2018(args=args, tokenizer=tokenizer)()

    trainer_options = dict(
        output_dir=args.model_path,
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=args.batch_size,
        warmup_steps=args.warmup_steps,
        remove_unused_columns=False,
        logging_dir=args.log_path,
        num_train_epochs=args.n_epochs,
        dataloader_num_workers=args.num_workers,
        evaluation_strategy='epoch',
    )
    training_args = TrainingArguments(**trainer_options)

    print(
        f"Train dataset size: {len(datasets['train'])}, Validation dataset size: {len(datasets['validation'])}"
    )

    trainer = Trainer(model=model,
                      args=training_args,
                      train_dataset=datasets['train'],
                      eval_dataset=datasets['validation'])
    trainer.train()
    trainer.save_model()
def answer_question(question, answer_text, model_name=None, tokenizer_name=None):
    """
    Takes a `question` string and an `answer_text` string (which contains the
    answer), and identifies the words within the `answer_text` that are the
    answer.

    Parameters
    ----------
    question : str
    answer_text : str
    model_name : str, optional
        Checkpoint for the QA model; defaults to the SQuAD v1 fine-tuned
        bert-large checkpoint.
    tokenizer_name : str, optional
        Checkpoint for the tokenizer; same default as `model_name`.

    Return
    -------
    answer : str
    """
    # ======== Model & Tokenizer (default: bert-large finetuned squad ver.1) ========
    default_checkpoint = 'bert-large-uncased-whole-word-masking-finetuned-squad'
    if model_name is None:
        model_name = default_checkpoint
    if tokenizer_name is None:
        tokenizer_name = default_checkpoint

    model = BertForQuestionAnswering.from_pretrained(model_name)
    tokenizer = BertTokenizer.from_pretrained(tokenizer_name)

    # ======== Tokenize ========
    input_ids = tokenizer.encode(question, answer_text)

    # ======== Set Segment IDs ========
    # Segment A runs up to and including the first [SEP]; the rest is B.
    sep_index = input_ids.index(tokenizer.sep_token_id)
    num_seg_a = sep_index + 1
    num_seg_b = len(input_ids) - num_seg_a
    segment_ids = [0] * num_seg_a + [1] * num_seg_b

    # There should be a segment_id for every input token.
    assert len(segment_ids) == len(input_ids)

    # ======== Evaluate ========
    with torch.no_grad():
        outputs = model(torch.tensor([input_ids]),
                        token_type_ids=torch.tensor([segment_ids]))
    # Index access works for both tuple returns and ModelOutput objects;
    # unpacking a ModelOutput directly would yield its field names instead.
    start_scores, end_scores = outputs[0], outputs[1]

    # ======== Reconstruct Answer ========
    answer_start = int(torch.argmax(start_scores))
    answer_end = int(torch.argmax(end_scores))

    tokens = tokenizer.convert_ids_to_tokens(input_ids)
    answer = tokens[answer_start]
    for token in tokens[answer_start + 1:answer_end + 1]:
        # "##" marks a continuation piece: append it without a space.
        if token[0:2] == '##':
            answer += token[2:]
        else:
            answer += ' ' + token
    return answer