def load_and_predict(data_dir, model_type, pretrain_model):
    # Pick the model/tokenizer pair that matches the requested architecture.
    if model_type == 'bert_japanese':
        model = BertForQuestionAnswering.from_pretrained('cl-tohoku/bert-base-japanese')
        tokenizer = BertJapaneseTokenizer.from_pretrained('cl-tohoku/bert-base-japanese')
    elif model_type == 'bert_multilingual':
        model = BertForQuestionAnswering.from_pretrained('bert-base-multilingual-cased')
        tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased',
                                                  tokenize_chinese_chars=False)
    elif model_type == 'albert':
        model = AlbertForQuestionAnswering.from_pretrained('ALINEAR/albert-japanese-v2')
        tokenizer = AlbertTokenizer.from_pretrained('ALINEAR/albert-japanese-v2')

    test_data = TestData(data_dir, TAG)
    testset = QADataset(test_data.examples, "test", tokenizer=tokenizer)
    testloader = DataLoader(testset, batch_size=4, collate_fn=collate_fn)

    model = model.to(device)
    model.load_state_dict(torch.load(pretrain_model))
    prediction = predict(model, testloader, device, tokenizer)
    prediction = func(data_dir, prediction)
    print('finish loading and predicting from {}!'.format(pretrain_model))
    return prediction  # prediction dictionary
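# A minimal usage sketch for load_and_predict above. Hedged: the helpers it
# relies on (TestData, QADataset, collate_fn, predict, func, TAG, device) are
# assumed to be defined elsewhere in this repo, and the paths below are
# hypothetical placeholders.
def demo_load_and_predict():
    prediction = load_and_predict(
        data_dir='data/',                          # hypothetical data directory
        model_type='bert_japanese',                # or 'bert_multilingual' / 'albert'
        pretrain_model='checkpoints/qa_model.pt',  # hypothetical fine-tuned weights
    )
    return prediction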
def main():
    # Set seed for reproducibility
    seed = 42
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Setup CUDA, GPU & distributed training
    device = torch.device("cuda")

    config = BertConfig.from_pretrained('bert-base-cased')
    tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=True)
    # BertForQuestionAnswering adds a single untrained layer,
    # qa_outputs: Linear(hidden_size, 2), on top of the pretrained BERT-base encoder.
    model = BertForQuestionAnswering.from_pretrained('bert-base-cased', config=config)
    model.to(device)

    train_dataset = load_and_cache_examples(tokenizer, is_training=True)[0]

    # Training
    global_step, ave_loss = train(train_dataset, model, tokenizer)
    print(" global_step = {}, average loss = {}".format(global_step, ave_loss))

    # Save the trained model and the tokenizer
    output_dir = 'output/'
    # Create output directory if needed
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    print("Saving model checkpoint to {}".format(output_dir))
    # Save a trained model, configuration and tokenizer using `save_pretrained()`.
    # They can then be reloaded using `from_pretrained()`.
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    # Load the fine-tuned model and vocabulary back from disk
    model = BertForQuestionAnswering.from_pretrained(output_dir)
    tokenizer = BertTokenizer.from_pretrained(output_dir, do_lower_case=True)
    model.to(device)

    # Evaluate
    results = evaluate(model, tokenizer)
    print("Results: {}".format(results))
    return results
def load_qa_model():
    # Model fine-tuned on SQuAD
    model = BertForQuestionAnswering.from_pretrained(
        'bert-large-uncased-whole-word-masking-finetuned-squad')
    # Matching tokenizer
    tokenizer = BertTokenizer.from_pretrained(
        'bert-large-uncased-whole-word-masking-finetuned-squad')
    return model, tokenizer
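# A minimal end-to-end sketch using the (model, tokenizer) pair from
# load_qa_model. The question/context strings are made up; on recent
# transformers versions the model returns an output object, so we read
# .start_logits / .end_logits instead of unpacking a tuple.
def demo_load_qa_model():
    import torch
    model, tokenizer = load_qa_model()
    question = "Where was BERT developed?"
    context = "BERT was developed by researchers at Google."
    inputs = tokenizer(question, context, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    start = torch.argmax(outputs.start_logits)
    end = torch.argmax(outputs.end_logits)
    return tokenizer.decode(inputs["input_ids"][0][start:end + 1])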
def download_model():
    if (not Path("model_downloaded").is_file()
            or not Path("usecase_indicator.h5").is_file()):
        url = "https://b0ykepubbucket.s3-eu-west-1.amazonaws.com/usecase_indicator.h5"
        r = requests.get(url, stream=True)
        chunk_progress = 0
        with open("usecase_indicator.h5", "wb") as modelfile:
            for chunk in r.iter_content(chunk_size=8388608):  # 8 MiB chunks
                if chunk:
                    modelfile.write(chunk)
                    chunk_progress += 1
                    print(f"Downloading model 1/2 in background: {chunk_progress * 8}MB")
                    sys.stdout.flush()
        # Mark model 1/2 as downloaded once the stream is fully written.
        open("model_downloaded", "w").close()

    if (not Path("modelqna_downloaded").is_file()
            or not Path("./BertLSquad/pytorch_model.bin").is_file()):
        print("Started model 2/2 download in background")
        sys.stdout.flush()
        model = BertForQuestionAnswering.from_pretrained(
            'bert-large-uncased-whole-word-masking-finetuned-squad')
        model.save_pretrained("./BertLSquad")
        open("modelqna_downloaded", "w").close()
        print("Model 2/2 download completed")
        sys.stdout.flush()
    return
def train():
    with msg.loading(" Loading BERT"):
        TOKENIZER = BertTokenizer.from_pretrained('bert-base-uncased')
        MODEL = BertForQuestionAnswering.from_pretrained(
            'bert-large-uncased-whole-word-masking-finetuned-squad')
    msg.good(" BERT loaded")

    articles_dir = os.path.join(SCRIPT_PATH,
                                '../data/raw/CORD-19-research-challenge/')
    articles_folders = [
        'biorxiv_medrxiv/biorxiv_medrxiv/pdf_json/',
        'comm_use_subset/comm_use_subset/pdf_json/',
        'comm_use_subset/comm_use_subset/pmc_json/',
        'noncomm_use_subset/noncomm_use_subset/pdf_json/',
        'noncomm_use_subset/noncomm_use_subset/pmc_json/',
        'custom_license/custom_license/pdf_json/',
        'custom_license/custom_license/pmc_json/',
    ]
    meta_path = articles_dir + 'metadata.csv'

    with msg.loading(" Loading publications"):
        start = time.time()
        data_text, index2paperID, index2paperPath = get_data_texts(
            articles_dir, articles_folders, meta_path)
    msg.good(" Publications loaded - Took {:.2f}s".format(time.time() - start))

    covid_q = QuestionCovid(TOKENIZER, MODEL, index2paperID, index2paperPath)
    covid_q.fit(data_text)
    return covid_q
def _get_question_answering(self):
    """
    Initializes the BertForQuestionAnswering transformer.

    NOTE: This uses the bert-large-uncased-whole-word-masking-finetuned-squad
    pretrained weights for best results.
    """
    self.qa = BertForQuestionAnswering.from_pretrained(
        'bert-large-uncased-whole-word-masking-finetuned-squad')
    self.qa.eval()
def __init__(self):
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # Chinese extractive-QA model fine-tuned on DRCD
    self.model_name = "nyust-eb210/braslab-bert-drcd-384"
    self.tokenizer = BertTokenizerFast.from_pretrained(self.model_name)
    self.model = BertForQuestionAnswering.from_pretrained(self.model_name).to(
        self.device)
def answergen_bert(context, question):
    tokenizer = BertTokenizer.from_pretrained('csarron/bert-base-uncased-squad-v1')
    model = BertForQuestionAnswering.from_pretrained('csarron/bert-base-uncased-squad-v1')
    # tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad', return_token_type_ids=True)
    # model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

    encoding = tokenizer.encode_plus(question, context)
    input_ids = encoding["input_ids"]
    attention_mask = encoding["attention_mask"]
    # Segment ids distinguish the question from the context for BERT QA.
    token_type_ids = encoding["token_type_ids"]

    # return_dict=False keeps the legacy (start_scores, end_scores) tuple
    # output on recent transformers versions.
    start_scores, end_scores = model(torch.tensor([input_ids]),
                                     attention_mask=torch.tensor([attention_mask]),
                                     token_type_ids=torch.tensor([token_type_ids]),
                                     return_dict=False)
    # Take the most likely start/end positions over the full sequence so the
    # span indices align with input_ids.
    start_index = torch.argmax(start_scores)
    end_index = torch.argmax(end_scores)
    ans_tokens = input_ids[start_index:end_index + 1]
    answer_tokens = tokenizer.convert_ids_to_tokens(ans_tokens)
    print("\nQuestion ", question)
    # print("\nAnswer Tokens: ")
    # print(answer_tokens)
    answer_tokens_to_string = tokenizer.convert_tokens_to_string(answer_tokens)
    print("\nAnswer : ", answer_tokens_to_string)
    return answer_tokens_to_string
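# Quick usage example for answergen_bert; the context/question are made up.
def demo_answergen_bert():
    context = ("The Apollo program was carried out by NASA and landed "
               "the first humans on the Moon in 1969.")
    question = "Who carried out the Apollo program?"
    return answergen_bert(context, question)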
def get_answer_using_bert(question, reference_text):
    # Load fine-tuned model for QA
    bert_model = BertForQuestionAnswering.from_pretrained(
        'bert-large-uncased-whole-word-masking-finetuned-squad')
    # Load vocab as well
    bert_tokenizer = BertTokenizer.from_pretrained(
        'bert-large-uncased-whole-word-masking-finetuned-squad')

    # Apply bert_tokenizer on input text
    input_ids = bert_tokenizer.encode(question, reference_text)
    input_tokens = bert_tokenizer.convert_ids_to_tokens(input_ids)

    # Search index of first [SEP] token; everything before it is segment 0
    sep_location = input_ids.index(bert_tokenizer.sep_token_id)
    first_seg_len = sep_location + 1
    second_seg_len = len(input_ids) - (sep_location + 1)
    seg_embedding = [0] * first_seg_len + [1] * second_seg_len

    # Run our example through the model (return_dict=False keeps the legacy
    # tuple output on recent transformers versions)
    model_scores = bert_model(torch.tensor([input_ids]),
                              token_type_ids=torch.tensor([seg_embedding]),
                              return_dict=False)
    ans_start_loc = torch.argmax(model_scores[0])
    ans_end_loc = torch.argmax(model_scores[1])
    result = ' '.join(input_tokens[ans_start_loc:ans_end_loc + 1])

    # Stitch WordPiece sub-tokens back together and return the final result
    result = result.replace(' ##', '')
    return result
def __init__(self, qa_path, relations_filepath, data_directory, batch_size,
             must_choose_answer, device, trained_to_reject,
             calculate_single_error=True):
    self.trained_to_reject = trained_to_reject
    self.qa_path = qa_path  # path to QA weights
    self.relations_filepath = relations_filepath  # path to relations file
    self.data_directory = data_directory  # data directory path
    self.tokenizer = BertTokenizer.from_pretrained('bert-large-cased')  # tokenizer
    self.model = BertForQuestionAnswering.from_pretrained(qa_path)  # load the model
    self.model.to(device)
    self.device = device
    self.batch_size = batch_size
    # For datasets where there is always an answer, setting this to True ensures
    # that QA models which can return "answer doesn't exist" will still always
    # return a span from the context.
    self.must_choose_answer = must_choose_answer
    self.total_samples = 0
    if calculate_single_error:
        self.se_list = []
    else:
        self.se_list = None
def load_model(model_path):
    model = BertForQuestionAnswering.from_pretrained(model_path)
    model.to(device)
    model.eval()
    model.zero_grad()
    return model
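# Hypothetical usage of load_model above: the directory is a placeholder for a
# checkpoint saved with save_pretrained(), and `device` is assumed to be a
# module-level torch.device.
def demo_load_model():
    return load_model('output/')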
def __init__(self, model_configs):
    self.model_configs = model_configs
    self.pretrained_model = BertForQuestionAnswering.from_pretrained(
        self.model_configs['pretrained_model_name'],
        cache_dir=self.model_configs['cache_dir'],
        output_attentions=True)
    self.tokenizer = BertTokenizer.from_pretrained(
        self.model_configs['tokenizer_name'])
async def main(message: types.Message):
    import torch
    model = BertForQuestionAnswering.from_pretrained(
        'bert-large-uncased-whole-word-masking-finetuned-squad')
    tokenizer = BertTokenizer.from_pretrained(
        'bert-large-uncased-whole-word-masking-finetuned-squad')

    question = 'The purpose of the NewsQA dataset'
    paragraph = ('With massive volumes of written text being produced every '
                 'second, how do we make sure that we have the most recent and '
                 'relevant information available to us? Microsoft research '
                 'Montreal is tackling this problem by building AI systems that '
                 'can read and comprehend large volumes of complex text in '
                 'real-time. The purpose of the NewsQA dataset is to help the '
                 'research community build algorithms that are capable of '
                 'answering questions requiring human-level comprehension and '
                 'reasoning skills.')

    encoding = tokenizer.encode_plus(text=question,
                                     text_pair=paragraph,
                                     add_special_tokens=True)
    # token ids
    inputs = encoding['input_ids']
    # segment (token type) ids
    sentence_embed = encoding['token_type_ids']
    # input tokens
    tokens = tokenizer.convert_ids_to_tokens(inputs)

    start_scores, end_scores = model(input_ids=torch.tensor([inputs]),
                                     token_type_ids=torch.tensor([sentence_embed]),
                                     return_dict=False)
    start_index = torch.argmax(start_scores)
    end_index = torch.argmax(end_scores)
    answer = ' '.join(tokens[start_index:end_index + 1])
    await message.reply(text=answer)
def train(args):
    model = BertForQuestionAnswering.from_pretrained(PRETRAINED_MODEL_PATH)
    tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_PATH)
    model.resize_token_embeddings(len(tokenizer))
    datasets = CMRC2018(args=args, tokenizer=tokenizer)()
    training_args = TrainingArguments(
        output_dir=args.model_path,
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=args.batch_size,
        warmup_steps=args.warmup_steps,
        remove_unused_columns=False,
        logging_dir=args.log_path,
        num_train_epochs=args.n_epochs,
        dataloader_num_workers=args.num_workers,
        evaluation_strategy='epoch')
    print(f"Train dataset size: {len(datasets['train'])}, "
          f"Validation dataset size: {len(datasets['validation'])}")
    trainer = Trainer(model=model,
                      args=training_args,
                      train_dataset=datasets['train'],
                      eval_dataset=datasets['validation'])
    trainer.train()
    trainer.save_model()
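# Hypothetical invocation of the CMRC2018 train(args) above; the argument
# values are placeholders, and PRETRAINED_MODEL_PATH / CMRC2018 are assumed
# to be defined elsewhere in this repo.
def demo_train():
    import argparse
    args = argparse.Namespace(
        model_path='output/cmrc2018',  # where the Trainer writes checkpoints
        batch_size=8,
        warmup_steps=500,
        log_path='logs/',
        n_epochs=2,
        num_workers=2,
    )
    train(args)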
def load_model(self, model_path: str, do_lower_case=False):
    config = BertConfig.from_pretrained(model_path + "/bert_config.json")
    tokenizer = BertTokenizer.from_pretrained(model_path,
                                              do_lower_case=do_lower_case)
    model = BertForQuestionAnswering.from_pretrained(model_path,
                                                     from_tf=False,
                                                     config=config)
    return model, tokenizer
def nlpQuestion(question):
    print("nlptriggered")
    text = """
    Coronaviruses are a large family of viruses that can cause illness in animals or humans. In humans, several coronaviruses are known to cause respiratory infections ranging from the common cold to more severe diseases such as Middle East Respiratory Syndrome (MERS) and Severe Acute Respiratory Syndrome (SARS). COVID-19 is a virus of the same family with a first recorded outbreak in Wuhan, China, in December 2019.
    The most common symptoms of COVID-19 are fever, tiredness, and dry cough. Other symptoms include aches and pains, nasal congestion, runny nose, sore throat or diarrhea. These symptoms are usually mild and begin gradually. Some people become infected but don't develop any symptoms and don't feel unwell. Most people (about 80%) recover from the disease without needing special treatment. Around 1 out of every 6 people who gets COVID-19 becomes seriously ill and develops difficulty breathing. Older people, and those with underlying medical problems like high blood pressure, heart problems or diabetes, are more likely to develop serious illness. People with fever, cough and difficulty breathing should seek medical attention.
    People can catch COVID-19 from others who have the virus. The disease can spread from person to person through small droplets from the nose or mouth which are spread when a person with COVID-19 coughs or exhales. These droplets land on objects and surfaces around the person. Other people then catch COVID-19 by touching these objects or surfaces, then touching their eyes, nose or mouth. People can also catch COVID-19 if they breathe in droplets from a person with COVID-19 who coughs out or exhales droplets. This is why it is important to stay more than 1 meter (3 feet) away from a person who is sick. Studies to date suggest that the virus that causes COVID-19 is mainly transmitted through contact with respiratory droplets rather than through the air.
    There have been 105000 confirmed cases of coronavirus in the world, with 3100 deaths. There are only 32 confirmed cases in Lebanon. If you are experiencing symptoms, call MOPH on 1214 or 76592699.
    """
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertForQuestionAnswering.from_pretrained(
        'bert-large-uncased-whole-word-masking-finetuned-squad')
    print("nlptriggered")

    input_ids = tokenizer.encode(question, text)
    # Everything up to (and including) the first [SEP] (id 102) is segment 0,
    # the rest is segment 1.
    token_type_ids = [
        0 if i <= input_ids.index(102) else 1 for i in range(len(input_ids))
    ]
    # return_dict=False keeps the legacy tuple output on recent transformers versions
    start_scores, end_scores = model(torch.tensor([input_ids]),
                                     token_type_ids=torch.tensor([token_type_ids]),
                                     return_dict=False)
    print("nlptriggered")
    all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
    answer = ' '.join(
        all_tokens[torch.argmax(start_scores):torch.argmax(end_scores) + 1]
    ).replace(' ##', '')
    print(answer)
    return answer
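# Example call for nlpQuestion; the question is one the embedded passage can answer.
def demo_nlpQuestion():
    return nlpQuestion("What are the most common symptoms of COVID-19?")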
def configure_tokenizer_model_bert(args, logger, is_preprocess=False):
    logger.info("***** Loading tokenizer *****")
    tokenizer = BertTokenizer.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case)

    # logger.info("Loading configuration from {}".format(args.cache_dir))
    logger.info("***** Loading configuration from {} *****".format(args.init_dir))
    config = BertConfig.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        cache_dir=args.init_dir)
    config.vocab_size = len(tokenizer.vocab)

    logger.info("***** Loading pretrained model from {} *****".format(args.init_dir))
    if is_preprocess:
        model = AutoModel.from_pretrained(args.model_name_or_path,
                                          config=config,
                                          cache_dir=args.init_dir)
    else:
        model = BertForQuestionAnswering.from_pretrained(args.init_dir,
                                                         config=config,
                                                         cache_dir=args.init_dir)
    return tokenizer, model
def create_graphics(self, url_base, model_card_path):
    pruned_heads = self.checkpoint_info["config"].get("pruned_heads")
    ret = {}
    if pruned_heads is not None:
        pruning_info_plotter = PruningInfoBokehPlotter("pruning_info", self.JS_PATH)
        fig, js, html = pruning_info_plotter.run(layer_count=12,
                                                 pruned_heads=pruned_heads,
                                                 heads_count=12)
        ret["pruning_info"] = dict(js=js, html=html)

    density_plotter = DensityBokehPlotter("density", self.JS_PATH)
    model = BertForQuestionAnswering.from_pretrained(self.git_path)
    fig, js, html = density_plotter.run(model=model,
                                        dest_path=model_card_path / "images",
                                        url_base=url_base + "/images")
    ret["density_info"] = dict(js=js, html=html)

    from bokeh.io import export_png
    export_png(fig, filename="/tmp/plot.png")
    return ret
def load_model(self):
    config = BertConfig.from_pretrained(self.c_path)
    self.model = BertForQuestionAnswering.from_pretrained('bert-base-uncased',
                                                          config=config)
    self.model.to(self.device)
    self.model.eval()
    return self.model
def answer_question(question, answer_text, model_name=None, tokenizer_name=None):
    """
    Takes a `question` string and an `answer_text` string (which contains the
    answer), and identifies the words within the `answer_text` that are the
    answer.

    Parameters
    ----------
    question : str
    answer_text : str
    model_name : str, optional
    tokenizer_name : str, optional

    Returns
    -------
    answer : str
    """
    # ======== Model & Tokenizer (default: bert-large fine-tuned on SQuAD v1) ========
    if model_name is None:
        model_name = 'bert-large-uncased-whole-word-masking-finetuned-squad'
    if tokenizer_name is None:
        tokenizer_name = 'bert-large-uncased-whole-word-masking-finetuned-squad'
    model = BertForQuestionAnswering.from_pretrained(model_name)
    tokenizer = BertTokenizer.from_pretrained(tokenizer_name)

    # ======== Tokenize ========
    input_ids = tokenizer.encode(question, answer_text)
    # Report how long the input sequence is.
    # print(f"Query has {len(input_ids):,} tokens.\n")

    # ======== Set Segment IDs ========
    sep_index = input_ids.index(tokenizer.sep_token_id)
    num_seg_a = sep_index + 1
    num_seg_b = len(input_ids) - num_seg_a
    segment_ids = [0] * num_seg_a + [1] * num_seg_b
    # There should be a segment_id for every input token.
    assert len(segment_ids) == len(input_ids)

    # ======== Evaluate ========
    # return_dict=False keeps the legacy tuple output on recent transformers versions
    start_scores, end_scores = model(torch.tensor([input_ids]),
                                     token_type_ids=torch.tensor([segment_ids]),
                                     return_dict=False)

    # ======== Reconstruct Answer ========
    answer_start = torch.argmax(start_scores)
    answer_end = torch.argmax(end_scores)
    tokens = tokenizer.convert_ids_to_tokens(input_ids)
    answer = tokens[answer_start]
    for i in range(answer_start + 1, answer_end + 1):
        # Re-attach WordPiece continuation tokens without a space.
        if tokens[i][0:2] == '##':
            answer += tokens[i][2:]
        else:
            answer += ' ' + tokens[i]
    return answer
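# Example use of answer_question with its default SQuAD-fine-tuned checkpoint;
# the question/passage strings are made up.
def demo_answer_question():
    question = "What does BERT stand for?"
    passage = ("BERT stands for Bidirectional Encoder Representations "
               "from Transformers.")
    return answer_question(question, passage)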
def __init__(self, model_dir, cache_dir):
    # Assumes the config file and pretrained weights exist in model_dir
    self.config = BertConfig.from_pretrained(model_dir, cache_dir=cache_dir)
    self.tokenizer = BertTokenizer.from_pretrained(model_dir, cache_dir=cache_dir)
    self.model = BertForQuestionAnswering.from_pretrained(model_dir,
                                                          cache_dir=cache_dir)
def __init__(self, model: str = None, lowercase=True, tokenizer=BertTokenizer):
    self.lowercase = lowercase
    self.tokenizer = tokenizer.from_pretrained(model)
    self.model = BertForQuestionAnswering.from_pretrained(model)
def __init__(
        self,
        pre_trained_name='bert-large-uncased-whole-word-masking-finetuned-squad'):
    self.pre_trained_name = pre_trained_name
    self.model = BertForQuestionAnswering.from_pretrained(self.pre_trained_name)
    self.tokenizer = BertTokenizer.from_pretrained(self.pre_trained_name)
def __init__(self):
    # BERT fine-tuned on SQuAD
    self.bert_tokenizer = BertTokenizer.from_pretrained(
        'bert-large-uncased-whole-word-masking-finetuned-squad')
    self.squad_finetuned_model = BertForQuestionAnswering.from_pretrained(
        'bert-large-uncased-whole-word-masking-finetuned-squad')
    self.squad_finetuned_model = self.squad_finetuned_model.eval()
    self.squad_finetuned_model = self.squad_finetuned_model.to(device)
def init_bert():
    global bert_model
    global bert_tokenizer
    from transformers import BertForQuestionAnswering
    from transformers import BertTokenizer
    bert_model = BertForQuestionAnswering.from_pretrained(
        'bert-large-uncased-whole-word-masking-finetuned-squad')
    bert_tokenizer = BertTokenizer.from_pretrained(
        'bert-large-uncased-whole-word-masking-finetuned-squad')
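# Example: initialize the lazy globals above, then run one QA pass with them.
# The question/context strings are made up; return_dict=False requests the
# legacy (start_scores, end_scores) tuple on recent transformers versions.
def demo_init_bert():
    import torch
    init_bert()
    question = "What is the capital of France?"
    context = "Paris is the capital of France."
    inputs = bert_tokenizer(question, context, return_tensors="pt")
    start_scores, end_scores = bert_model(**inputs, return_dict=False)
    start = torch.argmax(start_scores)
    end = torch.argmax(end_scores)
    return bert_tokenizer.decode(inputs["input_ids"][0][start:end + 1])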
def main():
    parser = get_parser()
    args = parser.parse_args()

    if (os.path.exists(args.output_dir) and os.listdir(args.output_dir)
            and not args.overwrite_output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Set device
    args.device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)
    logging.getLogger("transformers.generation_utils").setLevel(logging.ERROR)

    # Load pretrained question generation model and tokenizer
    GPT2_tokenizer = GPT2Tokenizer.from_pretrained(
        args.question_generation_model, do_lower_case=args.do_lower_case)
    GPT2_model = GPT2LMHeadModel.from_pretrained(args.question_generation_model)
    GPT2_model.prepare_inputs_for_generation = prepare_inputs_for_generation
    GPT2_model.eval()
    GPT2_model.to(args.device)

    # Load pretrained question answering model and tokenizer
    BERT_tokenizer = BertTokenizer.from_pretrained(
        args.answering_model, do_lower_case=args.do_lower_case)
    BERT_model = BertForQuestionAnswering.from_pretrained(args.answering_model)
    BERT_model.eval()
    BERT_model.to(args.device)

    logging.info("Parameters %s", args)

    # Before we do anything with the models, we want to ensure that we get fp16
    # execution of torch.einsum if args.fp16 is set. Otherwise it will default
    # to "promote" mode, and we'll get fp32 operations. Note that running with
    # `--fp16_opt_level="O2"` removes the need for this code, but it is still valid.
    if args.fp16:
        try:
            from apex import amp
            amp.register_half_function(torch, "einsum")
            GPT2_model = amp.initialize(GPT2_model, opt_level=args.fp16_opt_level)
            BERT_model = amp.initialize(BERT_model, opt_level=args.fp16_opt_level)
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex "
                "to use fp16 training.")

    generate(args, GPT2_tokenizer, GPT2_model, BERT_tokenizer, BERT_model)
def model_pick(model_id):
    if model_id == 0:
        tokenizer = BertTokenizer.from_pretrained(
            'bert-large-uncased-whole-word-masking-finetuned-squad')
        model = BertForQuestionAnswering.from_pretrained(
            'bert-large-uncased-whole-word-masking-finetuned-squad')
    elif model_id == 1:
        tokenizer = AutoTokenizer.from_pretrained(
            'distilbert-base-uncased-distilled-squad')
        model = AutoModelForQuestionAnswering.from_pretrained(
            'distilbert-base-uncased-distilled-squad')
    return tokenizer, model
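# Usage sketch for model_pick; the two ids map to the checkpoints loaded above.
def demo_model_pick():
    tokenizer, model = model_pick(0)    # 0: BERT-large whole-word-masking SQuAD
    # tokenizer, model = model_pick(1)  # 1: DistilBERT distilled on SQuAD
    return tokenizer, model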
def __init__(
        self,
        pretrained='bert-large-uncased-whole-word-masking-finetuned-squad'):
    self.torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # Bert4QA is assumed to be an alias for BertForQuestionAnswering imported elsewhere.
    self.QA_MODEL = Bert4QA.from_pretrained(pretrained)
    self.QA_MODEL.to(self.torch_device)
    self.QA_MODEL.eval()
    self.QA_TOKENIZER = BertTokenizer.from_pretrained(pretrained)
def __init__(self, args):
    self.args = args
    self.model = BertForQuestionAnswering.from_pretrained(
        self.args.model_path).to(self.args.device)
    self.tokenizer = BertTokenizer.from_pretrained('hfl/chinese-roberta-wwm-ext')
    self.dataset = CMRC2018(args=args, tokenizer=self.tokenizer)()
    self.validation_dataloader = DataLoader(self.dataset['validation'],
                                            batch_size=self.args.batch_size,
                                            collate_fn=custom_collate,
                                            num_workers=self.args.num_workers)
def load_model(self):
    # Load a pretrained model that has been fine-tuned
    config = BertConfig.from_pretrained(self.model_type,
                                        output_hidden_states=True,
                                        cache_dir=self.cache_dir)
    pretrained_weights = torch.load(self.model_path,
                                    map_location=torch.device(self.device))
    model = BertForQuestionAnswering.from_pretrained(self.model_type,
                                                     state_dict=pretrained_weights,
                                                     config=config,
                                                     cache_dir=self.cache_dir)
    return model