def __init__(self):
    ''' PRE-LOAD NECESSARY DATA '''
    self.__sentence_model = SentenceTransformer(
        os.path.join(
            'models',
            'sbert.net_models_distilbert-base-nli-stsb-mean-tokens'))
    self.__tokenizer = AlbertTokenizer.from_pretrained(
        os.path.join('models', 'albert_t'))
    self.__model = AlbertForQuestionAnswering.from_pretrained(
        os.path.join('models', 'albert_m'))

    # Read URL and title lists; the context manager closes each file,
    # so no explicit file.close() is needed.
    with open(os.path.join('data', 'urls.txt'), 'r') as file:
        self.urls = file.read().splitlines()
    with open(os.path.join('data', 'titles.txt'), 'r') as file:
        self.titles = file.read().splitlines()

    # Load pickle files into variables
    names = [
        os.path.join('data', 'punctuated.pkl'),
        os.path.join('data', 'punctuated_embed.pkl'),
        os.path.join('data', 'subs.pkl')
    ]
    self.__punctuateds, self.__sentence_embeddings_p, self.__subs = tuple(
        map(loadPickle, names))
    ''' END OF PRE-LOAD NECESSARY DATA '''
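
# The snippet above relies on a `loadPickle` helper that is not shown.
# A minimal sketch of what it presumably does, assuming each .pkl file
# holds a single pickled object:
import pickle


def loadPickle(path):
    # Read one pickled object from `path`.
    with open(path, 'rb') as f:
        return pickle.load(f)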
def build(args):
    TAG = create_tags()
    XLSX_PATH = {
        'train': 'release/train/ca_data',
        'dev': 'release/dev/ca_data',
        'test': 'release/test/ca_data'
    }
    PRETRAINED_MODEL_NAME = 'ALINEAR/albert-japanese-v2'
    tokenizer = AlbertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

    train_data = TrainData(XLSX_PATH['train'], TAG,
                           only_positive=args.only_positive)
    trainset = QADataset(train_data.examples, "train", tokenizer=tokenizer)
    trainloader = DataLoader(trainset, batch_size=args.batch_size,
                             collate_fn=collate_fn)

    dev_data = TrainData(XLSX_PATH['dev'], TAG,
                         only_positive=args.only_positive)
    devset = QADataset(dev_data.examples, "train", tokenizer=tokenizer)
    devloader = DataLoader(devset, batch_size=args.batch_size,
                           collate_fn=collate_fn)

    logger.info(f"[train data] {train_data.summary()}")
    logger.info(f"[dev data] {dev_data.summary()}")

    # NOTE: the test loader is built from the dev split here; the 'test'
    # path in XLSX_PATH is never used.
    test_data = TestData(XLSX_PATH['dev'], TAG)
    testset = QADataset(test_data.examples, "test", tokenizer=tokenizer)
    testloader = DataLoader(testset, batch_size=args.batch_size,
                            collate_fn=collate_fn)

    model = AlbertForQuestionAnswering.from_pretrained(PRETRAINED_MODEL_NAME)
    model = model.to(args.device)
    if args.load_pretrained_model:
        model.load_state_dict(torch.load(args.pretrained_model_path))
    return model, trainloader, devloader, testloader, tokenizer
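
# For reference, `build` only touches a handful of attributes on `args`.
# A minimal sketch of driving it with a plain namespace; the values are
# assumptions, and create_tags/TrainData/QADataset come from the
# surrounding project.
from argparse import Namespace

args = Namespace(batch_size=8,            # assumed value
                 only_positive=False,
                 device='cuda',
                 load_pretrained_model=False,
                 pretrained_model_path=None)
model, trainloader, devloader, testloader, tokenizer = build(args)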
def __init__(self, name, path: str, gpu=False):
    self.tokenizer = AlbertTokenizer.from_pretrained(path)
    pretrained_albert_model = AlbertForQuestionAnswering.from_pretrained(
        path)
    super().__init__(name, pretrained_albert_model, gpu)
    if self.gpu:
        self.predictor.cuda()
def load_and_predict(data_dir, model_type, pretrain_model):
    if model_type == 'bert_japanese':
        model = BertForQuestionAnswering.from_pretrained(
            'cl-tohoku/bert-base-japanese')
        tokenizer = BertJapaneseTokenizer.from_pretrained(
            'cl-tohoku/bert-base-japanese')
    elif model_type == 'bert_multilingual':
        model = BertForQuestionAnswering.from_pretrained(
            'bert-base-multilingual-cased')
        tokenizer = BertTokenizer.from_pretrained(
            'bert-base-multilingual-cased', tokenize_chinese_chars=False)
    elif model_type == 'albert':
        model = AlbertForQuestionAnswering.from_pretrained(
            'ALINEAR/albert-japanese-v2')
        tokenizer = AlbertTokenizer.from_pretrained(
            'ALINEAR/albert-japanese-v2')
    else:
        raise ValueError('unknown model_type: {}'.format(model_type))

    test_data = TestData(data_dir, TAG)
    testset = QADataset(test_data.examples, "test", tokenizer=tokenizer)
    testloader = DataLoader(testset, batch_size=4, collate_fn=collate_fn)
    model = model.to(device)
    model.load_state_dict(torch.load(pretrain_model))
    prediction = predict(model, testloader, device, tokenizer)
    prediction = func(data_dir, prediction)
    print('finish loading and predicting from {}!'.format(pretrain_model))
    return prediction  # prediction dictionary
def load_model(self, model_path: str, do_lower_case=True):
    config = AlbertConfig.from_pretrained(model_path + "/config.json")
    tokenizer = AlbertTokenizer.from_pretrained(model_path)
    # tokenizer = AlbertTokenizer.from_pretrained('albert-large-v2',
    #                                             do_lower_case=do_lower_case)
    model = AlbertForQuestionAnswering.from_pretrained(model_path,
                                                       from_tf=False,
                                                       config=config)
    return model, tokenizer
def load_model(pretrained_model):
    # Other models to try: albert-large-v2, albert-xlarge-v2
    # https://huggingface.co/transformers/pretrained_models.html
    tokenizer = AlbertTokenizer.from_pretrained(pretrained_model,
                                                do_lower_case=True)
    model = AlbertForQuestionAnswering.from_pretrained(
        pretrained_model, cache_dir="/usr/cache")
    return model, tokenizer
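
# A quick smoke test of the (model, tokenizer) pair returned above,
# assuming a recent transformers release where the forward pass returns a
# QuestionAnsweringModelOutput (older releases return a plain tuple):
import torch

model, tokenizer = load_model('albert-base-v2')
question = "What does ALBERT stand for?"
context = "ALBERT stands for A Lite BERT."
inputs = tokenizer(question, context, return_tensors='pt')
with torch.no_grad():
    outputs = model(**inputs)
start = outputs.start_logits.argmax()
end = outputs.end_logits.argmax()
print(tokenizer.decode(inputs['input_ids'][0][start:end + 1]))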
def load_models():
    # Download model and tokenizer files from the S3 bucket if they are
    # not already cached locally.
    s3_bucket = boto3.resource('s3').Bucket('albert-model-files')
    os.makedirs('model_data', exist_ok=True)
    os.makedirs('tokenizer_albert', exist_ok=True)
    for obj in s3_bucket.objects.all():
        if obj.key in ["config.json", "vocab.txt", "pytorch_model.bin"]:
            if not os.path.exists('model_data/{}'.format(obj.key)):
                s3_bucket.download_file(obj.key,
                                        'model_data/{}'.format(obj.key))
        elif obj.key in [
                "special_tokens_map.json", "spiece.model",
                "tokenizer_config.json"
        ]:
            if not os.path.exists('tokenizer_albert/{}'.format(obj.key)):
                s3_bucket.download_file(
                    obj.key, 'tokenizer_albert/{}'.format(obj.key))

    # Load the pretrained model and tokenizer from the local copies
    tokenizer = AlbertTokenizer.from_pretrained('./tokenizer_albert')
    model = AlbertForQuestionAnswering.from_pretrained('./model_data')
    return model, tokenizer
def main(args):
    if args.large:
        args.train_record_file += '_large'
        args.dev_eval_file += '_large'
        model_name = "albert-xlarge-v2"
    else:
        model_name = "albert-base-v2"
    if args.xxlarge:
        args.train_record_file += '_xxlarge'
        args.dev_eval_file += '_xxlarge'
        model_name = "albert-xxlarge-v2"

    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info(f'Using random seed {args.seed}...')
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get model
    log.info('Building model...')
    if args.bidaf:
        char_vectors = util.torch_from_json(args.char_emb_file)
    if args.model_name == 'albert_highway':
        model = models.albert_highway(model_name)
    elif args.model_name == 'albert_lstm_highway':
        model = models.LSTM_highway(model_name, hidden_size=args.hidden_size)
    elif args.model_name == 'albert_bidaf':
        model = models.BiDAF(char_vectors=char_vectors,
                             hidden_size=args.hidden_size,
                             drop_prob=args.drop_prob)
    elif args.model_name == 'albert_bidaf2':
        model = models.BiDAF2(model_name=model_name,
                              char_vectors=char_vectors,
                              hidden_size=args.hidden_size,
                              drop_prob=args.drop_prob)
    else:
        model = AlbertForQuestionAnswering.from_pretrained(args.model_name)
    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info(f'Loading checkpoint from {args.load_path}...')
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    optimizer = AdamW(model.parameters(), lr=args.lr, weight_decay=args.l2_wd)
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # constant LR
    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2,
                          args.bidaf)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers)
    dev_dataset = SQuAD(args.dev_eval_file, args.use_squad_v2, args.bidaf)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers)
    with open(args.dev_gold_file) as f:
        gold_dict = json.load(f)
    tokenizer = AlbertTokenizer.from_pretrained(model_name)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    while epoch != args.num_epochs:
        epoch += 1
        log.info(f'Starting epoch {epoch}...')
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for batch in train_loader:
                batch = tuple(t.to(device) for t in batch)
                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "token_type_ids": batch[2],
                    'start_positions': batch[3],
                    'end_positions': batch[4],
                }
                if args.bidaf:
                    inputs['char_ids'] = batch[6]
                y1 = batch[3]
                y2 = batch[4]

                # Setup for forward
                batch_size = inputs["input_ids"].size(0)
                optimizer.zero_grad()

                # Forward
                # log_p1, log_p2 = model(**inputs)
                y1, y2 = y1.to(device), y2.to(device)
                outputs = model(**inputs)
                loss = outputs[0]
                loss = loss.mean()
                # loss_fct = nn.CrossEntropyLoss()
                # loss = loss_fct(log_p1, y1) + loss_fct(log_p2, y2)
                # loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(),
                                         args.max_grad_norm)
                optimizer.step()
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'],
                               step)
                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info(f'Evaluating at step {step}...')
                    ema.assign(model)
                    results, pred_dict = evaluate(args, model, dev_dataset,
                                                  dev_loader, gold_dict,
                                                  tokenizer, device,
                                                  args.max_ans_len,
                                                  args.use_squad_v2)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join(f'{k}: {v:05.2f}'
                                            for k, v in results.items())
                    log.info(f'Dev {results_str}')

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar(f'dev/{k}', v, step)
else:
    write_in_result_file("Bert")
write_in_result_file("k = " + str(k))
if use_ir_score:
    write_in_result_file('Using IR score with mu = ' + str(mu_bench))
else:
    write_in_result_file('Not using IR score')
if use_albert:
    if not use_dil:
        tokenizer = AlbertTokenizer.from_pretrained(args.albert_path,
                                                    do_lower_case=True)
        model = AlbertForQuestionAnswering.from_pretrained(args.albert_path)
    else:
        tokenizer = AlbertTokenizer.from_pretrained(args.dilalbert_path,
                                                    do_lower_case=True)
        model = DilAlbert.from_pretrained(args.dilalbert_path)
else:
    if not use_dil:
        tokenizer = BertTokenizer.from_pretrained(args.bert_path,
                                                  do_lower_case=True)
        model = BertForQuestionAnswering.from_pretrained(args.bert_path)
    else:
        tokenizer = BertTokenizer.from_pretrained(args.dilbert_path,
                                                  do_lower_case=True)
        model = DilBert.from_pretrained(args.dilbert_path)
# The checkpoint albert-base-v2 is not fine-tuned for question answering.
# Please see the examples/run_squad.py example to see how to fine-tune a
# model on a question answering task.
from transformers import AlbertTokenizer, AlbertForQuestionAnswering, BertTokenizer, AlbertConfig
import torch

# tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
tokenizer = BertTokenizer.from_pretrained('tkitfiles/qa/model/')
# config = AlbertConfig.from_pretrained('tkitfiles/qa/model/config.json')
model = AlbertForQuestionAnswering.from_pretrained('tkitfiles/qa/model/')

data = tkitFile.Json("../tdata/SQuAD/dev.json")
i = 0
all = 0
f = 0
for item in data.auto_load():
    for one in item['data']:
        all = all + 1
        question, text = one['paragraphs'][0]['qas'][0]['question'], one[
            'paragraphs'][0]['context']
        # Example pair:
        # question, text = "利比里亚共和国", "利比里亚共和国(英语:') 通称赖比瑞亚,是位于西非,北接几内亚,西北界塞拉利昂,东邻象牙海岸,西南濒大西洋的总统制共和国家"
        input_dict = tokenizer.encode_plus(question, text,
                                           return_tensors='pt')
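
        # The loop above is cut off after building `input_dict`. A plausible
        # continuation, assuming `all`/`i`/`f` count total, matched, and
        # missed examples and that this transformers version returns a
        # (start_logits, end_logits) tuple:
        with torch.no_grad():
            start_scores, end_scores = model(**input_dict)
        start = torch.argmax(start_scores)
        end = torch.argmax(end_scores)
        tokens = tokenizer.convert_ids_to_tokens(
            input_dict['input_ids'][0].tolist())
        pred = ''.join(tokens[start:end + 1]).replace('[SEP]', '').strip()
        gold = one['paragraphs'][0]['qas'][0]['answers'][0]['text']
        if pred == gold:
            i += 1
        else:
            f += 1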
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--train_file", default=None, type=str, required=True,
                        help="SQuAD json for training. E.g., train-v1.1.json")
    parser.add_argument("--predict_file", default=None, type=str, required=True,
                        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
    parser.add_argument("--model_type", default=None, type=str, required=True,
                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model checkpoints and predictions will be written.")

    ## Other parameters
    parser.add_argument("--config_name", default="", type=str,
                        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument("--tokenizer_name", default="", type=str,
                        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument('--version_2_with_negative', action='store_true',
                        help='If true, the SQuAD examples contain some that do not have an answer.')
    parser.add_argument('--null_score_diff_threshold', type=float, default=0.0,
                        help="If null_score - best_non_null is greater than the threshold predict null.")
    parser.add_argument("--max_seq_length", default=384, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
                             "longer than this will be truncated, and sequences shorter than this will be padded.")
    parser.add_argument("--doc_stride", default=128, type=int,
                        help="When splitting up a long document into chunks, how much stride to take between chunks.")
    parser.add_argument("--max_query_length", default=64, type=int,
                        help="The maximum number of tokens for the question. Questions longer than this will "
                             "be truncated to this length.")
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--evaluate_during_training", action='store_true',
                        help="Run evaluation during training at each logging step.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs", default=3, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="If > 0: set total number of training steps to perform. Overrides num_train_epochs.")
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--n_best_size", default=20, type=int,
                        help="The total number of n-best predictions to generate in the nbest_predictions.json output file.")
    parser.add_argument("--max_answer_length", default=30, type=int,
                        help="The maximum length of an answer that can be generated. This is needed because the start "
                             "and end predictions are not conditioned on one another.")
    parser.add_argument("--verbose_logging", action='store_true',
                        help="If true, all of the warnings related to data processing will be printed. "
                             "A number of warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument('--logging_steps', type=int, default=50,
                        help="Log every X update steps.")
    parser.add_argument('--save_steps', type=int, default=500,
                        help="Save checkpoint every X update steps.")
    parser.add_argument("--eval_all_checkpoints", action='store_true',
                        help="Evaluate all checkpoints starting with the same prefix as model_name and ending with a step number")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--overwrite_output_dir', action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument('--overwrite_cache', action='store_true',
                        help="Overwrite the cached training and evaluation sets")
    parser.add_argument('--seed', type=int, default=42,
                        help="Random seed for initialization")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
    parser.add_argument('--fp16_opt_level', type=str, default='O1',
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. "
                             "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument('--server_ip', type=str, default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) \
            and args.do_train and not args.overwrite_output_dir:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome.".format(args.output_dir))
    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see
        # https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:
        # Initializes the distributed backend which will take care of
        # synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1),
        args.fp16)

    # Set seed
    set_seed(args)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will
        # download model & vocab
        torch.distributed.barrier()

    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    # The generic loading path is commented out in favour of the hard-coded
    # ALBERT setup below:
    # config = config_class.from_pretrained(
    #     args.config_name if args.config_name else args.model_name_or_path,
    #     cache_dir=args.cache_dir if args.cache_dir else None)
    # tokenizer = tokenizer_class.from_pretrained(
    #     args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
    #     do_lower_case=args.do_lower_case,
    #     cache_dir=args.cache_dir if args.cache_dir else None)
    # model = model_class.from_pretrained(
    #     args.model_name_or_path,
    #     from_tf=bool('.ckpt' in args.model_name_or_path),
    #     config=config,
    #     cache_dir=args.cache_dir if args.cache_dir else None)
    config = AlbertConfig.from_pretrained(args.model_name_or_path +
                                          "/config.json")
    tokenizer = AlbertTokenizer.from_pretrained(
        'albert-large-v2', do_lower_case=args.do_lower_case)
    model = AlbertForQuestionAnswering.from_pretrained(args.model_name_or_path,
                                                       from_tf=False,
                                                       config=config)

    if args.local_rank == 0:
        # Make sure only the first process in distributed training will
        # download model & vocab
        torch.distributed.barrier()

    model.to(args.device)
    logger.info("Training/evaluation parameters %s", args)

    # Before we do anything with models, we want to ensure that we get fp16
    # execution of torch.einsum if args.fp16 is set. Otherwise it'll default
    # to "promote" mode, and we'll get fp32 operations. Note that running
    # `--fp16_opt_level="O2"` will remove the need for this code, but it is
    # still valid.
    if args.fp16:
        try:
            import apex
            apex.amp.register_half_function(torch, 'einsum')
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex "
                "to use fp16 training.")
    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args, tokenizer,
                                                evaluate=False,
                                                output_examples=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

    # Save the trained model and the tokenizer
    if args.do_train and (args.local_rank == -1
                          or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using
        # `save_pretrained()`. They can then be reloaded using
        # `from_pretrained()`.
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the
        # trained model
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))

        # Load a trained model and vocabulary that you have fine-tuned
        model = model_class.from_pretrained(args.output_dir,
                                            force_download=True)
        tokenizer = tokenizer_class.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
        model.to(args.device)

    # Evaluation - we can ask to evaluate all the checkpoints
    # (sub-directories) in a directory
    results = {}
    """
    if args.do_eval and args.local_rank in [-1, 0]:
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(
                    glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME,
                              recursive=True)))
            logging.getLogger("transformers.modeling_utils").setLevel(
                logging.WARN)  # Reduce model loading logs
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            # Reload the model
            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
            model = model_class.from_pretrained(checkpoint, force_download=True)
            model.to(args.device)
            # Evaluate
            result = evaluate(args, model, tokenizer, prefix=global_step)
            result = dict((k + ('_{}'.format(global_step) if global_step else ''), v)
                          for k, v in result.items())
            results.update(result)
    """
    results = evaluate(args, model, tokenizer)
    print(results)
    logger.info("Results: {}".format(results))
    return results
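
# The script above unpacks MODEL_CLASSES[args.model_type] into
# (config_class, model_class, tokenizer_class). A minimal sketch of the
# registry it presumably expects; the exact contents are an assumption:
from transformers import (AlbertConfig, AlbertForQuestionAnswering,
                          AlbertTokenizer)

MODEL_CLASSES = {
    'albert': (AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer),
}
ALL_MODELS = ['albert-base-v2', 'albert-large-v2']  # illustrative only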
def do_prediction(model_dir, model_name, questions_dir):
    # 1. Load a trained model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = AlbertForQuestionAnswering.from_pretrained(model_dir)
    model.to(device)
    model.eval()

    # 2. Load and pre-process the test set
    dev_file = questions_dir  # e.g. "data/sfu.json"
    predict_batch_size = 2
    max_seq_length = 384
    eval_examples = read_squad_examples(input_file=dev_file,
                                        is_training=False,
                                        version_2_with_negative=False)
    tokenizer = AlbertTokenizer.from_pretrained(model_dir)
    eval_features = convert_examples_to_features(examples=eval_examples,
                                                 tokenizer=tokenizer,
                                                 max_seq_length=max_seq_length,
                                                 doc_stride=128,
                                                 max_query_length=64,
                                                 is_training=False)
    all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                   dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                              all_example_index)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=predict_batch_size)

    # 3. Run inference on the test set
    all_results = []
    for input_ids, input_mask, segment_ids, example_indices in tqdm(
            eval_dataloader):
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        with torch.no_grad():
            # Positional call returning (start_logits, end_logits); this
            # matches the older pytorch-transformers style API.
            batch_start_logits, batch_end_logits = model(
                input_ids, input_mask, segment_ids)
        for i, example_index in enumerate(example_indices):
            start_logits = batch_start_logits[i].detach().cpu().tolist()
            end_logits = batch_end_logits[i].detach().cpu().tolist()
            eval_feature = eval_features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            all_results.append(
                RawResult(unique_id=unique_id,
                          start_logits=start_logits,
                          end_logits=end_logits))

    output_prediction_file = os.path.join(
        model_dir, model_name + "_albert_predictions_sfu.json")
    output_nbest_file = os.path.join(
        model_dir, model_name + "_albert_nbest_predictions_sfu.json")
    output_null_log_odds_file = os.path.join(
        model_dir, model_name + "_null_odds_sfu.json")
    preds = write_predictions(eval_examples, eval_features, all_results, 20,
                              30, True, output_prediction_file,
                              output_nbest_file, output_null_log_odds_file,
                              True, False, 0.0)
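
# Note: the positional model(input_ids, input_mask, segment_ids) call above
# only returns a logits tuple on older library versions. Under
# transformers v4+ the equivalent step would look roughly like this
# (a sketch, not the author's code):
with torch.no_grad():
    outputs = model(input_ids=input_ids,
                    attention_mask=input_mask,
                    token_type_ids=segment_ids)
batch_start_logits = outputs.start_logits
batch_end_logits = outputs.end_logits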
"input_ids": torch.cat((input_ids,inputs_par[i]["input_ids"]),1), "attention_mask": torch.cat((attention_mask,inputs_par[i]["attention_mask"]),1), "token_type_ids": torch.cat((token_type_ids,inputs_par[i]["token_type_ids"]),1), } start = time.time() outputs = model.process_B(preprocessed_question, preprocessed_paragraph,**inputs) total_time_questions_paragraphs_pairs = total_time_questions_paragraphs_pairs + time.time() - start print("NI Q DilAlbert :", total_time_questions) print("I Q-P DilAlbert :", total_time_questions_paragraphs_pairs) total_dilalbert = total_time_passages+total_time_questions+total_time_questions_paragraphs_pairs print("Total DilAlbert : ", total_dilalbert) tokenizer = AlbertTokenizer.from_pretrained(ALBERT_PATH, do_lower_case=True) model = AlbertForQuestionAnswering.from_pretrained(ALBERT_PATH) model.to(torch.device(device)) total_time_questions_paragraphs_pairs_albert = 0 eval_dataloader = DataLoader(dataset, batch_size=1) for question in squad1_for_orqa["questions"][:n_questions]: input_ids = torch.tensor([tokenizer.encode(question)], device=device) attention_mask = torch.tensor([[1]*input_ids.shape[1]], device=device) token_type_ids = torch.tensor([[0]*input_ids.shape[1]], device=device) for batch in eval_dataloader: batch = tuple(t.to(device) for t in batch) with torch.no_grad(): inputs = { "input_ids": torch.cat((input_ids,batch[0][:,3:]),1), "attention_mask": torch.cat((attention_mask,batch[1][:,3:]),1), "token_type_ids": torch.cat((token_type_ids,batch[2][:,3:]),1),
class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx])
                for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)


train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)

from transformers import AlbertForQuestionAnswering

model = AlbertForQuestionAnswering.from_pretrained("albert-base-v2")

from torch.utils.data import DataLoader
from transformers import AdamW

device = torch.device('cuda') if torch.cuda.is_available() \
    else torch.device('cpu')
model.to(device)
model.train()

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
optim = AdamW(model.parameters(), lr=5e-5)

for epoch in range(3):
    for batch in train_loader:
        # Standard fine-tuning step (assumed continuation; the original
        # snippet is cut off at the start of this loop body).
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        outputs = model(input_ids,
                        attention_mask=attention_mask,
                        start_positions=start_positions,
                        end_positions=end_positions)
        loss = outputs[0]
        loss.backward()
        optim.step()

model.eval()
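
# After training, the fine-tuned weights would normally be persisted.
# A minimal sketch using the standard save_pretrained/from_pretrained
# round trip; the output path is an assumption:
save_dir = './albert-squad-finetuned'   # assumed path
model.save_pretrained(save_dir)
model = AlbertForQuestionAnswering.from_pretrained(save_dir)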
def document_retriever(question):
    query_bow = dictionary.doc2bow(jieba.cut(question, cut_all=False))
    tfidfvect = tfidf[query_bow]
    simstfidf = indexTfidf[tfidfvect]
    # Return the single best-matching context
    return [context[i] for i in (-simstfidf).argsort()[0:1]]


model_path = 'voidful/albert_chinese_base'
tokenizer_kwards = {'do_lower_case': False, 'max_len': 512}
tokenizer = BertTokenizer.from_pretrained(model_path, **tokenizer_kwards)

from transformers import AlbertForQuestionAnswering, AutoConfig

model_path = 'voidful/albert_chinese_base'
bert_config = AutoConfig.from_pretrained(model_path)
model = AlbertForQuestionAnswering.from_pretrained(
    r"checkpoint_score_f1-86.233_em-66.853.pth",
    config=bert_config).to('cuda')

import torch
from torch.utils.data import TensorDataset, DataLoader

SPIECE_UNDERLINE = '▁'


def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
                         orig_answer_text):
    """Returns tokenized answer spans that better match the annotated answer."""
    # The SQuAD annotations are character based. We first project them to
    # whitespace-tokenized words. But then after WordPiece tokenization, we
    # can often find a "better match". For example:
    #
    #   Question: What year was John Smith born?
    #   Context:  The leader was John Smith (1895-1943).
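
# A hedged sketch wiring the retriever to the reader, assuming a
# transformers version whose models return QuestionAnsweringModelOutput
# and that the globals (dictionary, tfidf, indexTfidf, context) are built
# elsewhere in the project:
question = "利比里亚的首都是哪里?"  # "Where is the capital of Liberia?"
best_context = document_retriever(question)[0]
inputs = tokenizer.encode_plus(question, best_context, max_length=512,
                               truncation=True,
                               return_tensors='pt').to('cuda')
with torch.no_grad():
    outputs = model(**inputs)
start = outputs.start_logits.argmax()
end = outputs.end_logits.argmax()
print(tokenizer.decode(inputs['input_ids'][0][start:end + 1]))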
import torch
from transformers import AlbertTokenizer, AlbertForQuestionAnswering

tokenizer = AlbertTokenizer.from_pretrained(
    'ahotrod/albert_xxlargev1_squad2_512')
model = AlbertForQuestionAnswering.from_pretrained(
    'ahotrod/albert_xxlargev1_squad2_512')


def answer(question, text):
    input_dict = tokenizer.encode_plus(question,
                                       text,
                                       return_tensors='pt',
                                       max_length=512)
    input_ids = input_dict["input_ids"].tolist()
    # Tuple unpacking assumes a transformers version (<4.0) where the model
    # returns (start_logits, end_logits) directly.
    start_scores, end_scores = model(**input_dict)
    start = torch.argmax(start_scores)
    end = torch.argmax(end_scores)
    all_tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
    answer = ''.join(all_tokens[start:end + 1]).replace('▁', ' ').strip()
    answer = answer.replace('[SEP]', '')
    return answer if answer != '[CLS]' and len(
        answer) != 0 else 'could not find an answer'
import torch
from transformers import AlbertTokenizer, AlbertForQuestionAnswering

tokenizer = AlbertTokenizer.from_pretrained('twmkn9/albert-base-v2-squad2')
model = AlbertForQuestionAnswering.from_pretrained(
    'twmkn9/albert-base-v2-squad2')


def answer(question, text):
    input_dict = tokenizer.encode_plus(question,
                                       text,
                                       return_tensors='pt',
                                       max_length=512)
    input_ids = input_dict["input_ids"].tolist()
    # As above, tuple unpacking assumes a pre-4.0 transformers API.
    start_scores, end_scores = model(**input_dict)
    start = torch.argmax(start_scores)
    end = torch.argmax(end_scores)
    all_tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
    answer = ''.join(all_tokens[start:end + 1]).replace('▁', ' ').strip()
    answer = answer.replace('[SEP]', '')
    return answer if answer != '[CLS]' and len(
        answer) != 0 else 'could not find an answer'
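
# Quick usage check for either `answer` helper above; the question/context
# pair is illustrative:
text = ("The Apollo program was the third United States human spaceflight "
        "program carried out by NASA, which accomplished landing the first "
        "humans on the Moon from 1969 to 1972.")
print(answer("Which program landed the first humans on the Moon?", text))
# should print a span like: 'the apollo program'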