def process_inputs(input_data):
    """Turn raw input into evaluation examples and features.

    Every feature is collected in memory and also handed to an
    ``rs.FeatureWriter`` that writes ``./colab_output/train.tf_record``.

    Args:
        input_data: Passed unchanged to ``read_squad_data``.

    Returns:
        A ``(eval_examples, eval_features)`` tuple.
    """
    # The result is not used below; the call is kept so the config file is
    # still loaded exactly as before.
    bert_config = modeling.BertConfig.from_json_file(rs.FLAGS.bert_config_file)

    eval_examples = read_squad_data(input_data, is_training=False)

    eval_features = []
    eval_writer = rs.FeatureWriter(
        filename=os.path.join("./colab_output", "train.tf_record"),
        is_training=False)

    def append_feature(feature):
        eval_features.append(feature)
        eval_writer.process_feature(feature)

    try:
        # Configure the tokenizer with the vocabulary and do_lower_case flag.
        tokenizer = tokenization.FullTokenizer(
            vocab_file=rs.FLAGS.vocab_file,
            do_lower_case=rs.FLAGS.do_lower_case)

        rs.convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=rs.FLAGS.max_seq_length,
            doc_stride=rs.FLAGS.doc_stride,
            max_query_length=rs.FLAGS.max_query_length,
            is_training=False,
            output_fn=append_feature)
    finally:
        # Close the writer even when tokenization/conversion fails.
        eval_writer.close()

    return eval_examples, eval_features
def get_answer(data):
    """Predict an answer for ``data`` and return the entry keyed ``'1'``.

    The module-level ``answer_model`` is created from ``export_dir`` on the
    first call and reused afterwards.

    Args:
        data: Passed unchanged to ``get_squad_examples``.

    Returns:
        The value stored under key ``'1'`` in the mapping produced by
        ``get_predicted_answer``.
    """
    global answer_model

    eval_examples = get_squad_examples(data, is_training=False)

    eval_features = []
    run_squad.convert_examples_to_features(
        examples=eval_examples,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        doc_stride=DOC_STRIDE,
        max_query_length=MAX_QUERY_LENGTH,
        is_training=False,
        output_fn=eval_features.append)

    # Identity test: ``== None`` would dispatch to a custom __eq__ if any.
    if answer_model is None:
        answer_model = Model(export_dir)

    all_results = answer_model.predict(eval_features=eval_features)
    pred = get_predicted_answer(eval_examples, eval_features, all_results,
                                N_BEST_SIZE, MAX_ANSWER_LENGTH, DO_LOWER_CASE)
    return dict(pred.items())['1']
def my_create_examples(self, data_object):
    """
    Modified version of read_squad_examples from run_squad.

    Builds one ``run_squad.SquadExample`` per question (without answer
    information) and immediately converts the examples to features.
    Note that this returns feature objects, not example objects. The feature
    TENSORS themselves are made elsewhere.

    :param data_object: equivalent object to the 'data' section of the SQuAD
        JSON scheme
    :return: the list of feature objects emitted by
        ``run_squad.convert_examples_to_features``
    """

    def is_whitespace(c):
        # 0x202F (narrow no-break space) is treated as a separator as well.
        return c in " \t\r\n" or ord(c) == 0x202F

    examples = []
    for entry in data_object:
        for paragraph in entry["paragraphs"]:
            # Split the context into whitespace-delimited tokens.
            doc_tokens = []
            prev_is_whitespace = True
            for c in paragraph["context"]:
                if is_whitespace(c):
                    prev_is_whitespace = True
                    continue
                if prev_is_whitespace:
                    doc_tokens.append(c)
                else:
                    doc_tokens[-1] += c
                prev_is_whitespace = False

            # All questions of a paragraph share the same token list.
            for qa in paragraph["qas"]:
                examples.append(run_squad.SquadExample(
                    qas_id=qa["id"],
                    question_text=qa["question"],
                    doc_tokens=doc_tokens,
                    orig_answer_text=None,
                    start_position=None,
                    end_position=None,
                    is_impossible=False))

    feature_objects = []
    run_squad.convert_examples_to_features(
        examples=examples,
        tokenizer=self.tokenizer,
        max_seq_length=self.flags.max_seq_length,
        doc_stride=self.flags.doc_stride,
        max_query_length=self.flags.max_query_length,
        is_training=False,
        output_fn=feature_objects.append)
    return feature_objects
def response(self, data):
    """Run the estimator over ``data`` and return the predictions.

    Features are written to ``<self.output_dir>/eval.tf_record`` and then fed
    back to ``self.estimator`` through ``input_fn_builder``.

    Args:
        data: ``[[context, question], ...]``; passed to
            ``self.process_example``.

    Returns:
        Whatever ``self.predict`` returns for the collected raw results.
    """
    eval_examples = self.process_example(data)

    eval_writer = FeatureWriter(
        filename=os.path.join(self.output_dir, "eval.tf_record"),
        is_training=False)
    eval_features = []

    def append_feature(feature):
        eval_features.append(feature)
        eval_writer.process_feature(feature)

    try:
        convert_examples_to_features(
            examples=eval_examples,
            tokenizer=self.tokenizer,
            max_seq_length=FLAGS.max_seq_length,
            doc_stride=FLAGS.doc_stride,
            max_query_length=FLAGS.max_query_length,
            is_training=False,
            output_fn=append_feature)
    finally:
        # Close the record file even if conversion fails.
        eval_writer.close()

    predict_input_fn = input_fn_builder(
        input_file=eval_writer.filename,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=False)

    # If running eval on the TPU, you will need to specify the number of steps.
    all_results = []
    for result in self.estimator.predict(predict_input_fn,
                                         yield_single_examples=True):
        if len(all_results) % 1000 == 0:
            tf.logging.info("Processing example: %d" % (len(all_results)))
        unique_id = int(result["unique_ids"])
        start_logits = [float(x) for x in result["start_logits"].flat]
        end_logits = [float(x) for x in result["end_logits"].flat]
        all_results.append(
            RawResult(unique_id=unique_id,
                      start_logits=start_logits,
                      end_logits=end_logits))

    predictions = self.predict(eval_examples, eval_features, all_results,
                               FLAGS.n_best_size, FLAGS.max_answer_length,
                               FLAGS.do_lower_case)
    return predictions
def mrc():
    """Answer the posted question and send the answer back.

    Reads the request via ``getData``, preprocesses it, writes the features to
    ``<output_dir>/eval.tf_record``, runs the module-level ``estimator`` and
    responds with the prediction looked up by the request's ``qas_id``.

    Returns:
        The value returned by ``sendResponse``.
    """
    data_from_post = getData()
    data = preprocess_data(data_from_post)

    eval_writer = mainfile.FeatureWriter(
        filename=os.path.join(mrc_inference_config["output_dir"],
                              "eval.tf_record"),
        is_training=False)
    eval_features = []

    def append_feature(feature):
        eval_features.append(feature)
        eval_writer.process_feature(feature)

    try:
        mainfile.convert_examples_to_features(
            examples=data,
            tokenizer=tokenizer,
            max_seq_length=mrc_inference_config["max_seq_length"],
            doc_stride=mrc_inference_config["doc_stride"],
            max_query_length=mrc_inference_config["max_query_length"],
            is_training=False,
            output_fn=append_feature)
    finally:
        # Close the record file even if conversion fails.
        eval_writer.close()

    predict_input_fn = mainfile.input_fn_builder(
        input_file=eval_writer.filename,
        seq_length=mrc_inference_config["max_seq_length"],
        is_training=False,
        drop_remainder=False)

    all_results = []
    for result in estimator.predict(predict_input_fn,
                                    yield_single_examples=True):
        unique_id = int(result["unique_ids"])
        start_logits = [float(x) for x in result["start_logits"].flat]
        end_logits = [float(x) for x in result["end_logits"].flat]
        all_results.append(
            mainfile.RawResult(unique_id=unique_id,
                               start_logits=start_logits,
                               end_logits=end_logits))

    # The three trailing ``None`` arguments fill the slots where other call
    # sites in this file pass output file paths -- presumably that disables
    # file output here; verify against mainfile.write_predictions.
    answer = mainfile.write_predictions(
        data, eval_features, all_results, 20,
        mrc_inference_config["max_answer_length"], True, None, None, None)
    return sendResponse({"Answer": answer.get(data_from_post.get("qas_id"))})
def process_data_and_get_input_max_min(data_list,
                                       fixer,
                                       input_tensor_names,
                                       num_runs,
                                       vocab_file,
                                       do_lower_case,
                                       seq_length,
                                       doc_stride=128,
                                       max_query_length=64,
                                       batch_size=8,
                                       preprocess_fn="default_preprocess"):
    """Process input data and get input max and min.

    Reads the examples, keeps the first ``batch_size * num_runs`` of them,
    converts them to features, groups the features into ``num_runs`` input
    dictionaries keyed by node name and hands those to
    ``fixer.get_input_max_min``.

    ``preprocess_fn`` is accepted but not used by this function.
    """
    features = []

    examples = read_squad_examples(input_file=data_list, is_training=False)
    examples = examples[:batch_size * num_runs]

    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=do_lower_case)
    convert_examples_to_features(examples=examples,
                                 tokenizer=tokenizer,
                                 max_seq_length=seq_length,
                                 doc_stride=doc_stride,
                                 max_query_length=max_query_length,
                                 is_training=False,
                                 output_fn=features.append)

    # Tensor names look like "name:0"; only the node part is used as key.
    node_names = [tensor_name.split(':')[0]
                  for tensor_name in input_tensor_names]
    input_dicts = [
        dict(zip(node_names,
                 process_feature_batch(features, batch_size, run_idx)))
        for run_idx in range(num_runs)
    ]

    fixer.get_input_max_min(input_dicts, batch_size)
    print("quantize input end")
def __init__(
    self,
    eval_script: str = "data/squad/v1.1/evaluate-v1.1.py",
    predict_file: str = "",
    output_dir: str = "./",
    n_best_size: int = 20,
    max_answer_length: int = 30,
    version_2_with_negative: bool = False,
    max_seq_length: int = 384,
    doc_stride: int = 128,
    max_query_length: int = 64,
    vocab_file: str = "",
    do_lower_case: bool = True,
    max_len: int = 512,
):
    """Load the prediction file, featurize it and store evaluation settings.

    The examples and features are kept on the instance, together with the
    output locations and a ``Namespace`` holding the post-processing options.
    ``all_results`` starts out empty.
    """
    # for bert large
    bert_tokenizer = BertTokenizer(vocab_file,
                                   do_lower_case=do_lower_case,
                                   max_len=max_len)

    self.eval_examples = read_squad_examples(
        input_file=predict_file,
        is_training=False,
        version_2_with_negative=version_2_with_negative)
    self.eval_features = convert_examples_to_features(
        examples=self.eval_examples,
        tokenizer=bert_tokenizer,
        max_seq_length=max_seq_length,
        doc_stride=doc_stride,
        max_query_length=max_query_length,
        is_training=False,
    )

    self.output_dir = output_dir
    self.eval_script = eval_script
    self.predict_file = predict_file

    self.args = Namespace(
        version_2_with_negative=version_2_with_negative,
        n_best_size=n_best_size,
        max_answer_length=max_answer_length,
        verbose_logging=False,
        do_lower_case=do_lower_case,
    )

    self.all_results: List[RawResult] = []
def get_dataloader(args):
    """Return a dataloader for inference.

    Features are read from a pickle cache next to ``args.predict_file`` when
    possible; otherwise they are computed from the prediction file and the
    cache is (re)written.

    Args:
        args: Object providing ``vocab_file``, ``do_lower_case``,
            ``predict_file``, ``max_seq_length``, ``doc_stride``,
            ``max_query_length``, ``version_2_with_negative``, ``nbatches``
            and ``batch_size``.

    Returns:
        A ``torch.utils.data.DataLoader`` over
        ``(input_ids, segment_ids, input_mask)`` tuples of int64 tensors.
    """
    # Preprocess input data
    tokenizer = BertTokenizer(args.vocab_file,
                              do_lower_case=args.do_lower_case,
                              max_len=512)  # for bert large

    cached_features_file = args.predict_file + '_{}_{}.bin'.format(
        args.max_seq_length, args.doc_stride)
    try:
        # NOTE(review): pickle executes arbitrary code on load; the cache file
        # must come from a trusted location.
        with open(cached_features_file, "rb") as reader:
            eval_features = pickle.load(reader)
    except Exception:
        # Missing or unreadable cache: rebuild it. ``Exception`` (rather than
        # a bare ``except``) keeps the best-effort fallback while letting
        # KeyboardInterrupt / SystemExit propagate.
        eval_examples = read_squad_examples(
            input_file=args.predict_file,
            is_training=False,
            version_2_with_negative=args.version_2_with_negative)
        eval_features = convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=False)
        with open(cached_features_file, "wb") as writer:
            pickle.dump(eval_features, writer)

    data = [
        (torch.tensor(feature.input_ids, dtype=torch.int64),
         torch.tensor(feature.segment_ids, dtype=torch.int64),
         torch.tensor(feature.input_mask, dtype=torch.int64))
        for feature in eval_features
    ]

    # Optionally cap the number of batches.
    if args.nbatches > 0:
        data = data[:args.nbatches * args.batch_size]

    test_loader = torch.utils.data.DataLoader(
        data,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=1,
        pin_memory=True)
    return test_loader
def _validate_squad(args, model, tokenizer):
    """Evaluate ``model`` on ``args.predict_file`` and return its metrics.

    Predictions are written to temporary JSON files in the current working
    directory, scored with ``_calc_metric_squad`` and removed again -- also
    when writing or scoring fails.

    Args:
        args: Settings object (``predict_file``, ``max_seq_length``,
            ``doc_stride``, ``max_query_length``, ``predict_batch_size``,
            ``n_best_size``, ``max_answer_length``, ``do_lower_case``,
            ``verbose_logging``, ``version_2_with_negative``,
            ``null_score_diff_threshold``).
        model: Called as ``model(input_ids, segment_ids, input_mask)`` and
            expected to return ``(start_logits, end_logits)`` batches.
        tokenizer: Tokenizer handed to ``convert_examples_to_features``.

    Returns:
        The result of ``_calc_metric_squad``; per the original inline note,
        ``{'exact_match': exact_match, 'f1': f1}``.
    """
    eval_examples = run_squad.read_squad_examples(
        input_file=args.predict_file,
        is_training=False,
        version_2_with_negative=args.version_2_with_negative)
    eval_features = run_squad.convert_examples_to_features(
        examples=eval_examples,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length,
        doc_stride=args.doc_stride,
        max_query_length=args.max_query_length,
        is_training=False)

    run_squad.logger.info("***** Running predictions *****")
    run_squad.logger.info(" Num orig examples = %d", len(eval_examples))
    run_squad.logger.info(" Num split examples = %d", len(eval_features))
    run_squad.logger.info(" Batch size = %d", args.predict_batch_size)

    all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                   dtype=torch.long)
    # Position of each feature, used to map outputs back to eval_features.
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
    eval_data = run_squad.TensorDataset(all_input_ids, all_input_mask,
                                        all_segment_ids, all_example_index)

    # Run prediction for full data
    eval_sampler = run_squad.SequentialSampler(eval_data)
    eval_dataloader = run_squad.DataLoader(eval_data,
                                           sampler=eval_sampler,
                                           batch_size=args.predict_batch_size)

    model.eval()
    all_results = []
    run_squad.logger.info("Start evaluating")
    for input_ids, input_mask, segment_ids, example_indices in run_squad.tqdm(
            eval_dataloader, desc="Evaluating"):
        if len(all_results) % 1000 == 0:
            run_squad.logger.info("Processing example: %d" %
                                  (len(all_results)))
        input_ids = input_ids.cuda()
        input_mask = input_mask.cuda()
        segment_ids = segment_ids.cuda()
        with torch.no_grad():
            batch_start_logits, batch_end_logits = model(
                input_ids, segment_ids, input_mask)
        for i, example_index in enumerate(example_indices):
            start_logits = batch_start_logits[i].detach().cpu().tolist()
            end_logits = batch_end_logits[i].detach().cpu().tolist()
            eval_feature = eval_features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            all_results.append(
                run_squad.RawResult(unique_id=unique_id,
                                    start_logits=start_logits,
                                    end_logits=end_logits))

    # Relative paths: the files are created in the current working directory.
    output_prediction_file = "predictions.json"
    output_nbest_file = "nbest_predictions.json"
    output_null_log_odds_file = "null_odds.json"
    try:
        run_squad.write_predictions(
            eval_examples, eval_features, all_results, args.n_best_size,
            args.max_answer_length, args.do_lower_case,
            output_prediction_file, output_nbest_file,
            output_null_log_odds_file, args.verbose_logging,
            args.version_2_with_negative, args.null_score_diff_threshold)
        result = _calc_metric_squad(args.predict_file, output_prediction_file)
    finally:
        # Always clean up, and tolerate files that were never written
        # (NOTE(review): write_predictions may skip the null-odds file when
        # version_2_with_negative is false -- confirm).
        for path in (output_prediction_file, output_nbest_file,
                     output_null_log_odds_file):
            if os.path.exists(path):
                os.remove(path)
    return result  # {'exact_match': exact_match, 'f1': f1}
def _train_squad(args, stage):
    """Fine-tune a BERT question-answering model while keeping pruned weights zero.

    The model is loaded from ``args.init_checkpoint``. For every parameter
    named under ``prune_ratios`` in the YAML file ``args.sparsity_config`` a
    non-zero mask is recorded before training, and that mask is multiplied
    back into the parameter after each optimizer step.

    NOTE(review): the nesting of this function was reconstructed from a
    whitespace-collapsed source; in particular the placement of the
    ``DataParallel`` wrap, the mask re-application and the logging statement
    should be checked against the original file.

    Args:
        args: Training settings (batch size, fp16 flags, file paths, ...).
            ``args.train_batch_size`` is divided in place by
            ``args.gradient_accumulation_steps``.
        stage: Pruning phase; ``PruningPhase.admm`` triggers ``_hard_mask``
            on the freshly loaded model.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    args.train_batch_size = (args.train_batch_size //
                             args.gradient_accumulation_steps)

    _set_seed(args.seed)

    tokenizer = run_squad.BertTokenizer(args.vocab_file,
                                        do_lower_case=args.do_lower_case,
                                        max_len=512)  # for bert large

    train_examples = run_squad.read_squad_examples(
        input_file=args.train_file,
        is_training=True,
        version_2_with_negative=args.version_2_with_negative)
    num_train_optimization_steps = int(
        len(train_examples) / args.train_batch_size /
        args.gradient_accumulation_steps) * args.num_train_epochs

    config = run_squad.BertConfig.from_json_file(args.config_file)
    model: nn.Module = run_squad.BertForQuestionAnswering(config)
    _load_checkpoint(model, args.init_checkpoint)
    if stage == PruningPhase.admm:
        _hard_mask(model, args.sparsity_config)

    model.cuda()
    if args.fp16 and args.old:
        model.half()

    # Record which entries of each pruned parameter are currently non-zero.
    with open(args.sparsity_config, 'r') as f:
        raw_dict = yaml.load(f, Loader=yaml.SafeLoader)
    masks = dict.fromkeys(raw_dict['prune_ratios'].keys())
    plain_model = getattr(model, 'module', model)
    for param_name in masks:
        param = get_parameter_by_name(plain_model, param_name)
        if param is None:
            raise Exception(f'Cannot find {param_name}')
        non_zero_mask = torch.ne(param, 0).to(param.dtype)
        masks[param_name] = non_zero_mask

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())

    # hack to remove pooler, which is not used
    # thus it produce None grad that break apex
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]

    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]

    if args.fp16:
        try:
            from apex.optimizers import FusedAdam
            from apex.fp16_utils.fp16_optimizer import FP16_Optimizer
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)

        if args.loss_scale == 0:
            if args.old:
                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
            else:
                model, optimizer = amp.initialize(model,
                                                  optimizer,
                                                  opt_level="O2",
                                                  keep_batchnorm_fp32=False,
                                                  loss_scale="dynamic")
        else:
            if args.old:
                optimizer = FP16_Optimizer(optimizer,
                                           static_loss_scale=args.loss_scale)
            else:
                model, optimizer = amp.initialize(model,
                                                  optimizer,
                                                  opt_level="O2",
                                                  keep_batchnorm_fp32=False,
                                                  loss_scale=args.loss_scale)
        if not args.old and args.do_train:
            scheduler = run_squad.LinearWarmUpScheduler(
                optimizer,
                warmup=args.warmup_proportion,
                total_steps=num_train_optimization_steps)
    else:
        optimizer = run_squad.BertAdam(optimizer_grouped_parameters,
                                       lr=args.learning_rate,
                                       warmup=args.warmup_proportion,
                                       t_total=num_train_optimization_steps)

    model = torch.nn.DataParallel(model)

    global_step = 0

    cached_train_features_file = args.train_file + '_{0}_{1}_{2}_{3}'.format(
        list(filter(None, args.bert_model.split('/'))).pop(),
        str(args.max_seq_length), str(args.doc_stride),
        str(args.max_query_length))
    try:
        # NOTE(review): pickle executes arbitrary code on load; the cache file
        # must come from a trusted location.
        with open(cached_train_features_file, "rb") as reader:
            train_features = pickle.load(reader)
    except Exception:
        # Missing or unreadable cache: recompute. ``Exception`` instead of a
        # bare ``except`` lets KeyboardInterrupt / SystemExit propagate.
        train_features = run_squad.convert_examples_to_features(
            examples=train_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=True)

        if args.local_rank == -1 or torch.distributed.get_rank() == 0:
            run_squad.logger.info(
                " Saving train features into cached file %s",
                cached_train_features_file)
            with open(cached_train_features_file, "wb") as writer:
                pickle.dump(train_features, writer)

    run_squad.logger.info("***** Running training *****")
    run_squad.logger.info(" Num orig examples = %d", len(train_examples))
    run_squad.logger.info(" Num split examples = %d", len(train_features))
    run_squad.logger.info(" Batch size = %d", args.train_batch_size)
    run_squad.logger.info(" Num steps = %d", num_train_optimization_steps)

    all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                   dtype=torch.long)
    all_start_positions = torch.tensor(
        [f.start_position for f in train_features], dtype=torch.long)
    all_end_positions = torch.tensor([f.end_position for f in train_features],
                                     dtype=torch.long)
    train_data = run_squad.TensorDataset(all_input_ids, all_input_mask,
                                         all_segment_ids, all_start_positions,
                                         all_end_positions)
    train_sampler = run_squad.RandomSampler(train_data)
    train_dataloader = run_squad.DataLoader(train_data,
                                            sampler=train_sampler,
                                            batch_size=args.train_batch_size)

    model.train()
    for _ in run_squad.trange(int(args.num_train_epochs), desc="Epoch"):
        for step, batch in enumerate(
                run_squad.tqdm(train_dataloader, desc="Iteration")):
            # Terminate early for benchmarking
            if args.max_steps > 0 and global_step > args.max_steps:
                break

            if torch.cuda.device_count() == 1:
                batch = tuple(
                    t.cuda()
                    for t in batch)  # multi-gpu does scattering it-self
            input_ids, input_mask, segment_ids, start_positions, end_positions = batch
            loss = model(input_ids, segment_ids, input_mask, start_positions,
                         end_positions)
            if torch.cuda.device_count() > 1:
                loss = loss.mean()  # mean() to average on multi-gpu.
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                if args.old:
                    # noinspection PyUnboundLocalVariable
                    optimizer.backward(loss)
                else:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
            else:
                loss.backward()

            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    # modify learning rate with special warm up for BERT which FusedAdam doesn't do
                    if not args.old:
                        # noinspection PyUnboundLocalVariable
                        scheduler.step()
                    else:
                        lr_this_step = args.learning_rate * run_squad.warmup_linear(
                            global_step / num_train_optimization_steps,
                            args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

                # Re-apply the masks so pruned weights stay at zero after the
                # update.
                plain_model = getattr(model, 'module', model)
                for param_name, mask in masks.items():
                    param = get_parameter_by_name(plain_model, param_name)
                    param.data *= mask.to(param.dtype)

            if step % args.log_freq == 0:
                run_squad.logger.info("Step {}: Loss {}, LR {} ".format(
                    global_step, loss.item(),
                    optimizer.param_groups[0]['lr']))

    return model, tokenizer
def do_predict(self, json_data):
    """Predict answers for ``json_data`` with ``self.estimator``.

    Features are written to ``<FLAGS.output_dir>/eval.tf_record``, fed to the
    estimator, and the raw results are post-processed by
    ``self.write_predictions``.

    Args:
        json_data: Passed to ``self.read_squad_examples`` as ``input_data``.

    Returns:
        Whatever ``self.write_predictions`` returns.
    """
    flags = run_squad.FLAGS

    eval_examples = self.read_squad_examples(input_data=json_data,
                                             is_training=False)

    eval_writer = run_squad.FeatureWriter(
        filename=os.path.join(flags.output_dir, "eval.tf_record"),
        is_training=False)
    eval_features = []

    def append_feature(feature):
        eval_features.append(feature)
        eval_writer.process_feature(feature)

    try:
        run_squad.convert_examples_to_features(
            examples=eval_examples,
            tokenizer=self.tokenizer,
            max_seq_length=flags.max_seq_length,
            doc_stride=flags.doc_stride,
            max_query_length=flags.max_query_length,
            is_training=False,
            output_fn=append_feature)
    finally:
        # Close the record file even if conversion fails.
        eval_writer.close()

    tf.logging.info("***** Running predictions *****")
    tf.logging.info(" Num orig examples = %d", len(eval_examples))
    tf.logging.info(" Num split examples = %d", len(eval_features))
    tf.logging.info(" Batch size = %d", flags.predict_batch_size)

    predict_input_fn = run_squad.input_fn_builder(
        input_file=eval_writer.filename,
        seq_length=flags.max_seq_length,
        is_training=False,
        drop_remainder=False)

    # If running eval on the TPU, you will need to specify the number of
    # steps.
    all_results = []
    for result in self.estimator.predict(predict_input_fn,
                                         yield_single_examples=True):
        if len(all_results) % 1000 == 0:
            tf.logging.info("Processing example: %d" % (len(all_results)))
        unique_id = int(result["unique_ids"])
        start_logits = [float(x) for x in result["start_logits"].flat]
        end_logits = [float(x) for x in result["end_logits"].flat]
        all_results.append(
            run_squad.RawResult(unique_id=unique_id,
                                start_logits=start_logits,
                                end_logits=end_logits))

    output_prediction_file = os.path.join(flags.output_dir,
                                          "predictions.json")
    output_nbest_file = os.path.join(flags.output_dir,
                                     "nbest_predictions.json")
    output_null_log_odds_file = os.path.join(flags.output_dir,
                                             "null_odds.json")

    return self.write_predictions(
        eval_examples, eval_features, all_results, flags.n_best_size,
        flags.max_answer_length, flags.do_lower_case, output_prediction_file,
        output_nbest_file, output_null_log_odds_file)
cached_features_file = args.predict_file + '_{}_{}.bin'.format( args.max_seq_length, args.doc_stride) eval_examples = read_squad_examples( input_file=args.predict_file, is_training=False, version_2_with_negative=args.version_2_with_negative) try: with open(cached_features_file, "rb") as reader: eval_features = pickle.load(reader) except: eval_features = convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=False) with open(cached_features_file, "wb") as writer: pickle.dump(eval_features, writer) dtype = np.int64 def batch(iterable, n=1): l = len(iterable) for ndx in range(0, l, n): unique_ids = () example_indices = () input_ids_data = () input_mask_data = ()
def get_dataloader_fn(
    precision: str = 'fp32',
    batch_size: int = 8,
    vocab_file: str = "",
    do_lower_case: bool = True,
    predict_file: str = "",
    max_len: int = 512,
    max_seq_length: int = 384,
    doc_stride: int = 128,
    max_query_length: int = 64,
    version_2_with_negative: bool = False,
    pad_to_batch_size: bool = True,
):
    """Build a generator function that yields inference batches.

    The prediction file is read and featurized once, when this function is
    called. The returned callable iterates over the features in order.

    Args:
        precision: ``'fp32'`` or ``'fp16'``; selects the dtype of the
            placeholder outputs. Any other value raises ``KeyError``.
        batch_size: Number of features per batch.
        pad_to_batch_size: When true, the last feature is repeated until the
            number of features is a multiple of ``batch_size``. Nothing is
            added when it already is one, or when there are no features.
        (remaining arguments are forwarded to the tokenizer / SQuAD helpers)

    Returns:
        A zero-argument generator function yielding
        ``(unique_id, x, y_real)`` where ``x`` maps ``input__0/1/2`` to the
        input-id, segment-id and input-mask arrays and ``y_real`` maps
        ``output__0/1`` to zero arrays of shape
        ``[batch_size, max_seq_length]``.
    """
    # Preprocess input data
    tokenizer = BertTokenizer(vocab_file,
                              do_lower_case=do_lower_case,
                              max_len=max_len)

    eval_examples = read_squad_examples(
        input_file=predict_file,
        is_training=False,
        version_2_with_negative=version_2_with_negative)

    eval_features = convert_examples_to_features(
        examples=eval_examples,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        doc_stride=doc_stride,
        max_query_length=max_query_length,
        is_training=False,
    )

    # get inputs
    all_unique_ids = [f.unique_id for f in eval_features]
    all_input_ids = [f.input_ids for f in eval_features]
    all_input_mask = [f.input_mask for f in eval_features]
    all_segment_ids = [f.segment_ids for f in eval_features]

    if pad_to_batch_size and eval_features:
        # each batch should have a fixed size
        last = eval_features[-1]
        # ``-n % b`` is 0 when n is already a multiple of b, so no redundant
        # full batch of duplicates is appended in that case.
        padding = -len(all_unique_ids) % batch_size
        all_unique_ids += [last.unique_id] * padding
        all_input_ids += [last.input_ids] * padding
        all_input_mask += [last.input_mask] * padding
        all_segment_ids += [last.segment_ids] * padding

    all_unique_ids = torch.tensor(all_unique_ids,
                                  dtype=torch.int32,
                                  requires_grad=False)
    all_input_ids = torch.tensor(all_input_ids,
                                 dtype=torch.int32,
                                 requires_grad=False)
    all_input_mask = torch.tensor(all_input_mask,
                                  dtype=torch.int32,
                                  requires_grad=False)
    all_segment_ids = torch.tensor(all_segment_ids,
                                   dtype=torch.int32,
                                   requires_grad=False)
    eval_data = torch.utils.data.TensorDataset(all_unique_ids, all_input_ids,
                                               all_input_mask,
                                               all_segment_ids)
    eval_sampler = torch.utils.data.SequentialSampler(eval_data)
    eval_dataloader = torch.utils.data.DataLoader(
        eval_data,
        sampler=eval_sampler,
        batch_size=batch_size,
        shuffle=False,
        num_workers=0,
    )

    dtype = {'fp32': np.float32, 'fp16': np.float16}[precision]

    def _get_dataloader():
        """return dataloader for inference"""
        for unique_id, input_ids, input_mask, segment_ids in eval_dataloader:
            unique_id = unique_id.cpu().numpy()
            input_ids = input_ids.cpu().numpy()
            input_mask = input_mask.cpu().numpy()
            segment_ids = segment_ids.cpu().numpy()
            x = {
                "input__0": input_ids,
                "input__1": segment_ids,
                "input__2": input_mask
            }
            # Placeholder outputs; always sized by the nominal batch_size.
            y_real = {
                "output__0": np.zeros([batch_size, max_seq_length],
                                      dtype=dtype),
                "output__1": np.zeros([batch_size, max_seq_length],
                                      dtype=dtype),
            }
            yield (unique_id, x, y_real)

    return _get_dataloader
# is_training=False) eval_features = [] def append_feature(feature): eval_features.append(feature) #eval_writer.process_feature(feature) tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=True) run_squad.convert_examples_to_features(examples=eval_examples, tokenizer=tokenizer, max_seq_length=seq_length, doc_stride=doc_stride, max_query_length=query_length, is_training=False, output_fn=append_feature) ########### Re-load model from saved checkpoint ########### #unique_ids = tf.placeholder([], tf.int64) input_ids = tf.placeholder(tf.int64, [None, seq_length]) input_mask = tf.placeholder(tf.int64, [None, seq_length]) segment_ids = tf.placeholder(tf.int64, [None, seq_length]) (start_logits, end_logits) = run_squad.create_model(bert_config=bert_config, is_training=False, input_ids=input_ids, input_mask=input_mask,
def get_dataset(self,
                dataset_path,
                is_training,
                context_truncated_len=400,
                utterance_truncated_len=100):
    """Load a dataset file, tokenize it and wrap it in a ``QuACDataset``.

    The file is processed twice: once through ``read_squad_examples`` /
    ``convert_examples_to_features`` to obtain token ids, and once as raw
    JSON, which is then annotated in place with the tokenized context,
    questions, answer spans and the previous-answer indicator.

    NOTE(review): the nesting of this function was reconstructed from a
    whitespace-collapsed source; verify the block structure against the
    original file.

    Args:
        dataset_path: Path of the JSON dataset file.
        is_training: Forwarded to the SQuAD helpers; when true the entries of
            ``qa['answers']`` are tokenized as well.
        context_truncated_len: Forwarded to ``QuACDataset``.
        utterance_truncated_len: Forwarded to ``QuACDataset``.

    Returns:
        A ``QuACDataset`` built from the annotated ``data`` section.

    Raises:
        ValueError: If ``self.ctx_emb`` is neither ``'bert'`` nor ``'xlnet'``.
    """
    examples = read_squad_examples(dataset_path, is_training)

    if self.ctx_emb == 'bert':
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    elif self.ctx_emb == 'xlnet':
        tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
    else:
        # Previously this fell through and failed later with an
        # UnboundLocalError on ``tokenizer``.
        raise ValueError(
            'Unsupported ctx_emb: {!r}'.format(self.ctx_emb))

    # The limits are set to 2500, presumably so that neither the question nor
    # the context is truncated or split -- confirm against the data.
    features = convert_examples_to_features(examples,
                                            tokenizer,
                                            max_seq_length=2500,
                                            doc_stride=2500,
                                            max_query_length=2500,
                                            is_training=is_training)

    with open(dataset_path) as f:
        raw_examples = json.load(f)

    # since problems are flatten by convert_examples_to_features
    index_feature = 0

    for example in tqdm(raw_examples['data']):
        for paragraph in example['paragraphs']:
            paragraph['context_raw'] = paragraph['context']

            # Since only `qa_feature.token_to_orig_map` (below) maps token
            # to space-splited-word-level indices in the context,
            # `word_offsets` is required to map space-splited-word-level
            # indices to char-level indices.
            word_offsets = [0]
            for word in paragraph['context'].split(' '):
                word_offsets.append(len(word) + 1 + word_offsets[-1])

            for index_q, qa in enumerate(paragraph['qas']):
                qa_feature = features[index_feature]
                index_feature += 1

                # in `features[index_feature].segment_ids`, question and
                # context are concatenated. To seperate them, 0/1 stored
                # in `segment_ids` are used.
                question_len = qa_feature.segment_ids.index(1)
                question = qa_feature.input_ids[:question_len]

                if index_q == 0:  # do only once for a paragraph
                    context_len = \
                        qa_feature.segment_ids[question_len:].index(0)
                    context = (
                        qa_feature.input_ids[question_len:question_len +
                                             context_len])
                    paragraph['context_offset'] = (
                        [
                            word_offsets[qa_feature.token_to_orig_map[i]]
                            for i in range(question_len, question_len +
                                           context_len - 1)
                        ] + [len(paragraph['context'])])
                    paragraph['context_tokenized'] = qa_feature.input_ids
                    # From here on paragraph['context'] holds token ids.
                    paragraph['context'] = context

                qa['question_tokenized'] = tokenizer.tokenize(
                    qa['question'])
                qa['question'] = question

                qa['orig_answer_raw'] = qa['orig_answer']['text']
                qa['orig_answer_text'] = tokenizer.tokenize(
                    qa['orig_answer_raw'])

                # Positions are shifted to be relative to the context start.
                qa['orig_answer_start'] = qa_feature.start_position - question_len
                qa['orig_answer_end'] = qa_feature.end_position - question_len
                assert qa['orig_answer_end'] < len(paragraph['context'])

                # answer indicator for previous questions
                qa['answer_indicator'] = [0] * context_len
                for offset in range(1, min(3 + 1, index_q + 1)):
                    index_prev = index_q - offset
                    start, end = (
                        paragraph['qas'][index_prev]['orig_answer_start'],
                        paragraph['qas'][index_prev]['orig_answer_end'] + 1)
                    qa['answer_indicator'][start:end] = ([offset] *
                                                         (end - start))

                if is_training:
                    for answer in qa['answers']:
                        answer['raw'] = answer['text']
                        answer['text'] = tokenizer.tokenize(answer['text'])

    return QuACDataset(raw_examples['data'],
                       context_truncated_len=context_truncated_len,
                       utterance_truncated_len=utterance_truncated_len,
                       padding=0)
def main(_):
    """Train and/or run SQuAD prediction with a BERT TPUEstimator.

    Everything is driven by `FLAGS`: `do_train` runs training on
    `train_file`, `do_predict` runs inference on `predict_file` and writes
    the prediction JSON files into `output_dir`.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    bert_config = rs.modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
    rs.validate_flags_or_throw(bert_config)
    tf.gfile.MakeDirs(FLAGS.output_dir)

    tokenizer = rs.tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    # A cluster resolver is only created when a TPU is requested by name.
    if FLAGS.use_tpu and FLAGS.tpu_name:
        cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    else:
        cluster_resolver = None

    tpu_config = tf.contrib.tpu.TPUConfig(
        iterations_per_loop=FLAGS.iterations_per_loop,
        num_shards=FLAGS.num_tpu_cores,
        per_host_input_for_training=(
            tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))
    run_config = tf.contrib.tpu.RunConfig(
        cluster=cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tpu_config)

    train_examples = num_train_steps = num_warmup_steps = None
    if FLAGS.do_train:
        train_examples = rs.read_squad_examples(
            input_file=FLAGS.train_file, is_training=True)
        steps_per_epoch = len(train_examples) / FLAGS.train_batch_size
        num_train_steps = int(steps_per_epoch * FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

        # Pre-shuffle the input to avoid having to make a very large
        # shuffle buffer in the `input_fn`.
        random.Random(12345).shuffle(train_examples)

    model_fn = rs.model_fn_builder(
        bert_config=bert_config,
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
        use_one_hot_embeddings=FLAGS.use_tpu)

    # If TPU is not available, this will fall back to normal Estimator on
    # CPU or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_train:
        # Features are written to a record file to avoid storing very
        # large constant tensors in memory.
        train_record_path = os.path.join(FLAGS.output_dir, "train.tf_record")
        train_writer = rs.FeatureWriter(
            filename=train_record_path, is_training=True)
        rs.convert_examples_to_features(
            examples=train_examples,
            tokenizer=tokenizer,
            max_seq_length=FLAGS.max_seq_length,
            doc_stride=FLAGS.doc_stride,
            max_query_length=FLAGS.max_query_length,
            is_training=True,
            output_fn=train_writer.process_feature)
        train_writer.close()

        tf.logging.info("***** Running training *****")
        tf.logging.info(" Num orig examples = %d", len(train_examples))
        tf.logging.info(" Num split examples = %d", train_writer.num_features)
        tf.logging.info(" Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info(" Num steps = %d", num_train_steps)
        # The examples are no longer needed once the record file exists.
        del train_examples

        estimator.train(
            input_fn=rs.input_fn_builder(
                input_file=train_writer.filename,
                seq_length=FLAGS.max_seq_length,
                is_training=True,
                drop_remainder=True),
            max_steps=num_train_steps)

    if FLAGS.do_predict:
        eval_examples = rs.read_squad_examples(
            input_file=FLAGS.predict_file, is_training=False)
        # One length per feature; used below to cut each logit list.
        act_seq_len = get_act_seq_len(eval_examples, tokenizer,
                                      FLAGS.max_seq_length, FLAGS.doc_stride,
                                      FLAGS.max_query_length)

        eval_writer = rs.FeatureWriter(
            filename=os.path.join(FLAGS.output_dir, "eval.tf_record"),
            is_training=False)
        eval_features = []

        def collect_feature(feature):
            # Keep every feature in memory and in the record file.
            eval_features.append(feature)
            eval_writer.process_feature(feature)

        rs.convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=FLAGS.max_seq_length,
            doc_stride=FLAGS.doc_stride,
            max_query_length=FLAGS.max_query_length,
            is_training=False,
            output_fn=collect_feature)
        eval_writer.close()

        tf.logging.info("***** Running predictions *****")
        tf.logging.info(" Num orig examples = %d", len(eval_examples))
        tf.logging.info(" Num split examples = %d", len(eval_features))
        tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size)

        predict_input_fn = rs.input_fn_builder(
            input_file=eval_writer.filename,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=False)

        # If running eval on the TPU, you will need to specify the number
        # of steps.
        all_results = []
        predictions = estimator.predict(
            predict_input_fn, yield_single_examples=True)
        for idx, result in enumerate(predictions):
            if len(all_results) % 1000 == 0:
                tf.logging.info("Processing example: %d", len(all_results))
            keep = act_seq_len[idx]
            start_logits = [float(x) for x in result["start_logits"].flat]
            end_logits = [float(x) for x in result["end_logits"].flat]
            all_results.append(
                rs.RawResult(
                    unique_id=int(result["unique_ids"]),
                    start_logits=start_logits[:keep],
                    end_logits=end_logits[:keep]))

        output_prediction_file = os.path.join(
            FLAGS.output_dir, "predictions.json")
        output_nbest_file = os.path.join(
            FLAGS.output_dir, "nbest_predictions.json")
        output_null_log_odds_file = os.path.join(
            FLAGS.output_dir, "null_odds.json")

        rs.write_predictions(eval_examples, eval_features, all_results,
                             FLAGS.n_best_size, FLAGS.max_answer_length,
                             FLAGS.do_lower_case, output_prediction_file,
                             output_nbest_file, output_null_log_odds_file)
# Run the model once and print the tokens between the highest-scoring start
# and end positions (inclusive).
start_logits, end_logits = model(tokens_tensor, segments_tensors, input_mask)
start_ind, end_ind = (
    torch.argmax(logits).item() for logits in (start_logits, end_logits)
)
print(all_tokens[start_ind:end_ind + 1])

# Experiment: recreate the evaluation-side preprocessing of run_squad.py.
predict_file = '/data/squad/dev-v1.1.json'

# eval_examples is a list of 10570 'SquadExample' objects; each object
# contains fields for qas_id, question_text, and doc_tokens.
eval_examples = run_squad.read_squad_examples(
    input_file=predict_file,
    is_training=False,
)
eval_features = run_squad.convert_examples_to_features(
    examples=eval_examples,
    tokenizer=tokenizer,
    max_seq_length=384,
    doc_stride=128,
    max_query_length=64,
    is_training=False,
)

# Prediction writing is left disabled:
# write_predictions(eval_examples, eval_features, all_results,
#                   args.n_best_size, args.max_answer_length,
#                   args.do_lower_case, output_prediction_file,
#                   output_nbest_file, args.verbose_logging)
def main():
    """Report how many SQuAD training features exceed fixed length limits.

    Reads the SQuAD JSON named by `--json_file`, converts it to training
    features with generous limits (sequence 512, query 512, stride 128)
    and prints, for the limits 384 / 64 / 30, how many features exceed the
    sequence, query and answer length, plus the maximum of each length.

    Changes from the previous version:
      * `--json_file` is now enforced by argparse instead of failing later
        inside `open(None)`.
      * The JSON file is no longer loaded a second time into an unused
        variable, and the unused lists of offending token sequences are
        gone.
      * Answer length is counted as `end - start + 1` tokens because both
        positions are inclusive; this matches how run_squad compares a
        span against `max_answer_length`.
    """
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--json_file",
        default=None,
        type=str,
        required=True,
        help="predictions jsonfile location (output of run_squad). "
        "E.g., train-v1.1.json")
    # Accepted for command-line compatibility; not read below.
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        help="The output directory where the model checkpoints and "
        "predictions will be written.")
    parser.add_argument("--OG", action='store_true', help="test")
    args = parser.parse_args()

    train_examples = run_squad.read_squad_examples(
        args.json_file, is_training=True, version_2_with_negative=True)

    # Limits the features are checked against.
    max_seq_len = 384
    max_query_len = 64
    max_answer_len = 30

    exceed_seq_len_counter = 0
    exceed_query_len_counter = 0
    exceed_answer_len_counter = 0
    overall_counter = 0
    max_s = 0
    max_q = 0
    max_a = 0

    tokenizer = BertTokenizer.from_pretrained(
        'bert-large-uncased',
        do_lower_case=True)  # added_flag, currently hardcoded

    # Features are built with limits larger than the ones being checked so
    # that over-long sequences and queries remain visible.
    train_features = run_squad.convert_examples_to_features(
        examples=train_examples,
        tokenizer=tokenizer,
        max_seq_length=512,
        doc_stride=128,
        max_query_length=512,
        is_training=True)

    for feature in train_features:
        overall_counter += 1

        # Each length is computed once per feature.
        seq_len = sum(feature.input_mask)
        # NOTE(review): `segment_ids_flipped` presumably marks the query
        # tokens -- confirm against this project's run_squad.
        query_len = sum(feature.segment_ids_flipped)
        # Positions are inclusive, hence the +1.
        answer_len = feature.end_position - feature.start_position + 1

        if seq_len > max_seq_len:
            exceed_seq_len_counter += 1
        if query_len > max_query_len:
            exceed_query_len_counter += 1
        if answer_len > max_answer_len:
            exceed_answer_len_counter += 1

        max_s = max(max_s, seq_len)
        max_q = max(max_q, query_len)
        max_a = max(max_a, answer_len)

    print("Number of examples: %d." % overall_counter)
    print("Number of sequences that exceeded max_seq_len of %d is %d." %
          (max_seq_len, exceed_seq_len_counter))
    print("Number of queries that exceeded max_query_len of %d is %d." %
          (max_query_len, exceed_query_len_counter))
    print("Number of answers that exceeded max_answer_len of %d is %d." %
          (max_answer_len, exceed_answer_len_counter))
    print("Max seq length found was %d." % max_s)
    print("Max query length found was %d." % max_q)
    print("Max answer length found was %d." % max_a)