def estimate_runtime_examples(data_path, sample_rate, tokenizer, \ max_seq_length, doc_stride, max_query_length, \ remove_impossible_questions=True, filter_invalid_spans=True): """Count runtime examples which may differ from number of raw samples due to sliding window operation and etc.. This is useful to get correct warmup steps for training.""" assert sample_rate > 0.0 and sample_rate <= 1.0, "sample_rate must be set between 0.0~1.0" print("loading data with json parser...") with open(data_path, "r") as reader: data = json.load(reader)["data"] num_raw_examples = 0 for entry in data: for paragraph in entry["paragraphs"]: paragraph_text = paragraph["context"] for qa in paragraph["qas"]: num_raw_examples += 1 print("num raw examples:{}".format(num_raw_examples)) def is_whitespace(c): if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: return True return False sampled_examples = [] for entry in data: for paragraph in entry["paragraphs"]: doc_tokens = None for qa in paragraph["qas"]: if sampled_examples and random.random( ) > sample_rate and sample_rate < 1.0: continue if doc_tokens is None: paragraph_text = paragraph["context"] doc_tokens = [] char_to_word_offset = [] prev_is_whitespace = True for c in paragraph_text: if is_whitespace(c): prev_is_whitespace = True else: if prev_is_whitespace: doc_tokens.append(c) else: doc_tokens[-1] += c prev_is_whitespace = False char_to_word_offset.append(len(doc_tokens) - 1) assert len( qa["answers"] ) == 1, "For training, each question should have exactly 1 answer." qas_id = qa["id"] question_text = qa["question"] start_position = None end_position = None orig_answer_text = None is_impossible = False if ('is_impossible' in qa) and (qa["is_impossible"]): if remove_impossible_questions or filter_invalid_spans: continue else: start_position = -1 end_position = -1 orig_answer_text = "" is_impossible = True else: answer = qa["answers"][0] orig_answer_text = answer["text"] answer_offset = answer["answer_start"] answer_length = len(orig_answer_text) start_position = char_to_word_offset[answer_offset] end_position = char_to_word_offset[answer_offset + answer_length - 1] # remove corrupt samples actual_text = " ".join( doc_tokens[start_position:(end_position + 1)]) cleaned_answer_text = " ".join( tokenization.whitespace_tokenize(orig_answer_text)) if actual_text.find(cleaned_answer_text) == -1: print("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text) continue example = MRQAExample(qas_id=qas_id, question_text=question_text, doc_tokens=doc_tokens, orig_answer_text=orig_answer_text, start_position=start_position, end_position=end_position, is_impossible=is_impossible) sampled_examples.append(example) runtime_sample_rate = len(sampled_examples) / float(num_raw_examples) runtime_samp_cnt = 0 for example in sampled_examples: query_tokens = tokenizer.tokenize(example.question_text) if len(query_tokens) > max_query_length: query_tokens = query_tokens[0:max_query_length] tok_to_orig_index = [] orig_to_tok_index = [] all_doc_tokens = [] for (i, token) in enumerate(example.doc_tokens): orig_to_tok_index.append(len(all_doc_tokens)) sub_tokens = tokenizer.tokenize(token) for sub_token in sub_tokens: tok_to_orig_index.append(i) all_doc_tokens.append(sub_token) tok_start_position = None tok_end_position = None tok_start_position = orig_to_tok_index[example.start_position] if example.end_position < len(example.doc_tokens) - 1: tok_end_position = orig_to_tok_index[example.end_position + 1] - 1 else: tok_end_position = len(all_doc_tokens) - 1 (tok_start_position, tok_end_position) = _improve_answer_span( all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.orig_answer_text) # The -3 accounts for [CLS], [SEP] and [SEP] max_tokens_for_doc = max_seq_length - len(query_tokens) - 3 _DocSpan = collections.namedtuple( # pylint: disable=invalid-name "DocSpan", ["start", "length"]) doc_spans = [] start_offset = 0 while start_offset < len(all_doc_tokens): length = len(all_doc_tokens) - start_offset if length > max_tokens_for_doc: length = max_tokens_for_doc doc_spans.append(_DocSpan(start=start_offset, length=length)) if start_offset + length == len(all_doc_tokens): break start_offset += min(length, doc_stride) for (doc_span_index, doc_span) in enumerate(doc_spans): doc_start = doc_span.start doc_end = doc_span.start + doc_span.length - 1 if filter_invalid_spans and not (tok_start_position >= doc_start and tok_end_position <= doc_end): continue runtime_samp_cnt += 1 return int(runtime_samp_cnt / runtime_sample_rate)
def read_squad_examples(input_file, is_training, version_2_with_negative): """Read a SQuAD json file into a list of SquadExample.""" with open(input_file, "r", encoding='utf-8') as reader: input_data = json.load(reader)["data"] def is_whitespace(c): if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: return True return False examples = [] for entry in input_data: for paragraph in entry["paragraphs"]: paragraph_text = paragraph["context"] doc_tokens = [] char_to_word_offset = [] prev_is_whitespace = True for c in paragraph_text: if is_whitespace(c): prev_is_whitespace = True else: if prev_is_whitespace: doc_tokens.append(c) else: doc_tokens[-1] += c prev_is_whitespace = False char_to_word_offset.append(len(doc_tokens) - 1) for qa in paragraph["qas"]: qas_id = qa["id"] question_text = qa["question"] start_position = None end_position = None orig_answer_text = None is_impossible = False if is_training: if version_2_with_negative: is_impossible = qa["is_impossible"] if (len(qa["answers"]) != 1) and (not is_impossible): raise ValueError( "For training, each question should have exactly 1 answer." ) if not is_impossible: answer = qa["answers"][0] orig_answer_text = answer["text"] answer_offset = answer["answer_start"] answer_length = len(orig_answer_text) start_position = char_to_word_offset[answer_offset] end_position = char_to_word_offset[answer_offset + answer_length - 1] # Only add answers where the text can be exactly recovered from the # document. If this CAN'T happen it's likely due to weird Unicode # stuff so we will just skip the example. # # Note that this means for training mode, every example is NOT # guaranteed to be preserved. actual_text = " ".join( doc_tokens[start_position:(end_position + 1)]) cleaned_answer_text = " ".join( whitespace_tokenize(orig_answer_text)) if actual_text.find(cleaned_answer_text) == -1: logger.warning( "Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text) continue else: start_position = -1 end_position = -1 orig_answer_text = "" example = SquadExample(qas_id=qas_id, question_text=question_text, doc_tokens=doc_tokens, orig_answer_text=orig_answer_text, start_position=start_position, end_position=end_position, is_impossible=is_impossible) examples.append(example) return examples
def read_squad_examples(input_file, is_training): """Read a SQuAD json file into a list of SquadExample.""" total = 0 with open(input_file, "r") as reader: input_data = json.load(reader)["data"] def is_whitespace(c): if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: return True return False examples = [] for entry in tqdm(input_data): for paragraph in entry["paragraphs"]: paragraph_text = paragraph["context"] doc_tokens = [] char_to_word_offset = [] ''' char_to_word_offset 用于根据答案和答案开始位置确定答案的起始位置(即模型的输出) ''' prev_is_whitespace = True for c in paragraph_text: if is_whitespace(c): prev_is_whitespace = True else: if prev_is_whitespace: doc_tokens.append(c) else: doc_tokens[-1] += c prev_is_whitespace = False char_to_word_offset.append(len(doc_tokens) - 1) for qa in paragraph["qas"]: uuid = qa["id"] question_text = qa["question"] start_position = None end_position = None orig_answer_text = None if is_training: if len(qa["answers"]) > 1: raise ValueError( "For training, each question should have exactly 0 or 1 answer." ) answer = qa["answers"][0] orig_answer_text = answer["text"] ##按照字符计算从0开始,答案开始的位置 answer_offset = answer["answer_start"] answer_length = len(orig_answer_text) ##转成按照单词计算答案开始的位置 start_position = char_to_word_offset[answer_offset] end_position = char_to_word_offset[answer_offset + answer_length - 1] # Only add answers where the text can be exactly recovered from the # document. If this CAN'T happen it's likely due to weird Unicode # stuff so we will just skip the example. # # Note that this means for training mode, every example is NOT # guaranteed to be preserved. ##doc_tokens是列表里面存放每个单词 actual_text = " ".join( doc_tokens[start_position:(end_position + 1)]) cleaned_answer_text = " ".join( tokenization.whitespace_tokenize(orig_answer_text)) if actual_text.find(cleaned_answer_text) == -1: print("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text) continue total += 1 example = SquadExample(uuid=uuid, question_text=question_text, doc_tokens=doc_tokens, orig_answer_text=orig_answer_text, start_position=start_position, end_position=end_position) examples.append(example) ''' #每个问题构造一个example,同一个上下文中有多个问题,要将上下文复制多次 ##所有段落,每个问题成为一个example, [uuid: 5733be284776f41900661182, question_text: To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?, doc_tokens: [Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.], start_position: 90, end_position: 92, uuid: 5733be284776f4190066117f, question_text: What is in front of the Notre Dame Main Building?, doc_tokens: [Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.], start_position: 32, end_position: 36, uuid: 5733be284776f41900661180, question_text: The Basilica of the Sacred heart at Notre Dame is beside to which structure?, doc_tokens: [Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.], start_position: 49, end_position: 51] ''' return examples