import collections
import logging

import numpy as np
import six
from tqdm import tqdm

import tokenization  # repo-local BERT tokenization helpers (import path may differ)

logger = logging.getLogger(__name__)

# Helpers such as _check_is_max_context, _improve_answer_span, get_final_text_,
# and the ContextFeatures / QuestionFeatures containers (plus NO_ANS) are
# assumed to be defined elsewhere in the repo.


def convert_documents_to_features(examples, tokenizer, max_seq_length, doc_stride):
    """Loads a data file into a list of `InputBatch`s."""
    unique_id = 1000000000

    features = []
    for (example_index, example) in enumerate(tqdm(examples, desc='converting')):
        tok_to_orig_index = []
        orig_to_tok_index = []
        all_doc_tokens = []
        for (i, token) in enumerate(example.doc_tokens):
            orig_to_tok_index.append(len(all_doc_tokens))
            sub_tokens = tokenizer.tokenize(token)
            for sub_token in sub_tokens:
                tok_to_orig_index.append(i)
                all_doc_tokens.append(sub_token)

        # The -2 accounts for [CLS] and [SEP]
        max_tokens_for_doc = max_seq_length - 2

        # We can have documents that are longer than the maximum sequence length.
        # To deal with this we do a sliding window approach, where we take chunks
        # of up to our max length with a stride of `doc_stride`.
        _DocSpan = collections.namedtuple(  # pylint: disable=invalid-name
            "DocSpan", ["start", "length"])
        doc_spans = []
        start_offset = 0
        while start_offset < len(all_doc_tokens):
            length = len(all_doc_tokens) - start_offset
            if length > max_tokens_for_doc:
                length = max_tokens_for_doc
            doc_spans.append(_DocSpan(start=start_offset, length=length))
            if start_offset + length == len(all_doc_tokens):
                break
            start_offset += min(length, doc_stride)

        for (doc_span_index, doc_span) in enumerate(doc_spans):
            tokens = []
            token_to_orig_map = {}
            token_is_max_context = {}
            tokens.append("[CLS]")
            for i in range(doc_span.length):
                split_token_index = doc_span.start + i
                token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]

                is_max_context = _check_is_max_context(doc_spans, doc_span_index,
                                                       split_token_index)
                token_is_max_context[len(tokens)] = is_max_context
                tokens.append(all_doc_tokens[split_token_index])
            tokens.append("[SEP]")

            input_ids = tokenizer.convert_tokens_to_ids(tokens)

            # The mask has 1 for real tokens and 0 for padding tokens. Only real
            # tokens are attended to.
            input_mask = [1] * len(input_ids)

            # Zero-pad up to the sequence length.
            while len(input_ids) < max_seq_length:
                input_ids.append(0)
                input_mask.append(0)

            assert len(input_ids) == max_seq_length
            assert len(input_mask) == max_seq_length

            if example_index < 20:
                logger.info("*** Example ***")
                logger.info("unique_id: %s" % (unique_id))
                logger.info("example_index: %s" % (example_index))
                logger.info("doc_span_index: %s" % (doc_span_index))
                logger.info("tokens: %s" % " ".join(
                    [tokenization.printable_text(x) for x in tokens]))
                logger.info("token_to_orig_map: %s" % " ".join(
                    ["%d:%d" % (x, y) for (x, y) in six.iteritems(token_to_orig_map)]))
                logger.info("token_is_max_context: %s" % " ".join(
                    ["%d:%s" % (x, y) for (x, y) in six.iteritems(token_is_max_context)]))
                logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
                logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))

            features.append(
                ContextFeatures(unique_id=unique_id,
                                example_index=example_index,
                                doc_span_index=doc_span_index,
                                tokens=tokens,
                                token_to_orig_map=token_to_orig_map,
                                token_is_max_context=token_is_max_context,
                                input_ids=input_ids,
                                input_mask=input_mask))
            unique_id += 1

    return features
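# NOTE: a minimal, self-contained sketch (not part of the original repo) of the
# word <-> subtoken maps built in the loop above. `_demo_word_subtoken_maps`
# and the hard-coded `pieces_per_word` split are hypothetical; in the real code
# the subtokens come from the WordPiece tokenizer.
def _demo_word_subtoken_maps():
    # Suppose the tokenizer splits "unbelievable" into three WordPiece subtokens.
    pieces_per_word = [["the"], ["quick"], ["un", "##believ", "##able"], ["fox"]]
    tok_to_orig_index = []   # subtoken index -> word index
    orig_to_tok_index = []   # word index -> index of its first subtoken
    all_doc_tokens = []
    for i, pieces in enumerate(pieces_per_word):
        orig_to_tok_index.append(len(all_doc_tokens))
        for piece in pieces:
            tok_to_orig_index.append(i)
            all_doc_tokens.append(piece)
    assert orig_to_tok_index == [0, 1, 2, 5]
    assert tok_to_orig_index == [0, 1, 2, 2, 2, 3]
    return all_doc_tokens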
def convert_documents_to_features(examples, tokenizer, max_seq_length, doc_stride):
    """Loads a data file into a list of `InputBatch`s."""
    unique_id = 1000000000

    features = []
    for (example_index, example) in enumerate(tqdm(examples, desc='Converting documents')):

        # Creating a map between word <=> (sub)token
        tok_to_word_index = []
        word_to_tok_index = []  # word to (start of) subtokens
        all_doc_tokens = []
        for (i, word) in enumerate(example.doc_words):
            word_to_tok_index.append(len(all_doc_tokens))
            sub_tokens = tokenizer.tokenize(word)
            for sub_token in sub_tokens:
                tok_to_word_index.append(i)
                all_doc_tokens.append(sub_token)

        # The -2 accounts for [CLS], [SEP]
        max_tokens_for_doc = max_seq_length - 2

        # Split sequence by max_seq_len with doc_stride; _DocSpan is based on
        # tokens without [CLS], [SEP]
        _DocSpan = collections.namedtuple(  # pylint: disable=invalid-name
            "DocSpan", ["start", "length"])
        doc_spans = []
        start_tok_offset = 0  # From all_doc_tokens

        # Get doc_spans with stride and offset
        while start_tok_offset < len(all_doc_tokens):
            length = len(all_doc_tokens) - start_tok_offset
            if length > max_tokens_for_doc:
                length = max_tokens_for_doc
            doc_spans.append(_DocSpan(start=start_tok_offset, length=length))
            if start_tok_offset + length == len(all_doc_tokens):
                break
            start_tok_offset += min(length, doc_stride)  # seems to prefer doc_stride always
            assert doc_stride < length, "length is no larger than doc_stride for {}".format(
                doc_spans)

        # Iterate each doc_span and make out_tokens
        for (doc_span_index, doc_span) in enumerate(doc_spans):
            out_tokens = []  # doc
            out_tokens.append("[CLS]")
            token_to_word_map = {}  # The difference with tok_to_word_index is it includes special tokens
            token_is_max_context = {}

            # For each doc token, create token_to_word_map and is_max_context,
            # and add to out_tokens
            for i in range(doc_span.length):
                split_token_index = doc_span.start + i
                token_to_word_map[len(out_tokens)] = tok_to_word_index[split_token_index]
                is_max_context = _check_is_max_context(doc_spans, doc_span_index,
                                                       split_token_index)
                token_is_max_context[len(out_tokens)] = is_max_context
                out_tokens.append(all_doc_tokens[split_token_index])
            out_tokens.append("[SEP]")

            # Convert to ids and masks
            input_ids = tokenizer.convert_tokens_to_ids(out_tokens)
            input_mask = [1] * len(input_ids)

            # Zero-pad up to the sequence length.
            while len(input_ids) < max_seq_length:
                input_ids.append(0)
                input_mask.append(0)

            assert len(input_ids) == max_seq_length
            assert len(input_mask) == max_seq_length

            # Printing for debug
            if example_index < 1 and doc_span_index < 1:
                logger.info("*** Example ***")
                logger.info("unique_id: %s" % (unique_id))
                logger.info("example_index: %s" % (example_index))
                logger.info("doc_span_index: %s" % (doc_span_index))
                logger.info("tokens: %s" % " ".join(
                    [tokenization.printable_text(x) for x in out_tokens]))
                logger.info("token_to_word_map: %s" % " ".join(
                    ["%d:%d" % (x, y) for (x, y) in six.iteritems(token_to_word_map)]))
                logger.info("token_is_max_context: %s" % " ".join(
                    ["%d:%s" % (x, y) for (x, y) in six.iteritems(token_is_max_context)]))
                logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
                logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))

            # Append feature
            features.append(
                ContextFeatures(unique_id=unique_id,
                                example_index=example_index,
                                doc_span_index=doc_span_index,
                                tokens=out_tokens,
                                token_to_word_map=token_to_word_map,
                                token_is_max_context=token_is_max_context,
                                input_ids=input_ids,
                                input_mask=input_mask))
            unique_id += 1

    return features
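# NOTE: a minimal, self-contained sketch (not part of the original repo) of the
# sliding-window split performed above. `_demo_doc_spans` is hypothetical; the
# converters inline the same loop over all_doc_tokens with
# max_tokens_for_doc = max_seq_length - 2.
def _demo_doc_spans(num_tokens, max_tokens_for_doc, doc_stride):
    _DocSpan = collections.namedtuple("DocSpan", ["start", "length"])
    doc_spans = []
    start_offset = 0
    while start_offset < num_tokens:
        length = min(max_tokens_for_doc, num_tokens - start_offset)
        doc_spans.append(_DocSpan(start=start_offset, length=length))
        if start_offset + length == num_tokens:
            break
        start_offset += min(length, doc_stride)
    return doc_spans

# e.g. _demo_doc_spans(10, 4, 2)
#   -> [DocSpan(start=0, length=4), DocSpan(start=2, length=4),
#       DocSpan(start=4, length=4), DocSpan(start=6, length=4)]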
def convert_examples_to_features(examples, tokenizer, max_seq_length, doc_stride,
                                 max_query_length, is_training):
    """Loads a data file into a list of `InputBatch`s."""
    unique_id = 1000000000

    features = []
    question_features = []
    for (example_index, example) in enumerate(tqdm(examples, desc='converting')):
        query_tokens = tokenizer.tokenize(example.question_text)

        if len(query_tokens) > max_query_length:
            query_tokens = query_tokens[0:max_query_length]

        tok_to_orig_index = []
        orig_to_tok_index = []
        all_doc_tokens = []
        for (i, token) in enumerate(example.doc_tokens):
            orig_to_tok_index.append(len(all_doc_tokens))
            sub_tokens = tokenizer.tokenize(token)
            for sub_token in sub_tokens:
                tok_to_orig_index.append(i)
                all_doc_tokens.append(sub_token)

        tok_start_position = None
        tok_end_position = None
        if is_training:
            tok_start_position = orig_to_tok_index[example.start_position]
            if example.end_position < len(example.doc_tokens) - 1:
                tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
            else:
                tok_end_position = len(all_doc_tokens) - 1
            (tok_start_position, tok_end_position) = _improve_answer_span(
                all_doc_tokens, tok_start_position, tok_end_position, tokenizer,
                example.orig_answer_text)

        # The -2 accounts for [CLS] and [SEP]
        max_tokens_for_doc = max_seq_length - 2

        # We can have documents that are longer than the maximum sequence length.
        # To deal with this we do a sliding window approach, where we take chunks
        # of up to our max length with a stride of `doc_stride`.
        _DocSpan = collections.namedtuple(  # pylint: disable=invalid-name
            "DocSpan", ["start", "length"])
        doc_spans = []
        start_offset = 0
        while start_offset < len(all_doc_tokens):
            length = len(all_doc_tokens) - start_offset
            if length > max_tokens_for_doc:
                length = max_tokens_for_doc
            doc_spans.append(_DocSpan(start=start_offset, length=length))
            if start_offset + length == len(all_doc_tokens):
                break
            start_offset += min(length, doc_stride)

        for (doc_span_index, doc_span) in enumerate(doc_spans):
            tokens = []
            tokens_ = []
            token_to_orig_map = {}
            token_is_max_context = {}
            tokens.append("[CLS]")
            tokens_.append("[CLS]")
            for token in query_tokens:
                tokens_.append(token)
            tokens_.append("[SEP]")

            for i in range(doc_span.length):
                split_token_index = doc_span.start + i
                token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]

                is_max_context = _check_is_max_context(doc_spans, doc_span_index,
                                                       split_token_index)
                token_is_max_context[len(tokens)] = is_max_context
                tokens.append(all_doc_tokens[split_token_index])
            tokens.append("[SEP]")

            input_ids = tokenizer.convert_tokens_to_ids(tokens)
            input_ids_ = tokenizer.convert_tokens_to_ids(tokens_)

            # The mask has 1 for real tokens and 0 for padding tokens. Only real
            # tokens are attended to.
            input_mask = [1] * len(input_ids)
            input_mask_ = [1] * len(input_ids_)

            # Zero-pad up to the sequence length.
            while len(input_ids) < max_seq_length:
                input_ids.append(0)
                input_mask.append(0)

            assert len(input_ids) == max_seq_length
            assert len(input_mask) == max_seq_length

            while len(input_ids_) < max_query_length + 2:
                input_ids_.append(0)
                input_mask_.append(0)

            assert len(input_ids_) == max_query_length + 2
            assert len(input_mask_) == max_query_length + 2

            start_position = None
            end_position = None
            if example.start_position is not None and example.start_position < 0:
                start_position, end_position = -1, -1
            elif is_training:
                # For training, if our document chunk does not contain an annotation
                # we throw it out, since there is nothing to predict.
                doc_start = doc_span.start
                doc_end = doc_span.start + doc_span.length - 1
                if (tok_start_position < doc_start or tok_end_position < doc_start
                        or tok_start_position > doc_end or tok_end_position > doc_end):
                    continue

                doc_offset = 1
                start_position = tok_start_position - doc_start + doc_offset
                end_position = tok_end_position - doc_start + doc_offset

            if example_index < 20:
                logger.info("*** Example ***")
                logger.info("unique_id: %s" % (unique_id))
                logger.info("example_index: %s" % (example_index))
                logger.info("doc_span_index: %s" % (doc_span_index))
                logger.info("tokens: %s" % " ".join(
                    [tokenization.printable_text(x) for x in tokens]))
                logger.info("token_to_orig_map: %s" % " ".join(
                    ["%d:%d" % (x, y) for (x, y) in six.iteritems(token_to_orig_map)]))
                logger.info("token_is_max_context: %s" % " ".join(
                    ["%d:%s" % (x, y) for (x, y) in six.iteritems(token_is_max_context)]))
                logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
                logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
                if is_training:
                    answer_text = " ".join(tokens[start_position:(end_position + 1)])
                    logger.info("start_position: %d" % (start_position))
                    logger.info("end_position: %d" % (end_position))
                    logger.info("answer: %s" % (tokenization.printable_text(answer_text)))

            features.append(
                ContextFeatures(unique_id=unique_id,
                                example_index=example_index,
                                doc_span_index=doc_span_index,
                                tokens=tokens,
                                token_to_orig_map=token_to_orig_map,
                                token_is_max_context=token_is_max_context,
                                input_ids=input_ids,
                                input_mask=input_mask,
                                start_position=start_position,
                                end_position=end_position))
            question_features.append(
                QuestionFeatures(unique_id=unique_id,
                                 example_index=example_index,
                                 input_ids=input_ids_,
                                 input_mask=input_mask_,
                                 tokens=tokens_))
            unique_id += 1

    return features, question_features
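# NOTE: a minimal, self-contained sketch (not part of the original repo) of how
# a word-level answer span is remapped to positions in `tokens` above: word
# indices are turned into subtoken indices via orig_to_tok_index, then shifted
# by doc_offset = 1 for the leading [CLS]. `_demo_answer_remap` and its toy
# values are hypothetical (they match _demo_word_subtoken_maps above).
def _demo_answer_remap():
    # Word 2 ("unbelievable") spans subtokens 2..4 in the toy example.
    orig_to_tok_index = [0, 1, 2, 5]   # word index -> first subtoken index
    num_subtokens = 6

    start_word, end_word = 2, 2        # answer covers a single word
    tok_start_position = orig_to_tok_index[start_word]            # 2
    if end_word < len(orig_to_tok_index) - 1:
        tok_end_position = orig_to_tok_index[end_word + 1] - 1    # 5 - 1 = 4
    else:
        tok_end_position = num_subtokens - 1

    # Inside a doc_span that starts at subtoken 0, shift by 1 for [CLS].
    doc_start, doc_offset = 0, 1
    start_position = tok_start_position - doc_start + doc_offset  # 3
    end_position = tok_end_position - doc_start + doc_offset      # 5
    return start_position, end_position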
def convert_examples_to_features(examples, tokenizer, max_seq_length, doc_stride,
                                 max_query_length, return_answers, skip_no_answer,
                                 verbose=False, save_with_prob=False,
                                 msg="Converting examples"):
    """Loads a data file into a list of `InputBatch`s."""
    unique_id = 1000000000

    features = []
    question_features = []
    for (example_index, example) in enumerate(tqdm(examples, desc=msg)):

        # Tokenize query into (sub)tokens
        query_tokens = tokenizer.tokenize(example.question_text)
        if len(query_tokens) > max_query_length:
            query_tokens = query_tokens[0:max_query_length]

        # Creating a map between word <=> (sub)token
        tok_to_word_index = []
        word_to_tok_index = []  # word to (start of) subtokens
        all_doc_tokens = []
        for (i, word) in enumerate(example.doc_words):
            word_to_tok_index.append(len(all_doc_tokens))
            sub_tokens = tokenizer.tokenize(word)
            for sub_token in sub_tokens:
                tok_to_word_index.append(i)
                all_doc_tokens.append(sub_token)

        # The -2 accounts for [CLS], [SEP]
        max_tokens_for_doc = max_seq_length - 2

        # Split sequence by max_seq_len with doc_stride; _DocSpan is based on
        # tokens without [CLS], [SEP]
        _DocSpan = collections.namedtuple(  # pylint: disable=invalid-name
            "DocSpan", ["start", "length"])
        doc_spans = []
        start_tok_offset = 0  # From all_doc_tokens

        # Get doc_spans with stride and offset
        while start_tok_offset < len(all_doc_tokens):
            length = len(all_doc_tokens) - start_tok_offset
            if length > max_tokens_for_doc:
                length = max_tokens_for_doc
            doc_spans.append(_DocSpan(start=start_tok_offset, length=length))
            if start_tok_offset + length == len(all_doc_tokens):
                break
            start_tok_offset += min(length, doc_stride)  # seems to prefer doc_stride always
            assert doc_stride < length, "length is no larger than doc_stride for {}".format(
                doc_spans)

        # Iterate each doc_span and make out_tokens
        for (doc_span_index, doc_span) in enumerate(doc_spans):

            # Find answer position based on new out_tokens
            start_position = None
            end_position = None

            # For no_answer, the same (-1, -1) applies
            if example.start_position is not None and example.start_position < 0:
                assert example.start_position == -1 and example.end_position == -1
                start_position, end_position = NO_ANS, NO_ANS

            # For existing answers, find answers if they exist
            elif return_answers:
                # Get token-level start/end position
                tok_start_position = word_to_tok_index[example.start_position]
                if example.end_position < len(example.doc_words) - 1:
                    # Work backwards from the next word's first subtoken
                    tok_end_position = word_to_tok_index[example.end_position + 1] - 1
                else:
                    assert example.end_position == len(example.doc_words) - 1
                    tok_end_position = len(all_doc_tokens) - 1

                # Improve answer span at the subword level
                (tok_start_position, tok_end_position) = _improve_answer_span(
                    all_doc_tokens, tok_start_position, tok_end_position, tokenizer,
                    example.orig_answer_text)

                # Throw away training samples without answers (due to doc_span split)
                doc_start = doc_span.start
                doc_end = doc_span.start + doc_span.length - 1
                if (tok_start_position < doc_start or tok_end_position < doc_start
                        or tok_start_position > doc_end or tok_end_position > doc_end):
                    if skip_no_answer:
                        continue
                    else:
                        # For NQ, only add this in 2% (50 times downsample)
                        if save_with_prob:
                            if np.random.randint(100) < 2:
                                start_position, end_position = NO_ANS, NO_ANS
                            else:
                                continue
                        else:
                            start_position, end_position = NO_ANS, NO_ANS

                # Training samples with answers
                else:
                    doc_offset = 1  # For [CLS]
                    start_position = tok_start_position - doc_start + doc_offset
                    end_position = tok_end_position - doc_start + doc_offset
                    assert start_position >= 0 and end_position >= 0, (start_position,
                                                                       end_position)

            out_tokens = []  # doc
            out_tokens_ = []  # query
            out_tokens.append("[CLS]")
            out_tokens_.append("[CLS]")
            token_to_word_map = {}  # The difference with tok_to_word_index is it includes special tokens
            token_is_max_context = {}

            # For query tokens, just copy and add [SEP]
            for token in query_tokens:
                out_tokens_.append(token)
            out_tokens_.append("[SEP]")

            # For each doc token, create token_to_word_map and is_max_context,
            # and add to out_tokens
            for i in range(doc_span.length):
                split_token_index = doc_span.start + i
                token_to_word_map[len(out_tokens)] = tok_to_word_index[split_token_index]
                is_max_context = _check_is_max_context(doc_spans, doc_span_index,
                                                       split_token_index)
                token_is_max_context[len(out_tokens)] = is_max_context
                out_tokens.append(all_doc_tokens[split_token_index])
            out_tokens.append("[SEP]")

            # Convert to ids and masks
            input_ids = tokenizer.convert_tokens_to_ids(out_tokens)
            input_ids_ = tokenizer.convert_tokens_to_ids(out_tokens_)
            input_mask = [1] * len(input_ids)
            input_mask_ = [1] * len(input_ids_)

            # Zero-pad up to the sequence length.
            while len(input_ids) < max_seq_length:
                input_ids.append(0)
                input_mask.append(0)

            assert len(input_ids) == max_seq_length
            assert len(input_mask) == max_seq_length

            while len(input_ids_) < max_query_length + 2:  # +2 for [CLS], [SEP]
                input_ids_.append(0)
                input_mask_.append(0)

            assert len(input_ids_) == max_query_length + 2
            assert len(input_mask_) == max_query_length + 2

            # Printing for debug
            if example_index < 1 and verbose:
                logger.info("*** Example ***")
                logger.info("unique_id: %s" % (unique_id))
                logger.info("example_index: %s" % (example_index))
                logger.info("doc_span_index: %s" % (doc_span_index))
                logger.info("tokens: %s" % " ".join(
                    [tokenization.printable_text(x) for x in out_tokens]))
                logger.info("q tokens: %s" % " ".join(
                    [tokenization.printable_text(x) for x in out_tokens_]))
                logger.info("token_to_word_map: %s" % " ".join(
                    ["%d:%d" % (x, y) for (x, y) in six.iteritems(token_to_word_map)]))
                logger.info("token_is_max_context: %s" % " ".join(
                    ["%d:%s" % (x, y) for (x, y) in six.iteritems(token_is_max_context)]))
                logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
                logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
                if return_answers:
                    answer_text = " ".join(out_tokens[start_position:(end_position + 1)])
                    logger.info("start_position: %d" % (start_position))
                    logger.info("end_position: %d" % (end_position))
                    logger.info("answer: %s" % (tokenization.printable_text(answer_text)))

            # Append feature
            features.append(
                ContextFeatures(unique_id=unique_id,
                                example_index=example_index,
                                doc_span_index=doc_span_index,
                                tokens=out_tokens,
                                token_to_word_map=token_to_word_map,
                                token_is_max_context=token_is_max_context,
                                input_ids=input_ids,
                                input_mask=input_mask,
                                start_position=start_position,
                                end_position=end_position))
            question_features.append(
                QuestionFeatures(unique_id=unique_id,
                                 example_index=example_index,
                                 tokens_=out_tokens_,
                                 input_ids=input_ids_,
                                 input_mask=input_mask_))

            # Check validity of answer
            if return_answers:
                if start_position <= NO_ANS:
                    assert start_position == NO_ANS and end_position == NO_ANS, (
                        start_position, end_position)
                else:
                    assert out_tokens[start_position:end_position + 1] == \
                        all_doc_tokens[tok_start_position:tok_end_position + 1]
                    orig_text, start_pos, end_pos = get_final_text_(
                        example, features[-1], start_position, end_position, True, False)
                    phrase = orig_text[start_pos:end_pos]
                    try:
                        assert phrase == example.orig_answer_text
                    except Exception as e:
                        # print('diff ans [%s]/[%s]' % (phrase, example.orig_answer_text))
                        pass

            unique_id += 1

    return features, question_features
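# NOTE: ContextFeatures, QuestionFeatures, and NO_ANS used above are defined
# elsewhere in the repo. The namedtuple stand-ins below (with hypothetical
# "_Sketch" names so they do not shadow the real classes) are only inferred
# from the keyword arguments passed by the newer converters; the older variants
# pass token_to_orig_map / tokens instead of token_to_word_map / tokens_, so
# the real definitions may differ.
_NO_ANS_SKETCH = -1

_ContextFeaturesSketch = collections.namedtuple(
    "ContextFeatures",
    ["unique_id", "example_index", "doc_span_index", "tokens",
     "token_to_word_map", "token_is_max_context", "input_ids", "input_mask",
     "start_position", "end_position"])
# convert_documents_to_features never passes start/end positions; default them.
_ContextFeaturesSketch.__new__.__defaults__ = (None, None)

_QuestionFeaturesSketch = collections.namedtuple(
    "QuestionFeatures",
    ["unique_id", "example_index", "tokens_", "input_ids", "input_mask"])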