def featurize(self, tokenizer, feat_spec):
    special_tokens_count = 2  # CLS, SEP
    (tokens,) = truncate_sequences(
        tokens_ls=[self.tokens],
        max_length=feat_spec.max_seq_length - special_tokens_count,
    )
    unpadded_tokens = tokens + [tokenizer.sep_token]
    unpadded_segment_ids = [feat_spec.sequence_a_segment_id] * (len(tokens) + 1)
    unpadded_inputs = add_cls_token(
        unpadded_tokens=unpadded_tokens,
        unpadded_segment_ids=unpadded_segment_ids,
        tokenizer=tokenizer,
        feat_spec=feat_spec,
    )
    input_set = create_input_set_from_tokens_and_segments(
        unpadded_tokens=unpadded_inputs.unpadded_tokens,
        unpadded_segment_ids=unpadded_inputs.unpadded_segment_ids,
        tokenizer=tokenizer,
        feat_spec=feat_spec,
    )
    # Exclusive spans are converted to inclusive spans for use with
    # SelfAttentiveSpanExtractor.
    span1_span = ExclusiveSpan(
        start=self.span1_span[0] + unpadded_inputs.cls_offset,
        end=self.span1_span[1] + unpadded_inputs.cls_offset,
    ).to_inclusive()
    span2_span = ExclusiveSpan(
        start=self.span2_span[0] + unpadded_inputs.cls_offset,
        end=self.span2_span[1] + unpadded_inputs.cls_offset,
    ).to_inclusive()
    assert span1_span.end <= len(
        tokens
    ), "Span 1 spans beyond max_seq_len, consider raising max_seq_len"
    assert span2_span.end <= len(
        tokens
    ), "Span 2 spans beyond max_seq_len, consider raising max_seq_len"
    binary_label_ids = np.zeros((self.label_num,), dtype=int)
    for label_id in self.label_ids:
        binary_label_ids[label_id] = 1
    return DataRow(
        guid=self.guid,
        input_ids=np.array(input_set.input_ids),
        input_mask=np.array(input_set.input_mask),
        segment_ids=np.array(input_set.segment_ids),
        spans=np.array([span1_span, span2_span]),
        label_ids=binary_label_ids,
        tokens=unpadded_inputs.unpadded_tokens,
        span1_text=self.span1_text,
        span2_text=self.span2_text,
    )
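# A hedged illustration (an assumption, not taken from the library source): the
# conversion above presumes ExclusiveSpan uses a half-open [start, end)
# convention, so to_inclusive() moves the end index back by one and a
# single-token span ends up with start == end -- the convention span extractors
# such as SelfAttentiveSpanExtractor expect.
_example_span = ExclusiveSpan(start=1, end=3)  # covers token indices 1 and 2
_inclusive = _example_span.to_inclusive()      # expected: start=1, end=2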
def featurize(self, tokenizer, feat_spec):
    if feat_spec.sep_token_extra:
        maybe_extra_sep = [tokenizer.sep_token]
        maybe_extra_sep_segment_id = [feat_spec.sequence_a_segment_id]
        special_tokens_count = 4  # CLS, SEP-SEP, SEP
    else:
        maybe_extra_sep = []
        maybe_extra_sep_segment_id = []
        special_tokens_count = 3  # CLS, SEP, SEP
    input_set_ls = []
    unpadded_inputs_ls = []
    for choice in self.choice_list:
        prompt, choice = truncate_sequences(
            tokens_ls=[self.prompt, choice],
            max_length=feat_spec.max_seq_length - special_tokens_count,
            truncate_end=False,
        )
        unpadded_inputs = add_cls_token(
            unpadded_tokens=(
                # prompt + sep(s)
                prompt + [tokenizer.sep_token] + maybe_extra_sep
                # choice + sep
                + choice + [tokenizer.sep_token]
            ),
            unpadded_segment_ids=(
                # prompt + sep(s)
                [feat_spec.sequence_a_segment_id] * (len(prompt) + 1)
                + maybe_extra_sep_segment_id
                # choice + sep
                + [feat_spec.sequence_b_segment_id] * (len(choice) + 1)
            ),
            tokenizer=tokenizer,
            feat_spec=feat_spec,
        )
        input_set = create_input_set_from_tokens_and_segments(
            unpadded_tokens=unpadded_inputs.unpadded_tokens,
            unpadded_segment_ids=unpadded_inputs.unpadded_segment_ids,
            tokenizer=tokenizer,
            feat_spec=feat_spec,
        )
        input_set_ls.append(input_set)
        unpadded_inputs_ls.append(unpadded_inputs)
    return DataRow(
        guid=self.guid,
        input_ids=np.stack([input_set.input_ids for input_set in input_set_ls]),
        input_mask=np.stack([input_set.input_mask for input_set in input_set_ls]),
        segment_ids=np.stack([input_set.segment_ids for input_set in input_set_ls]),
        label_id=self.label_id,
        tokens_list=[
            unpadded_inputs.unpadded_tokens for unpadded_inputs in unpadded_inputs_ls
        ],
    )
def featurize(self, tokenizer, feat_spec):
    if feat_spec.sep_token_extra:
        maybe_extra_sep = [tokenizer.sep_token]
        maybe_extra_sep_segment_id = [feat_spec.sequence_a_segment_id]
        special_tokens_count = 4  # CLS, SEP-SEP, SEP
    else:
        maybe_extra_sep = []
        maybe_extra_sep_segment_id = []
        special_tokens_count = 3  # CLS, SEP, SEP
    paragraph = truncate_sequences(
        tokens_ls=[self.paragraph],
        max_length=(
            feat_spec.max_seq_length
            - special_tokens_count
            - len(self.question)
            - len(self.answer)
        ),
    )[0]
    unpadded_inputs = add_cls_token(
        unpadded_tokens=(
            paragraph
            + self.question
            + [tokenizer.sep_token]
            + maybe_extra_sep
            + self.answer
            + [tokenizer.sep_token]
        ),
        unpadded_segment_ids=(
            [feat_spec.sequence_a_segment_id] * len(paragraph)
            + [feat_spec.sequence_a_segment_id] * (len(self.question) + 1)
            + maybe_extra_sep_segment_id
            + [feat_spec.sequence_b_segment_id] * (len(self.answer) + 1)
        ),
        tokenizer=tokenizer,
        feat_spec=feat_spec,
    )
    input_set = create_input_set_from_tokens_and_segments(
        unpadded_tokens=unpadded_inputs.unpadded_tokens,
        unpadded_segment_ids=unpadded_inputs.unpadded_segment_ids,
        tokenizer=tokenizer,
        feat_spec=feat_spec,
    )
    return DataRow(
        guid=self.guid,
        input_ids=np.array(input_set.input_ids),
        input_mask=np.array(input_set.input_mask),
        segment_ids=np.array(input_set.segment_ids),
        label_id=self.label_id,
        tokens=unpadded_inputs.unpadded_tokens,
        question_id=self.question_id,
    )
def construct_single_input_tokens_and_segment_ids(
    input_tokens: List[str], tokenizer, feat_spec: FeaturizationSpec
):
    special_tokens_count = 2  # CLS, SEP
    (input_tokens,) = truncate_sequences(
        tokens_ls=[input_tokens],
        max_length=feat_spec.max_seq_length - special_tokens_count,
    )
    return add_cls_token(
        unpadded_tokens=input_tokens + [tokenizer.sep_token],
        unpadded_segment_ids=(
            [feat_spec.sequence_a_segment_id] * (len(input_tokens) + 1)
        ),
        tokenizer=tokenizer,
        feat_spec=feat_spec,
    )
def featurize(self, tokenizer, feat_spec):
    # Handle masked_tokens
    unpadded_masked_inputs = construct_single_input_tokens_and_segment_ids(
        input_tokens=self.masked_tokens,
        tokenizer=tokenizer,
        feat_spec=feat_spec,
    )
    masked_input_set = create_input_set_from_tokens_and_segments(
        unpadded_tokens=unpadded_masked_inputs.unpadded_tokens,
        unpadded_segment_ids=unpadded_masked_inputs.unpadded_segment_ids,
        tokenizer=tokenizer,
        feat_spec=feat_spec,
    )

    # Handle label_tokens
    special_tokens_count = 2  # CLS, SEP
    pad_token = tokenizer.pad_token
    (unpadded_label_tokens,) = truncate_sequences(
        tokens_ls=[self.label_tokens],
        max_length=feat_spec.max_seq_length - special_tokens_count,
    )
    # Pad the label sequence so it stays position-aligned with the masked
    # input, which has CLS/SEP at these positions.
    if feat_spec.cls_token_at_end:
        unpadded_label_tokens = unpadded_label_tokens + [pad_token, pad_token]
    else:
        unpadded_label_tokens = [pad_token] + unpadded_label_tokens + [pad_token]
    unpadded_label_token_ids = tokenizer.convert_tokens_to_ids(unpadded_label_tokens)
    masked_lm_labels = pad_single_with_feat_spec(
        ls=unpadded_label_token_ids,
        feat_spec=feat_spec,
        pad_idx=feat_spec.pad_token_id,
    )
    masked_lm_labels = np.array(masked_lm_labels)
    masked_lm_labels[
        masked_lm_labels == feat_spec.pad_token_id
    ] = mlm_template.NON_MASKED_TOKEN_LABEL_ID
    return DataRow(
        guid=self.guid,
        masked_input_ids=np.array(masked_input_set.input_ids),
        input_mask=np.array(masked_input_set.input_mask),
        segment_ids=np.array(masked_input_set.segment_ids),
        masked_lm_labels=masked_lm_labels,
        masked_tokens=unpadded_masked_inputs.unpadded_tokens,
        label_tokens=unpadded_label_tokens,
    )
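# A minimal alignment sketch (illustrative values, not from the library): the
# label sequence receives pad tokens exactly where the masked input has its
# special tokens, so the two stay position-aligned. Those pad positions are
# then overwritten with NON_MASKED_TOKEN_LABEL_ID, presumably an ignore index
# (such as -100) that the MLM loss skips. Assuming cls_token_at_end=False and
# BERT-style special tokens:
#
#   masked input:  [CLS]   the   [MASK]   sat   [SEP]
#   label tokens:  <pad>   the   cat      sat   <pad>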
def featurize(self, tokenizer, feat_spec):
    special_tokens_count = 2  # CLS, SEP
    (tokens,) = truncate_sequences(
        tokens_ls=[self.tokens],
        max_length=feat_spec.max_seq_length - special_tokens_count,
    )
    unpadded_tokens = tokens + [tokenizer.sep_token]
    # Use the truncated token count (len(tokens), not len(self.tokens)) so the
    # segment ids stay aligned with the tokens when truncation occurs.
    unpadded_segment_ids = [feat_spec.sequence_a_segment_id] * (len(tokens) + 1)
    unpadded_inputs = add_cls_token(
        unpadded_tokens=unpadded_tokens,
        unpadded_segment_ids=unpadded_segment_ids,
        tokenizer=tokenizer,
        feat_spec=feat_spec,
    )
    input_set = create_input_set_from_tokens_and_segments(
        unpadded_tokens=unpadded_inputs.unpadded_tokens,
        unpadded_segment_ids=unpadded_inputs.unpadded_segment_ids,
        tokenizer=tokenizer,
        feat_spec=feat_spec,
    )
    span1_span = ExclusiveSpan(
        start=self.span1_span[0] + unpadded_inputs.cls_offset,
        end=self.span1_span[1] + unpadded_inputs.cls_offset,
    ).to_inclusive()
    span2_span = ExclusiveSpan(
        start=self.span2_span[0] + unpadded_inputs.cls_offset,
        end=self.span2_span[1] + unpadded_inputs.cls_offset,
    ).to_inclusive()
    return DataRow(
        guid=self.guid,
        input_ids=np.array(input_set.input_ids),
        input_mask=np.array(input_set.input_mask),
        segment_ids=np.array(input_set.segment_ids),
        spans=np.array([span1_span, span2_span]),
        label_id=self.label_id,
        tokens=unpadded_inputs.unpadded_tokens,
        span1_text=self.span1_text,
        span2_text=self.span2_text,
    )
def construct_double_input_tokens_and_segment_ids(
    input_tokens_a: List[str],
    input_tokens_b: List[str],
    tokenizer,
    feat_spec: FeaturizationSpec,
):
    """Create token and segment id sequences, apply truncation, add separator and class tokens.

    Args:
        input_tokens_a (List[str]): sequence of tokens in segment a.
        input_tokens_b (List[str]): sequence of tokens in segment b.
        tokenizer:
        feat_spec (FeaturizationSpec): Tokenization-related metadata.

    Returns:
        UnpaddedInputs: unpadded inputs with truncation applied and special tokens appended.
    """
    if feat_spec.sep_token_extra:
        maybe_extra_sep = [tokenizer.sep_token]
        maybe_extra_sep_segment_id = [feat_spec.sequence_a_segment_id]
        special_tokens_count = 4  # CLS, SEP-SEP, SEP
    else:
        maybe_extra_sep = []
        maybe_extra_sep_segment_id = []
        special_tokens_count = 3  # CLS, SEP, SEP
    input_tokens_a, input_tokens_b = truncate_sequences(
        tokens_ls=[input_tokens_a, input_tokens_b],
        max_length=feat_spec.max_seq_length - special_tokens_count,
    )
    unpadded_tokens = (
        input_tokens_a + [tokenizer.sep_token] + maybe_extra_sep
        + input_tokens_b + [tokenizer.sep_token]
    )
    unpadded_segment_ids = (
        [feat_spec.sequence_a_segment_id] * (len(input_tokens_a) + 1)
        + maybe_extra_sep_segment_id
        + [feat_spec.sequence_b_segment_id] * (len(input_tokens_b) + 1)
    )
    return add_cls_token(
        unpadded_tokens=unpadded_tokens,
        unpadded_segment_ids=unpadded_segment_ids,
        tokenizer=tokenizer,
        feat_spec=feat_spec,
    )
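# A sketch of the layouts the helper above produces (assuming a BERT-style
# "[SEP]" separator, sequence_a_segment_id=0, and sequence_b_segment_id=1;
# these concrete values are illustrative assumptions):
#
#   sep_token_extra=False:
#     tokens:      [CLS]  a1  a2  [SEP]  b1  b2  [SEP]
#     segment ids:   0    0   0     0    1   1     1
#
#   sep_token_extra=True (e.g. a RoBERTa-style double separator):
#     tokens:      [CLS]  a1  a2  [SEP]  [SEP]  b1  b2  [SEP]
#     segment ids:   0    0   0     0      0    1   1     1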
def featurize(self, tokenizer, feat_spec):
    if feat_spec.sep_token_extra:
        maybe_extra_sep = [tokenizer.sep_token]
        maybe_extra_sep_segment_id = [feat_spec.sequence_a_segment_id]
        special_tokens_count = 6  # CLS, SEP-SEP, SEP-SEP, SEP
    else:
        maybe_extra_sep = []
        maybe_extra_sep_segment_id = []
        special_tokens_count = 4  # CLS, SEP, SEP, SEP
    input_obs1_a, input_hyp1_a, input_obs2_a = truncate_sequences(
        tokens_ls=[self.input_obs1, self.input_hyp1, self.input_obs2],
        max_length=feat_spec.max_seq_length - special_tokens_count - 1,
    )
    input_obs1_b, input_hyp2_b, input_obs2_b = truncate_sequences(
        tokens_ls=[self.input_obs1, self.input_hyp2, self.input_obs2],
        max_length=feat_spec.max_seq_length - special_tokens_count - 1,
    )
    unpadded_inputs_1 = add_cls_token(
        unpadded_tokens=(
            input_obs1_a + [tokenizer.sep_token] + maybe_extra_sep
            + input_hyp1_a + [tokenizer.sep_token] + maybe_extra_sep
            + input_obs2_a + [tokenizer.sep_token]
        ),
        unpadded_segment_ids=(
            # obs1 + sep(s)
            [feat_spec.sequence_a_segment_id] * (len(input_obs1_a) + 1)
            + maybe_extra_sep_segment_id
            # hyp1 + sep(s)
            + [feat_spec.sequence_a_segment_id] * (len(input_hyp1_a) + 1)
            + maybe_extra_sep_segment_id
            # obs2 + sep
            + [feat_spec.sequence_b_segment_id] * (len(input_obs2_a) + 1)
        ),
        tokenizer=tokenizer,
        feat_spec=feat_spec,
    )
    unpadded_inputs_2 = add_cls_token(
        unpadded_tokens=(
            input_obs1_b + [tokenizer.sep_token] + maybe_extra_sep
            + input_hyp2_b + [tokenizer.sep_token] + maybe_extra_sep
            + input_obs2_b + [tokenizer.sep_token]
        ),
        unpadded_segment_ids=(
            # obs1 + sep(s)
            [feat_spec.sequence_a_segment_id] * (len(input_obs1_b) + 1)
            + maybe_extra_sep_segment_id
            # hyp2 + sep(s)
            + [feat_spec.sequence_a_segment_id] * (len(input_hyp2_b) + 1)
            + maybe_extra_sep_segment_id
            # obs2 + sep
            + [feat_spec.sequence_b_segment_id] * (len(input_obs2_b) + 1)
        ),
        tokenizer=tokenizer,
        feat_spec=feat_spec,
    )
    input_set1 = create_input_set_from_tokens_and_segments(
        unpadded_tokens=unpadded_inputs_1.unpadded_tokens,
        unpadded_segment_ids=unpadded_inputs_1.unpadded_segment_ids,
        tokenizer=tokenizer,
        feat_spec=feat_spec,
    )
    input_set2 = create_input_set_from_tokens_and_segments(
        unpadded_tokens=unpadded_inputs_2.unpadded_tokens,
        unpadded_segment_ids=unpadded_inputs_2.unpadded_segment_ids,
        tokenizer=tokenizer,
        feat_spec=feat_spec,
    )
    return DataRow(
        guid=self.guid,
        input_ids=np.stack([input_set1.input_ids, input_set2.input_ids]),
        input_mask=np.stack([input_set1.input_mask, input_set2.input_mask]),
        segment_ids=np.stack([input_set1.segment_ids, input_set2.segment_ids]),
        label_id=self.label_id,
        tokens1=unpadded_inputs_1.unpadded_tokens,
        tokens2=unpadded_inputs_2.unpadded_tokens,
    )
def test_truncate_single_sequence_default_trunc_end():
    seq = [["abc", "def", "ghi", "jkl", "mno", "pqr", "stu", "vwx", "yz"]]
    trunc_seq = truncate_sequences(seq, 8)
    assert trunc_seq == [["abc", "def", "ghi", "jkl", "mno", "pqr", "stu", "vwx"]]
def test_truncate_two_sequences_trunc_start():
    seqs = [["abc", "def", "ghi", "jkl"], ["mno", "pqr", "stu", "vwx", "yz"]]
    trunc_seqs = truncate_sequences(seqs, 8, False)
    assert trunc_seqs == [["abc", "def", "ghi", "jkl"], ["pqr", "stu", "vwx", "yz"]]
def test_truncate_more_than_two_sequences_default_trunc_end():
    seqs = [["abc", "def", "ghi"], ["jkl", "mno", "pqr"], ["stu", "vwx", "yz"]]
    trunc_seqs = truncate_sequences(seqs, 8)
    assert trunc_seqs == [["abc", "def"], ["jkl", "mno", "pqr"], ["stu", "vwx", "yz"]]
def test_truncate_empty_sequence():
    seq = []
    trunc_seq = truncate_sequences(seq, 10)
    assert not trunc_seq
def test_truncate_single_sequence_trunc_start():
    seq = [["abc", "def", "ghi", "jkl", "mno", "pqr", "stu", "vwx", "yz"]]
    trunc_seq = truncate_sequences(seq, 8, False)
    assert trunc_seq == [["def", "ghi", "jkl", "mno", "pqr", "stu", "vwx", "yz"]]
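# A minimal reference sketch of the truncation policy the tests above exercise.
# This is an assumption reconstructed from the observed behavior, not the
# library's actual implementation: trim one token at a time from the longest
# sequence until the combined length fits, dropping from the end by default or
# from the start when truncate_end=False.
def sketch_truncate_sequences(tokens_ls, max_length, truncate_end=True):
    tokens_ls = [list(tokens) for tokens in tokens_ls]
    while sum(len(tokens) for tokens in tokens_ls) > max_length:
        longest = max(tokens_ls, key=len)  # ties resolve to the earliest sequence
        if truncate_end:
            longest.pop()    # drop the last token
        else:
            longest.pop(0)   # drop the first token
    return tokens_ls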
def featurize(self, tokenizer, feat_spec):
    if feat_spec.sep_token_extra:
        maybe_extra_sep = [tokenizer.sep_token]
        maybe_extra_sep_segment_id = [feat_spec.sequence_a_segment_id]
        special_tokens_count = 6  # CLS, SEP-SEP, SEP-SEP, SEP
    else:
        maybe_extra_sep = []
        maybe_extra_sep_segment_id = []
        special_tokens_count = 4  # CLS, SEP, SEP, SEP
    sentence1_tokens, sentence2_tokens = truncate_sequences(
        tokens_ls=[self.sentence1_tokens, self.sentence2_tokens],
        max_length=feat_spec.max_seq_length - len(self.word) - special_tokens_count,
    )
    unpadded_tokens = (
        self.word + [tokenizer.sep_token] + maybe_extra_sep
        + sentence1_tokens + [tokenizer.sep_token] + maybe_extra_sep
        + sentence2_tokens + [tokenizer.sep_token]
    )
    # Don't have a choice here -- just leave words as part of sent1
    unpadded_segment_ids = (
        [feat_spec.sequence_a_segment_id] * (len(self.word) + 1)
        + maybe_extra_sep_segment_id
        + [feat_spec.sequence_a_segment_id] * (len(sentence1_tokens) + 1)
        + maybe_extra_sep_segment_id
        + [feat_spec.sequence_b_segment_id] * (len(sentence2_tokens) + 1)
    )
    unpadded_inputs = add_cls_token(
        unpadded_tokens=unpadded_tokens,
        unpadded_segment_ids=unpadded_segment_ids,
        tokenizer=tokenizer,
        feat_spec=feat_spec,
    )
    input_set = create_input_set_from_tokens_and_segments(
        unpadded_tokens=unpadded_inputs.unpadded_tokens,
        unpadded_segment_ids=unpadded_inputs.unpadded_segment_ids,
        tokenizer=tokenizer,
        feat_spec=feat_spec,
    )
    word_sep_offset = 2 if feat_spec.sep_token_extra else 1
    sent1_sep_offset = 2 if feat_spec.sep_token_extra else 1
    # Both should be inclusive spans at the end
    sentence1_span = ExclusiveSpan(
        start=self.sentence1_span[0] + unpadded_inputs.cls_offset
        + word_sep_offset + len(self.word),
        end=self.sentence1_span[1] + unpadded_inputs.cls_offset
        + word_sep_offset + len(self.word),
    ).to_inclusive()
    sentence2_span = ExclusiveSpan(
        start=self.sentence2_span[0] + unpadded_inputs.cls_offset
        + word_sep_offset + sent1_sep_offset
        + len(self.word) + len(sentence1_tokens),
        end=self.sentence2_span[1] + unpadded_inputs.cls_offset
        + word_sep_offset + sent1_sep_offset
        + len(self.word) + len(sentence1_tokens),
    ).to_inclusive()
    return DataRow(
        guid=self.guid,
        input_ids=np.array(input_set.input_ids),
        input_mask=np.array(input_set.input_mask),
        segment_ids=np.array(input_set.segment_ids),
        spans=np.array([sentence1_span, sentence2_span]),
        label_id=self.label_id,
        tokens=unpadded_inputs.unpadded_tokens,
        word=self.word,
    )
def featurize(self, tokenizer, feat_spec):
    if feat_spec.sep_token_extra:
        maybe_extra_sep = [tokenizer.sep_token]
        maybe_extra_sep_segment_id = [feat_spec.sequence_a_segment_id]
        special_tokens_count = 4  # CLS, SEP-SEP, SEP
    else:
        maybe_extra_sep = []
        maybe_extra_sep_segment_id = []
        special_tokens_count = 3  # CLS, SEP, SEP
    passage, question = truncate_sequences(
        tokens_ls=[self.passage, self.question],
        max_length=feat_spec.max_seq_length - special_tokens_count,
    )
    assert (
        len(passage) >= self.answer_token_span[1]
    ), f"Answer span {self.answer_token_span} truncated, please raise max_seq_length."
    unpadded_inputs = add_cls_token(
        unpadded_tokens=(
            passage + [tokenizer.sep_token] + maybe_extra_sep
            + question + [tokenizer.sep_token]
        ),
        unpadded_segment_ids=(
            [feat_spec.sequence_a_segment_id] * (len(passage) + 1)
            + maybe_extra_sep_segment_id
            + [feat_spec.sequence_b_segment_id] * (len(question) + 1)
        ),
        tokenizer=tokenizer,
        feat_spec=feat_spec,
    )
    gt_span_idxs = [idx + unpadded_inputs.cls_offset for idx in self.answer_token_span]
    input_set = create_input_set_from_tokens_and_segments(
        unpadded_tokens=unpadded_inputs.unpadded_tokens,
        unpadded_segment_ids=unpadded_inputs.unpadded_segment_ids,
        tokenizer=tokenizer,
        feat_spec=feat_spec,
    )
    pred_span_mask = pad_to_max_seq_length(
        ls=[0] * unpadded_inputs.cls_offset + [1] * len(passage),
        max_seq_length=feat_spec.max_seq_length,
        pad_idx=0,
        pad_right=not feat_spec.pad_on_left,
    )
    # When there are multiple greatest elements, argmax returns the index of
    # the first one. So (x > 0).argmax() returns the index of the first
    # non-zero element in an array; token_idx_to_char_idx_start is computed
    # this way to map each token index to the beginning char index of that
    # token. Conversely, x.cumsum().argmax() returns the index of the last
    # non-zero element, so token_idx_to_char_idx_end maps each token index to
    # its ending char index. Once the model predicts a span over token
    # indices, these two mappings project the span back to char indices, so
    # the predicted answer string can be sliced out of the input text.
    token_idx_to_char_idx_start = pad_to_max_seq_length(
        ls=[-1] * unpadded_inputs.cls_offset
        + (self.token_idx_to_char_idx_map > 0).argmax(axis=1).tolist()[: len(passage)],
        max_seq_length=feat_spec.max_seq_length,
        pad_idx=-1,
        pad_right=not feat_spec.pad_on_left,
    )
    token_idx_to_char_idx_end = pad_to_max_seq_length(
        ls=[-1] * unpadded_inputs.cls_offset
        + self.token_idx_to_char_idx_map.cumsum(axis=1).argmax(axis=1).tolist()[: len(passage)],
        max_seq_length=feat_spec.max_seq_length,
        pad_idx=-1,
        pad_right=not feat_spec.pad_on_left,
    )
    return DataRow(
        guid=self.guid,
        input_ids=np.array(input_set.input_ids),
        input_mask=np.array(input_set.input_mask),
        segment_ids=np.array(input_set.segment_ids),
        gt_span_str=self.answer_str,
        gt_span_idxs=np.array(gt_span_idxs),
        selection_str=self.passage_str,
        selection_token_mask=np.array(pred_span_mask),
        token_idx_to_char_idx_start=np.array(token_idx_to_char_idx_start),
        token_idx_to_char_idx_end=np.array(token_idx_to_char_idx_end),
    )
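# A worked example of the argmax tricks used above (illustrative only; the
# matrix below is made up). token_idx_to_char_idx_map is assumed to be a 0/1
# matrix of shape (num_tokens, num_chars), where row t marks the characters
# covered by token t.
import numpy as np

_token_to_char = np.array([
    [1, 1, 1, 0, 0, 0, 0],  # token 0 covers chars 0-2
    [0, 0, 0, 0, 1, 1, 1],  # token 1 covers chars 4-6
])
# argmax returns the first occurrence of the maximum:
assert ((_token_to_char > 0).argmax(axis=1) == [0, 4]).all()  # first char per token
assert (_token_to_char.cumsum(axis=1).argmax(axis=1) == [2, 6]).all()  # last char per token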