Example #1
    def featurize(self, tokenizer, feat_spec):
        special_tokens_count = 2  # CLS, SEP

        (tokens, ) = truncate_sequences(
            tokens_ls=[self.tokens],
            max_length=feat_spec.max_seq_length - special_tokens_count,
        )

        unpadded_tokens = tokens + [tokenizer.sep_token]
        unpadded_segment_ids = [feat_spec.sequence_a_segment_id] * (len(tokens) + 1)

        unpadded_inputs = add_cls_token(
            unpadded_tokens=unpadded_tokens,
            unpadded_segment_ids=unpadded_segment_ids,
            tokenizer=tokenizer,
            feat_spec=feat_spec,
        )

        input_set = create_input_set_from_tokens_and_segments(
            unpadded_tokens=unpadded_inputs.unpadded_tokens,
            unpadded_segment_ids=unpadded_inputs.unpadded_segment_ids,
            tokenizer=tokenizer,
            feat_spec=feat_spec,
        )

        # exclusive spans are converted to inclusive spans for use with SelfAttentiveSpanExtractor
        span1_span = ExclusiveSpan(
            start=self.span1_span[0] + unpadded_inputs.cls_offset,
            end=self.span1_span[1] + unpadded_inputs.cls_offset,
        ).to_inclusive()

        span2_span = ExclusiveSpan(
            start=self.span2_span[0] + unpadded_inputs.cls_offset,
            end=self.span2_span[1] + unpadded_inputs.cls_offset,
        ).to_inclusive()

        assert span1_span.end <= len(tokens), (
            "Span 1 extends beyond max_seq_len, consider raising max_seq_len"
        )
        assert span2_span.end <= len(tokens), (
            "Span 2 extends beyond max_seq_len, consider raising max_seq_len"
        )

        binary_label_ids = np.zeros((self.label_num, ), dtype=int)
        for label_id in self.label_ids:
            binary_label_ids[label_id] = 1

        return DataRow(
            guid=self.guid,
            input_ids=np.array(input_set.input_ids),
            input_mask=np.array(input_set.input_mask),
            segment_ids=np.array(input_set.segment_ids),
            spans=np.array([span1_span, span2_span]),
            label_ids=binary_label_ids,
            tokens=unpadded_inputs.unpadded_tokens,
            span1_text=self.span1_text,
            span2_text=self.span2_text,
        )
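A minimal standalone sketch of the exclusive-to-inclusive conversion used above (not the jiant implementation; ExclusiveSpanSketch and its end-exclusive convention are assumptions): shifting by cls_offset accounts for the prepended [CLS], and to_inclusive subtracts one from the end index.

from typing import NamedTuple

class ExclusiveSpanSketch(NamedTuple):   # hypothetical stand-in for ExclusiveSpan
    start: int
    end: int                             # end-exclusive

    def to_inclusive(self):
        return (self.start, self.end - 1)

cls_offset = 1                           # assumed: one prepended [CLS] token
raw_span = (2, 5)                        # span over the original (pre-[CLS]) tokens
shifted = ExclusiveSpanSketch(raw_span[0] + cls_offset, raw_span[1] + cls_offset)
print(shifted.to_inclusive())            # (3, 5), ready for SelfAttentiveSpanExtractor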
Example #2
    def featurize(self, tokenizer, feat_spec):

        if feat_spec.sep_token_extra:
            maybe_extra_sep = [tokenizer.sep_token]
            maybe_extra_sep_segment_id = [feat_spec.sequence_a_segment_id]
            special_tokens_count = 4  # CLS, SEP-SEP, SEP
        else:
            maybe_extra_sep = []
            maybe_extra_sep_segment_id = []
            special_tokens_count = 3  # CLS, SEP, SEP

        input_set_ls = []
        unpadded_inputs_ls = []
        for choice in self.choice_list:
            prompt, choice = truncate_sequences(
                tokens_ls=[self.prompt, choice],
                max_length=feat_spec.max_seq_length - special_tokens_count,
                truncate_end=False,
            )
            unpadded_inputs = add_cls_token(
                unpadded_tokens=(
                    # prompt
                    prompt + [tokenizer.sep_token] + maybe_extra_sep
                    # choice
                    + choice + [tokenizer.sep_token]),
                unpadded_segment_ids=(
                    # prompt
                    [feat_spec.sequence_a_segment_id] * (len(prompt) + 1) +
                    maybe_extra_sep_segment_id
                    # choice + sep
                    + [feat_spec.sequence_b_segment_id] * (len(choice) + 1)),
                tokenizer=tokenizer,
                feat_spec=feat_spec,
            )
            input_set = create_input_set_from_tokens_and_segments(
                unpadded_tokens=unpadded_inputs.unpadded_tokens,
                unpadded_segment_ids=unpadded_inputs.unpadded_segment_ids,
                tokenizer=tokenizer,
                feat_spec=feat_spec,
            )
            input_set_ls.append(input_set)
            unpadded_inputs_ls.append(unpadded_inputs)

        return DataRow(
            guid=self.guid,
            input_ids=np.stack(
                [input_set.input_ids for input_set in input_set_ls]),
            input_mask=np.stack(
                [input_set.input_mask for input_set in input_set_ls]),
            segment_ids=np.stack(
                [input_set.segment_ids for input_set in input_set_ls]),
            label_id=self.label_id,
            tokens_list=[
                unpadded_inputs.unpadded_tokens
                for unpadded_inputs in unpadded_inputs_ls
            ],
        )
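For reference, stacking the per-choice input sets as above yields arrays with a leading choice dimension; a minimal sketch with assumed sizes:

import numpy as np

# Each choice contributes one padded row, so the stacked input_ids / input_mask /
# segment_ids arrays have shape (num_choices, max_seq_length).
num_choices, max_seq_length = 4, 128     # assumed values
per_choice_input_ids = [np.zeros(max_seq_length, dtype=np.int64) for _ in range(num_choices)]
print(np.stack(per_choice_input_ids).shape)  # (4, 128)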
Example #3
    def featurize(self, tokenizer, feat_spec):

        if feat_spec.sep_token_extra:
            maybe_extra_sep = [tokenizer.sep_token]
            maybe_extra_sep_segment_id = [feat_spec.sequence_a_segment_id]
            special_tokens_count = 4
        else:
            maybe_extra_sep = []
            maybe_extra_sep_segment_id = []
            special_tokens_count = 3

        paragraph = truncate_sequences(
            tokens_ls=[self.paragraph],
            max_length=(
                feat_spec.max_seq_length
                - special_tokens_count
                - len(self.question)
                - len(self.answer)
            ),
        )[0]
        unpadded_inputs = add_cls_token(
            unpadded_tokens=(
                paragraph
                + self.question
                + [tokenizer.sep_token]
                + maybe_extra_sep
                + self.answer
                + [tokenizer.sep_token]
            ),
            unpadded_segment_ids=(
                [feat_spec.sequence_a_segment_id] * len(paragraph)
                + [feat_spec.sequence_a_segment_id] * (len(self.question) + 1)
                + maybe_extra_sep_segment_id
                + [feat_spec.sequence_b_segment_id] * (len(self.answer) + 1)
            ),
            tokenizer=tokenizer,
            feat_spec=feat_spec,
        )
        input_set = create_input_set_from_tokens_and_segments(
            unpadded_tokens=unpadded_inputs.unpadded_tokens,
            unpadded_segment_ids=unpadded_inputs.unpadded_segment_ids,
            tokenizer=tokenizer,
            feat_spec=feat_spec,
        )
        return DataRow(
            guid=self.guid,
            input_ids=np.array(input_set.input_ids),
            input_mask=np.array(input_set.input_mask),
            segment_ids=np.array(input_set.segment_ids),
            label_id=self.label_id,
            tokens=unpadded_inputs.unpadded_tokens,
            question_id=self.question_id,
        )
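A worked example of the truncation budget above, with assumed lengths: only the paragraph is truncated, so it gets whatever room is left after the question, the answer, and the special tokens.

max_seq_length = 128            # assumed feat_spec.max_seq_length
special_tokens_count = 3        # CLS, SEP, SEP (no extra SEP)
len_question, len_answer = 12, 4
paragraph_budget = max_seq_length - special_tokens_count - len_question - len_answer
print(paragraph_budget)         # 109 tokens available for the paragraph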
Example #4
    def featurize(self, tokenizer, feat_spec):
        unpadded_inputs = construct_single_input_tokens_and_segment_ids(
            input_tokens=self.text,
            tokenizer=tokenizer,
            feat_spec=feat_spec,
        )
        input_set = create_input_set_from_tokens_and_segments(
            unpadded_tokens=unpadded_inputs.unpadded_tokens,
            unpadded_segment_ids=unpadded_inputs.unpadded_segment_ids,
            tokenizer=tokenizer,
            feat_spec=feat_spec,
        )

        # Replicate padding / additional tokens for the label ids and mask
        if feat_spec.sep_token_extra:
            label_suffix = [None, None]
            mask_suffix = [0, 0]
            special_tokens_count = 3  # CLS, SEP-SEP
        else:
            label_suffix = [None]
            mask_suffix = [0]
            special_tokens_count = 2  # CLS, SEP
        unpadded_labels = (
            [None] +
            self.labels[:feat_spec.max_seq_length - special_tokens_count] +
            label_suffix)
        unpadded_labels = [i if i is not None else -1 for i in unpadded_labels]
        unpadded_label_mask = (
            [0] +
            self.label_mask[:feat_spec.max_seq_length - special_tokens_count] +
            mask_suffix)

        padded_labels = pad_single_with_feat_spec(
            ls=unpadded_labels,
            feat_spec=feat_spec,
            pad_idx=-1,
        )
        padded_label_mask = pad_single_with_feat_spec(
            ls=unpadded_label_mask,
            feat_spec=feat_spec,
            pad_idx=0,
        )

        return DataRow(
            guid=self.guid,
            input_ids=np.array(input_set.input_ids),
            input_mask=np.array(input_set.input_mask),
            segment_ids=np.array(input_set.segment_ids),
            label_ids=np.array(padded_labels),
            label_mask=np.array(padded_label_mask),
            tokens=unpadded_inputs.unpadded_tokens,
        )
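A toy illustration of the label alignment above, assuming integer labels and no extra SEP: a placeholder is prepended for [CLS] and appended for the trailing [SEP], then placeholders are mapped to -1 so those positions can be ignored downstream.

labels = [2, 0, 1]                       # assumed token-level label ids
label_suffix = [None]                    # one trailing [SEP] when sep_token_extra is False
unpadded_labels = [None] + labels + label_suffix
unpadded_labels = [i if i is not None else -1 for i in unpadded_labels]
print(unpadded_labels)                   # [-1, 2, 0, 1, -1]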
Example #5
 def featurize(self, tokenizer, feat_spec):
     # Handle masked_tokens
     unpadded_masked_inputs = construct_single_input_tokens_and_segment_ids(
         input_tokens=self.masked_tokens,
         tokenizer=tokenizer,
         feat_spec=feat_spec,
     )
     masked_input_set = create_input_set_from_tokens_and_segments(
         unpadded_tokens=unpadded_masked_inputs.unpadded_tokens,
         unpadded_segment_ids=unpadded_masked_inputs.unpadded_segment_ids,
         tokenizer=tokenizer,
         feat_spec=feat_spec,
     )
     # Handle label_tokens
     special_tokens_count = 2  # CLS, SEP
     pad_token = tokenizer.pad_token
     (unpadded_label_tokens, ) = truncate_sequences(
         tokens_ls=[self.label_tokens],
         max_length=feat_spec.max_seq_length - special_tokens_count,
     )
     if feat_spec.cls_token_at_end:
         unpadded_label_tokens = unpadded_label_tokens + [pad_token, pad_token]
     else:
         unpadded_label_tokens = [pad_token] + unpadded_label_tokens + [pad_token]
     unpadded_label_token_ids = tokenizer.convert_tokens_to_ids(
         unpadded_label_tokens)
     masked_lm_labels = pad_single_with_feat_spec(
         ls=unpadded_label_token_ids,
         feat_spec=feat_spec,
         pad_idx=feat_spec.pad_token_id,
     )
     masked_lm_labels = np.array(masked_lm_labels)
     masked_lm_labels[masked_lm_labels == feat_spec.pad_token_id] = (
         mlm_template.NON_MASKED_TOKEN_LABEL_ID
     )
     return DataRow(
         guid=self.guid,
         masked_input_ids=np.array(masked_input_set.input_ids),
         input_mask=np.array(masked_input_set.input_mask),
         segment_ids=np.array(masked_input_set.segment_ids),
         masked_lm_labels=masked_lm_labels,
         masked_tokens=unpadded_masked_inputs.unpadded_tokens,
         label_tokens=unpadded_label_tokens,
     )
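A toy view of the final masking step above: label positions holding the pad token id are overwritten with a sentinel so they do not contribute to the MLM loss. The sentinel -100 below is only an assumed stand-in for mlm_template.NON_MASKED_TOKEN_LABEL_ID.

import numpy as np

pad_token_id = 0                                            # assumed feat_spec.pad_token_id
masked_lm_labels = np.array([0, 1045, 2293, 0, 0])          # made-up label token ids
masked_lm_labels[masked_lm_labels == pad_token_id] = -100   # stand-in sentinel label
print(masked_lm_labels)                                     # [-100 1045 2293 -100 -100]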
Example #6
    def featurize(self, tokenizer, feat_spec):
        special_tokens_count = 2  # CLS, SEP

        (tokens, ) = truncate_sequences(
            tokens_ls=[self.tokens],
            max_length=feat_spec.max_seq_length - special_tokens_count,
        )

        unpadded_tokens = tokens + [tokenizer.sep_token]
        unpadded_segment_ids = [feat_spec.sequence_a_segment_id] * (len(tokens) + 1)

        unpadded_inputs = add_cls_token(
            unpadded_tokens=unpadded_tokens,
            unpadded_segment_ids=unpadded_segment_ids,
            tokenizer=tokenizer,
            feat_spec=feat_spec,
        )

        input_set = create_input_set_from_tokens_and_segments(
            unpadded_tokens=unpadded_inputs.unpadded_tokens,
            unpadded_segment_ids=unpadded_inputs.unpadded_segment_ids,
            tokenizer=tokenizer,
            feat_spec=feat_spec,
        )
        span1_span = ExclusiveSpan(
            start=self.span1_span[0] + unpadded_inputs.cls_offset,
            end=self.span1_span[1] + unpadded_inputs.cls_offset,
        ).to_inclusive()
        span2_span = ExclusiveSpan(
            start=self.span2_span[0] + unpadded_inputs.cls_offset,
            end=self.span2_span[1] + unpadded_inputs.cls_offset,
        ).to_inclusive()

        return DataRow(
            guid=self.guid,
            input_ids=np.array(input_set.input_ids),
            input_mask=np.array(input_set.input_mask),
            segment_ids=np.array(input_set.segment_ids),
            spans=np.array([span1_span, span2_span]),
            label_id=self.label_id,
            tokens=unpadded_inputs.unpadded_tokens,
            span1_text=self.span1_text,
            span2_text=self.span2_text,
        )
Example #7
 def featurize(self, tokenizer, feat_spec):
     unpadded_inputs = construct_single_input_tokens_and_segment_ids(
         input_tokens=self.text_tokens, tokenizer=tokenizer, feat_spec=feat_spec,
     )
     input_set = create_input_set_from_tokens_and_segments(
         unpadded_tokens=unpadded_inputs.unpadded_tokens,
         unpadded_segment_ids=unpadded_inputs.unpadded_segment_ids,
         tokenizer=tokenizer,
         feat_spec=feat_spec,
     )
     return DataRow(
         guid=self.guid,
         input_ids=np.array(input_set.input_ids),
         input_mask=np.array(input_set.input_mask),
         segment_ids=np.array(input_set.segment_ids),
         is_english=self.is_english,
         tokens=unpadded_inputs.unpadded_tokens,
     )
Example #8
File: mlm.py Project: zphang/jiant
 def featurize(self, tokenizer, feat_spec):
     unpadded_inputs = construct_single_input_tokens_and_segment_ids(
         input_tokens=self.input_tokens,
         tokenizer=tokenizer,
         feat_spec=feat_spec,
     )
     input_set = create_input_set_from_tokens_and_segments(
         unpadded_tokens=unpadded_inputs.unpadded_tokens,
         unpadded_segment_ids=unpadded_inputs.unpadded_segment_ids,
         tokenizer=tokenizer,
         feat_spec=feat_spec,
     )
     return DataRow(
         guid=self.guid,
         input_ids=np.array(input_set.input_ids),
         input_mask=np.array(input_set.input_mask),
         segment_ids=np.array(input_set.segment_ids),
         # Masking will be performed on the fly during training
         tokens=unpadded_inputs.unpadded_tokens,
     )
Example #9
 def featurize(self, tokenizer, feat_spec):
     # Uses label rather than label_id; otherwise double_sentence_featurize could be used
     unpadded_inputs = construct_double_input_tokens_and_segment_ids(
         input_tokens_a=self.text_a,
         input_tokens_b=self.text_b,
         tokenizer=tokenizer,
         feat_spec=feat_spec,
     )
     input_set = create_input_set_from_tokens_and_segments(
         unpadded_tokens=unpadded_inputs.unpadded_tokens,
         unpadded_segment_ids=unpadded_inputs.unpadded_segment_ids,
         tokenizer=tokenizer,
         feat_spec=feat_spec,
     )
     return DataRow(
         guid=self.guid,
         input_ids=np.array(input_set.input_ids),
         input_mask=np.array(input_set.input_mask),
         segment_ids=np.array(input_set.segment_ids),
         label=self.label,
         tokens=unpadded_inputs.unpadded_tokens,
     )
Example #10
    def featurize(self, tokenizer, feat_spec):
        if feat_spec.sep_token_extra:
            maybe_extra_sep = [tokenizer.sep_token]
            maybe_extra_sep_segment_id = [feat_spec.sequence_a_segment_id]
            special_tokens_count = 6  # CLS, SEP-SEP, SEP-SEP, SEP
        else:
            maybe_extra_sep = []
            maybe_extra_sep_segment_id = []
            special_tokens_count = 4  # CLS, SEP, SEP, SEP

        input_obs1_a, input_hyp1_a, input_obs2_a = truncate_sequences(
            tokens_ls=[self.input_obs1, self.input_hyp1, self.input_obs2],
            max_length=feat_spec.max_seq_length - special_tokens_count - 1,
            # -1 reserves one extra position
        )
        input_obs1_b, input_hyp2_b, input_obs2_b = truncate_sequences(
            tokens_ls=[self.input_obs1, self.input_hyp2, self.input_obs2],
            max_length=feat_spec.max_seq_length - special_tokens_count - 1,
            # -1 reserves one extra position
        )

        unpadded_inputs_1 = add_cls_token(
            unpadded_tokens=(input_obs1_a + [tokenizer.sep_token] +
                             maybe_extra_sep + input_hyp1_a +
                             [tokenizer.sep_token] + maybe_extra_sep +
                             input_obs2_a + [tokenizer.sep_token]),
            unpadded_segment_ids=(
                # obs1 + sep(s)
                [feat_spec.sequence_a_segment_id] * (len(input_obs1_a) + 1) +
                maybe_extra_sep_segment_id
                # hyp1 + sep(s)
                + [feat_spec.sequence_a_segment_id] * (len(input_hyp1_a) + 1) +
                maybe_extra_sep_segment_id
                # obs2 + sep
                + [feat_spec.sequence_b_segment_id] * (len(input_obs2_a) + 1)),
            tokenizer=tokenizer,
            feat_spec=feat_spec,
        )

        unpadded_inputs_2 = add_cls_token(
            unpadded_tokens=(input_obs1_b + [tokenizer.sep_token] +
                             maybe_extra_sep + input_hyp2_b +
                             [tokenizer.sep_token] + maybe_extra_sep +
                             input_obs2_b + [tokenizer.sep_token]),
            unpadded_segment_ids=(
                # obs1 + sep(s)
                [feat_spec.sequence_a_segment_id] * (len(input_obs1_b) + 1) +
                maybe_extra_sep_segment_id
                # hyp2 + sep(s)
                + [feat_spec.sequence_a_segment_id] * (len(input_hyp2_b) + 1) +
                maybe_extra_sep_segment_id
                # obs2 + sep
                + [feat_spec.sequence_b_segment_id] * (len(input_obs2_b) + 1)),
            tokenizer=tokenizer,
            feat_spec=feat_spec,
        )

        input_set1 = create_input_set_from_tokens_and_segments(
            unpadded_tokens=unpadded_inputs_1.unpadded_tokens,
            unpadded_segment_ids=unpadded_inputs_1.unpadded_segment_ids,
            tokenizer=tokenizer,
            feat_spec=feat_spec,
        )
        input_set2 = create_input_set_from_tokens_and_segments(
            unpadded_tokens=unpadded_inputs_2.unpadded_tokens,
            unpadded_segment_ids=unpadded_inputs_2.unpadded_segment_ids,
            tokenizer=tokenizer,
            feat_spec=feat_spec,
        )
        return DataRow(
            guid=self.guid,
            input_ids=np.stack([input_set1.input_ids, input_set2.input_ids]),
            input_mask=np.stack([input_set1.input_mask,
                                 input_set2.input_mask]),
            segment_ids=np.stack(
                [input_set1.segment_ids, input_set2.segment_ids]),
            label_id=self.label_id,
            tokens1=unpadded_inputs_1.unpadded_tokens,
            tokens2=unpadded_inputs_2.unpadded_tokens,
        )
Example #11
    def featurize(self, tokenizer, feat_spec):
        if feat_spec.sep_token_extra:
            maybe_extra_sep = [tokenizer.sep_token]
            maybe_extra_sep_segment_id = [feat_spec.sequence_a_segment_id]
            special_tokens_count = 6  # CLS, SEP-SEP, SEP-SEP, SEP
        else:
            maybe_extra_sep = []
            maybe_extra_sep_segment_id = []
            special_tokens_count = 4  # CLS, SEP, SEP, SEP

        sentence1_tokens, sentence2_tokens = truncate_sequences(
            tokens_ls=[self.sentence1_tokens, self.sentence2_tokens],
            max_length=feat_spec.max_seq_length - len(self.word) -
            special_tokens_count,
        )

        unpadded_tokens = (self.word + [tokenizer.sep_token] +
                           maybe_extra_sep + sentence1_tokens +
                           [tokenizer.sep_token] + maybe_extra_sep +
                           sentence2_tokens + [tokenizer.sep_token])
        # No separate segment id is available for the word, so it shares sentence 1's segment
        unpadded_segment_ids = (
            [feat_spec.sequence_a_segment_id] * (len(self.word) + 1) +
            maybe_extra_sep_segment_id + [feat_spec.sequence_a_segment_id] *
            (len(sentence1_tokens) + 1) + maybe_extra_sep_segment_id +
            [feat_spec.sequence_b_segment_id] * (len(sentence2_tokens) + 1))

        unpadded_inputs = add_cls_token(
            unpadded_tokens=unpadded_tokens,
            unpadded_segment_ids=unpadded_segment_ids,
            tokenizer=tokenizer,
            feat_spec=feat_spec,
        )

        input_set = create_input_set_from_tokens_and_segments(
            unpadded_tokens=unpadded_inputs.unpadded_tokens,
            unpadded_segment_ids=unpadded_inputs.unpadded_segment_ids,
            tokenizer=tokenizer,
            feat_spec=feat_spec,
        )

        word_sep_offset = 2 if feat_spec.sep_token_extra else 1
        sent1_sep_offset = 2 if feat_spec.sep_token_extra else 1

        # Both should be inclusive spans at the end
        sentence1_span = ExclusiveSpan(
            start=self.sentence1_span[0] + unpadded_inputs.cls_offset +
            word_sep_offset + len(self.word),
            end=self.sentence1_span[1] + unpadded_inputs.cls_offset +
            word_sep_offset + len(self.word),
        ).to_inclusive()
        sentence2_span = ExclusiveSpan(
            start=self.sentence2_span[0] + unpadded_inputs.cls_offset +
            word_sep_offset + sent1_sep_offset + len(self.word) +
            len(sentence1_tokens),
            end=self.sentence2_span[1] + unpadded_inputs.cls_offset +
            word_sep_offset + sent1_sep_offset + len(self.word) +
            len(sentence1_tokens),
        ).to_inclusive()

        return DataRow(
            guid=self.guid,
            input_ids=np.array(input_set.input_ids),
            input_mask=np.array(input_set.input_mask),
            segment_ids=np.array(input_set.segment_ids),
            spans=np.array([sentence1_span, sentence2_span]),
            label_id=self.label_id,
            tokens=unpadded_inputs.unpadded_tokens,
            word=self.word,
        )
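A worked index check for the span offsets above, under assumed lengths, with cls_offset = 1 (a [CLS] prepended at the front) and sep_token_extra disabled (so both SEP offsets equal 1). The packed layout is [CLS] word SEP sentence1 SEP sentence2 SEP.

len_word, len_sentence1 = 2, 5                        # assumed lengths
cls_offset = 1
word_sep_offset = sent1_sep_offset = 1
first_sentence1_token = 0 + cls_offset + word_sep_offset + len_word
first_sentence2_token = (0 + cls_offset + word_sep_offset + sent1_sep_offset
                         + len_word + len_sentence1)
print(first_sentence1_token, first_sentence2_token)   # 4 10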
Example #12
    def featurize(self, tokenizer, feat_spec):

        if feat_spec.sep_token_extra:
            maybe_extra_sep = [tokenizer.sep_token]
            maybe_extra_sep_segment_id = [feat_spec.sequence_a_segment_id]
            special_tokens_count = 4  # CLS, SEP-SEP, SEP
        else:
            maybe_extra_sep = []
            maybe_extra_sep_segment_id = []
            special_tokens_count = 3  # CLS, SEP, SEP

        passage, question = truncate_sequences(
            tokens_ls=[self.passage, self.question],
            max_length=feat_spec.max_seq_length - special_tokens_count,
        )
        assert (
            len(passage) >= self.answer_token_span[1]
        ), f"Answer span {self.answer_token_span} truncated, please raise max_seq_length."
        unpadded_inputs = add_cls_token(
            unpadded_tokens=(
                passage + [tokenizer.sep_token] + maybe_extra_sep + question + [tokenizer.sep_token]
            ),
            unpadded_segment_ids=(
                [feat_spec.sequence_a_segment_id] * (len(passage) + 1)
                + maybe_extra_sep_segment_id
                + [feat_spec.sequence_b_segment_id] * (len(question) + 1)
            ),
            tokenizer=tokenizer,
            feat_spec=feat_spec,
        )
        gt_span_idxs = list(map(lambda x: x + unpadded_inputs.cls_offset, self.answer_token_span))
        input_set = create_input_set_from_tokens_and_segments(
            unpadded_tokens=unpadded_inputs.unpadded_tokens,
            unpadded_segment_ids=unpadded_inputs.unpadded_segment_ids,
            tokenizer=tokenizer,
            feat_spec=feat_spec,
        )
        pred_span_mask = pad_to_max_seq_length(
            ls=[0] * unpadded_inputs.cls_offset + [1] * len(passage),
            max_seq_length=feat_spec.max_seq_length,
            pad_idx=0,
            pad_right=not feat_spec.pad_on_left,
        )
        token_idx_to_char_idx_start = pad_to_max_seq_length(
            ls=[-1] * unpadded_inputs.cls_offset
            + (self.token_idx_to_char_idx_map > 0).argmax(axis=1).tolist()[: len(passage)],
            max_seq_length=feat_spec.max_seq_length,
            pad_idx=-1,
            pad_right=not feat_spec.pad_on_left,
        )
        token_idx_to_char_idx_end = pad_to_max_seq_length(
            ls=[-1] * unpadded_inputs.cls_offset
            + self.token_idx_to_char_idx_map.cumsum(axis=1).argmax(axis=1).tolist()[: len(passage)],
            max_seq_length=feat_spec.max_seq_length,
            pad_idx=-1,
            pad_right=not feat_spec.pad_on_left,
        )
        # When there are multiple greatest elements, argmax returns the index of the first one.
        # So (x > 0).argmax() returns the index of the first non-zero element in an array;
        # token_idx_to_char_idx_start is computed this way to map each token index to the
        # beginning char index of that token. Conversely, x.cumsum().argmax() returns the
        # index of the last non-zero element in an array; token_idx_to_char_idx_end is
        # computed this way to map each token index to the ending char index of that token.
        # Once the model predicts a span over token indices, these two mappings project the
        # span back to char indices and slice the predicted answer string out of the input text.

        return DataRow(
            guid=self.guid,
            input_ids=np.array(input_set.input_ids),
            input_mask=np.array(input_set.input_mask),
            segment_ids=np.array(input_set.segment_ids),
            gt_span_str=self.answer_str,
            gt_span_idxs=np.array(gt_span_idxs),
            selection_str=self.passage_str,
            selection_token_mask=np.array(pred_span_mask),
            token_idx_to_char_idx_start=np.array(token_idx_to_char_idx_start),
            token_idx_to_char_idx_end=np.array(token_idx_to_char_idx_end),
        )
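A small numpy demonstration of the mapping trick described in the comment above, using a made-up token_idx_to_char_idx_map in which row i is nonzero at the character positions covered by token i.

import numpy as np

token_idx_to_char_idx_map = np.array([
    [1, 1, 1, 0, 0, 0, 0],   # token 0 covers characters 0-2
    [0, 0, 0, 0, 1, 1, 0],   # token 1 covers characters 4-5
])
starts = (token_idx_to_char_idx_map > 0).argmax(axis=1)         # first nonzero per row
ends = token_idx_to_char_idx_map.cumsum(axis=1).argmax(axis=1)  # last nonzero per row
print(starts, ends)  # [0 4] [2 5]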