Example #1
    def featurize(self, tokenizer, feat_spec):
        special_tokens_count = 2  # CLS, SEP

        (tokens, ) = truncate_sequences(
            tokens_ls=[self.tokens],
            max_length=feat_spec.max_seq_length - special_tokens_count,
        )

        unpadded_tokens = tokens + [tokenizer.sep_token]
        unpadded_segment_ids = [feat_spec.sequence_a_segment_id] * (len(tokens) + 1)

        unpadded_inputs = add_cls_token(
            unpadded_tokens=unpadded_tokens,
            unpadded_segment_ids=unpadded_segment_ids,
            tokenizer=tokenizer,
            feat_spec=feat_spec,
        )

        input_set = create_input_set_from_tokens_and_segments(
            unpadded_tokens=unpadded_inputs.unpadded_tokens,
            unpadded_segment_ids=unpadded_inputs.unpadded_segment_ids,
            tokenizer=tokenizer,
            feat_spec=feat_spec,
        )

        # exclusive spans are converted to inclusive spans for use with SelfAttentiveSpanExtractor
        span1_span = ExclusiveSpan(
            start=self.span1_span[0] + unpadded_inputs.cls_offset,
            end=self.span1_span[1] + unpadded_inputs.cls_offset,
        ).to_inclusive()

        span2_span = ExclusiveSpan(
            start=self.span2_span[0] + unpadded_inputs.cls_offset,
            end=self.span2_span[1] + unpadded_inputs.cls_offset,
        ).to_inclusive()

        assert span1_span.end <= len(
            tokens
        ), "Span 1 spans beyond max_seq_len, consider raising max_seq_len"
        assert span2_span.end <= len(
            tokens
        ), "Span 2 spans beyond max_seq_len, consider raising max_seq_len"

        binary_label_ids = np.zeros((self.label_num, ), dtype=int)
        for label_id in self.label_ids:
            binary_label_ids[label_id] = 1

        return DataRow(
            guid=self.guid,
            input_ids=np.array(input_set.input_ids),
            input_mask=np.array(input_set.input_mask),
            segment_ids=np.array(input_set.segment_ids),
            spans=np.array([span1_span, span2_span]),
            label_ids=binary_label_ids,
            tokens=unpadded_inputs.unpadded_tokens,
            span1_text=self.span1_text,
            span2_text=self.span2_text,
        )
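
The comment in the example above converts exclusive spans to inclusive spans before handing them to SelfAttentiveSpanExtractor. A minimal, self-contained sketch of what that conversion amounts to, using a hypothetical Span stand-in rather than jiant's own ExclusiveSpan/to_inclusive (assumed semantics: an exclusive span covers positions [start, end), an inclusive one covers [start, end]):

from typing import NamedTuple

class Span(NamedTuple):              # hypothetical stand-in, not jiant's class
    start: int
    end: int

def to_inclusive(span: Span) -> Span:
    # Drop the one-past-the-end convention: the last covered position becomes end - 1.
    return Span(span.start, span.end - 1)

assert to_inclusive(Span(3, 7)) == Span(3, 6)   # covers positions 3, 4, 5, 6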
Example #2
def get_token_span(sentence, span: ExclusiveSpan, tokenizer):
    tokenized = tokenizer.tokenize(sentence)
    tokenized_start1 = tokenizer.tokenize(sentence[: span.start])
    tokenized_start2 = tokenizer.tokenize(sentence[: span.end])
    assert starts_with(tokenized, tokenized_start1)
    # assert starts_with(tokenized, tokenized_start2)  # <- fails because of "does" in "doesn't"
    word = sentence[span.to_slice()]
    assert word.lower().replace(" ", "") in delegate_flat_strip(
        tokenized_start2[len(tokenized_start1) :], tokenizer=tokenizer,
    )
    token_span = ExclusiveSpan(start=len(tokenized_start1), end=len(tokenized_start2))
    return tokenized, token_span
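
get_token_span above maps a character span onto the subword tokenization by tokenizing two prefixes of the sentence and counting tokens. A self-contained sketch of the same idea, using plain whitespace splitting in place of a subword tokenizer (the function and sentence below are illustrative only):

def char_span_to_token_span(sentence, start, end):
    # Tokens fully to the left of the span, and tokens up to the span's end.
    tokens_before = sentence[:start].split()
    tokens_through = sentence[:end].split()
    # The span occupies exactly the tokens present in the longer prefix but not the shorter one.
    return len(tokens_before), len(tokens_through)

tokens = "The quick brown fox".split()
span = char_span_to_token_span("The quick brown fox", 4, 15)   # chars of "quick brown"
assert tokens[span[0]:span[1]] == ["quick", "brown"]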
Example #3
File: wsc.py Project: leo-liuzy/jiant
    def featurize(self, tokenizer, feat_spec):
        special_tokens_count = 2  # CLS, SEP

        (tokens, ) = truncate_sequences(
            tokens_ls=[self.tokens],
            max_length=feat_spec.max_seq_length - special_tokens_count,
        )

        unpadded_tokens = tokens + [tokenizer.sep_token]
        unpadded_segment_ids = [feat_spec.sequence_a_segment_id] * (len(tokens) + 1)

        unpadded_inputs = add_cls_token(
            unpadded_tokens=unpadded_tokens,
            unpadded_segment_ids=unpadded_segment_ids,
            tokenizer=tokenizer,
            feat_spec=feat_spec,
        )

        input_set = create_input_set_from_tokens_and_segments(
            unpadded_tokens=unpadded_inputs.unpadded_tokens,
            unpadded_segment_ids=unpadded_inputs.unpadded_segment_ids,
            tokenizer=tokenizer,
            feat_spec=feat_spec,
        )
        span1_span = ExclusiveSpan(
            start=self.span1_span[0] + unpadded_inputs.cls_offset,
            end=self.span1_span[1] + unpadded_inputs.cls_offset,
        ).to_inclusive()
        span2_span = ExclusiveSpan(
            start=self.span2_span[0] + unpadded_inputs.cls_offset,
            end=self.span2_span[1] + unpadded_inputs.cls_offset,
        ).to_inclusive()

        return DataRow(
            guid=self.guid,
            input_ids=np.array(input_set.input_ids),
            input_mask=np.array(input_set.input_mask),
            segment_ids=np.array(input_set.segment_ids),
            spans=np.array([span1_span, span2_span]),
            label_id=self.label_id,
            tokens=unpadded_inputs.unpadded_tokens,
            span1_text=self.span1_text,
            span2_text=self.span2_text,
        )
Example #4
 def _create_examples(cls, path, set_type):
     for i, row in enumerate(py_io.read_jsonl(path)):
         yield Example(
             guid="%s-%s" % (set_type, i),
             tokenized_text=row["tokenized_text"],
             masked_spans=[
                 ExclusiveSpan(start, end)
                 for start, end in row["masked_spans"]
             ],
         )
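
The loop above destructures each entry of row["masked_spans"] into a (start, end) pair. An illustrative row shape under that assumption (the values are invented; only the field name and pair structure come from the code, and the import path is the one used in the test file at the bottom of this page):

from jiant.tasks.utils import ExclusiveSpan

row = {"masked_spans": [[2, 5], [9, 12]]}                  # hypothetical data
spans = [ExclusiveSpan(start, end) for start, end in row["masked_spans"]]
assert spans[0].start == 2 and spans[0].end == 5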
Example #5
 def _create_examples(cls, lines, set_type):
     examples = []
     for line in lines:
         span1 = ExclusiveSpan(int(line["start1"]), int(line["end1"]))
         span2 = ExclusiveSpan(int(line["start2"]), int(line["end2"]))
         # Note, the chosen word may be different (e.g. different tenses) in sent1 and sent2,
         #   hence we don't do an assert here.
         examples.append(
             Example(
                 guid="%s-%s" % (set_type, line["idx"]),
                 sentence1=line["sentence1"],
                 sentence2=line["sentence2"],
                 word=line["word"],
                 span1=span1,
                 span2=span2,
                 label=line["label"]
                 if set_type != "test" else cls.LABELS[-1],
             ))
     return examples
Example #6
File: wsc.py Project: leo-liuzy/jiant
def extract_char_span(full_text, span_text, space_index):
    space_tokens = full_text.split()
    extracted_span_text = space_tokens[space_index]
    assert extracted_span_text.lower() in full_text.lower()
    span_length = len(span_text)
    if space_index == 0:
        start = 0
    else:
        start = len(" ".join(space_tokens[:space_index])) + 1
    # exclusive span
    return ExclusiveSpan(start=start, end=start + span_length)
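
A hypothetical call to extract_char_span as defined above (the sentence is made up): the start offset is the length of the space-joined tokens before space_index plus one for the separating space, and the end adds len(span_text):

full_text = "Mark told Pete many lies about himself."
span = extract_char_span(full_text, span_text="Pete", space_index=2)
assert (span.start, span.end) == (10, 14)        # len("Mark told") + 1 == 10
assert full_text[span.start:span.end] == "Pete"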
Example #7
File: wsc.py Project: leo-liuzy/jiant
 def tokenize(self, tokenizer):
     space_tokenization = self.text.split()
     target_tokenization = tokenizer.tokenize(self.text)
     normed_space_tokenization, normed_target_tokenization = normalize_tokenizations(
         space_tokenization, target_tokenization, tokenizer)
     aligner = retokenize.TokenAligner(normed_space_tokenization,
                                       normed_target_tokenization)
     span1_token_count = len(self.span1_text.split())
     span2_token_count = len(self.span2_text.split())
     target_span1 = ExclusiveSpan(
         *aligner.project_token_span(self.span1_idx, self.span1_idx +
                                     span1_token_count))
     target_span2 = ExclusiveSpan(
         *aligner.project_token_span(self.span2_idx, self.span2_idx +
                                     span2_token_count))
     return TokenizedExample(
         guid=self.guid,
         tokens=target_tokenization,
         span1_span=target_span1,
         span2_span=target_span2,
         span1_text=self.span1_text,
         span2_text=self.span2_text,
         label_id=WSCTask.LABEL_TO_ID[self.label],
     )
Example #8
File: wic.py Project: zphang/jiant
        def tokenize_span(tokenizer, sentence: str, char_span: ExclusiveSpan):
            """Tokenizes sentence and projects char_span to token span.

            Args:
                tokenizer (transformers.PreTrainedTokenizer): Tokenizer used
                sentence (str): Sentence to be tokenized
                char_span (ExclusiveSpan): character indexed span for sentence

            Returns:
                sentence_target_tokenization (List[str]): tokenized sentence
                target_span (ExclusiveSpan): token span for sentence
            """
            span_start_idx = len(sentence[:char_span.start].split())
            # If the first word in a span starts with punctuation, the first word will
            # erroneously be split into two strings by .split().
            # ie: 'takeaway' -> ["'", "takeaway"]
            # For span alignment, we start the list index at the punctuation.
            if (span_start_idx != 0) and (sentence[:(char_span.start)][-1]
                                          in string.punctuation):
                span_start_idx = span_start_idx - 1
            span_text = sentence[char_span.start:char_span.end]

            sentence_space_tokenization = sentence.split()
            sentence_target_tokenization = tokenizer.tokenize(sentence)
            (
                sentence_normed_space_tokenization,
                sentence_normed_target_tokenization,
            ) = normalize_tokenizations(sentence_space_tokenization,
                                        sentence_target_tokenization,
                                        tokenizer)
            span_start_char = len(" ".join(
                sentence_normed_space_tokenization[:span_start_idx]))
            span_text_char = len(span_text)
            aligner = retokenize.TokenAligner(
                sentence_normed_space_tokenization,
                sentence_normed_target_tokenization)
            target_span = ExclusiveSpan(*aligner.project_char_to_token_span(
                span_start_char, span_start_char + span_text_char))
            return sentence_target_tokenization, target_span
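
The punctuation adjustment above can be seen with the train-2 sentence from the test file at the bottom of this page: cutting the string at the span's start character leaves the leading apostrophe of 'takeaway' stranded as its own whitespace token, so span_start_idx is decremented back to the space token that actually contains the span:

import string

sentence = "In England they call takeout food 'takeaway'."
char_start = 35                                           # span1 start from the test data
span_start_idx = len(sentence[:char_start].split())       # 7: the cut strands the "'" as a token
if span_start_idx != 0 and sentence[:char_start][-1] in string.punctuation:
    span_start_idx -= 1
assert sentence.split()[span_start_idx] == "'takeaway'."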
Example #9
    def featurize(self, tokenizer, feat_spec):
        if feat_spec.sep_token_extra:
            maybe_extra_sep = [tokenizer.sep_token]
            maybe_extra_sep_segment_id = [feat_spec.sequence_a_segment_id]
            special_tokens_count = 6  # CLS, SEP-SEP, SEP-SEP, SEP
        else:
            maybe_extra_sep = []
            maybe_extra_sep_segment_id = []
            special_tokens_count = 4  # CLS, SEP, SEP, SEP

        sentence1_tokens, sentence2_tokens = truncate_sequences(
            tokens_ls=[self.sentence1_tokens, self.sentence2_tokens],
            max_length=feat_spec.max_seq_length - len(self.word) -
            special_tokens_count,
        )

        unpadded_tokens = (self.word + [tokenizer.sep_token] +
                           maybe_extra_sep + sentence1_tokens +
                           [tokenizer.sep_token] + maybe_extra_sep +
                           sentence2_tokens + [tokenizer.sep_token])
        # Don't have a choice here -- just leave words as part of sent1
        unpadded_segment_ids = (
            [feat_spec.sequence_a_segment_id] * (len(self.word) + 1) +
            maybe_extra_sep_segment_id + [feat_spec.sequence_a_segment_id] *
            (len(sentence1_tokens) + 1) + maybe_extra_sep_segment_id +
            [feat_spec.sequence_b_segment_id] * (len(sentence2_tokens) + 1))

        unpadded_inputs = add_cls_token(
            unpadded_tokens=unpadded_tokens,
            unpadded_segment_ids=unpadded_segment_ids,
            tokenizer=tokenizer,
            feat_spec=feat_spec,
        )

        input_set = create_input_set_from_tokens_and_segments(
            unpadded_tokens=unpadded_inputs.unpadded_tokens,
            unpadded_segment_ids=unpadded_inputs.unpadded_segment_ids,
            tokenizer=tokenizer,
            feat_spec=feat_spec,
        )

        word_sep_offset = 2 if feat_spec.sep_token_extra else 1
        sent1_sep_offset = 2 if feat_spec.sep_token_extra else 1

        # Both should be inclusive spans at the end
        sentence1_span = ExclusiveSpan(
            start=self.sentence1_span[0] + unpadded_inputs.cls_offset +
            word_sep_offset + len(self.word),
            end=self.sentence1_span[1] + unpadded_inputs.cls_offset +
            word_sep_offset + len(self.word),
        ).to_inclusive()
        sentence2_span = ExclusiveSpan(
            start=self.sentence2_span[0] + unpadded_inputs.cls_offset +
            word_sep_offset + sent1_sep_offset + len(self.word) +
            len(sentence1_tokens),
            end=self.sentence2_span[1] + unpadded_inputs.cls_offset +
            word_sep_offset + sent1_sep_offset + len(self.word) +
            len(sentence1_tokens),
        ).to_inclusive()

        return DataRow(
            guid=self.guid,
            input_ids=np.array(input_set.input_ids),
            input_mask=np.array(input_set.input_mask),
            segment_ids=np.array(input_set.segment_ids),
            spans=np.array([sentence1_span, sentence2_span]),
            label_id=self.label_id,
            tokens=unpadded_inputs.unpadded_tokens,
            word=self.word,
        )
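
Worked offset arithmetic for the spans computed above, assuming a BERT-style setup where [CLS] is prepended (so cls_offset == 1) and sep_token_extra is off (so both *_sep_offset values are 1); the token counts below are invented for illustration:

cls_offset, word_sep_offset, sent1_sep_offset = 1, 1, 1
word_len, sent1_len = 1, 4       # e.g. ["approach"] and ["Approach", "a", "task", "."]
# Full sequence: [CLS] word [SEP] sentence1 (4 tokens) [SEP] sentence2 ... [SEP]
sent1_start = 0 + cls_offset + word_sep_offset + word_len                                  # token index 3
sent2_start = 0 + cls_offset + word_sep_offset + sent1_sep_offset + word_len + sent1_len   # token index 8
assert (sent1_start, sent2_start) == (3, 8)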
Example #10
File: test_wic.py Project: zphang/jiant
from collections import Counter

from jiant.tasks.utils import ExclusiveSpan
from jiant.tasks.lib.wic import Example, TokenizedExample
from jiant.utils.testing.tokenizer import SimpleSpaceTokenizer


EXAMPLES = [
    Example(
        guid="train-1",
        sentence1="Approach a task.",
        sentence2="To approach the city.",
        word="approach",
        span1=ExclusiveSpan(start=0, end=8),
        span2=ExclusiveSpan(start=3, end=11),
        label=False,
    ),
    Example(
        guid="train-2",
        sentence1="In England they call takeout food 'takeaway'.",
        sentence2="If you're hungry, there's a takeaway just around the corner.",
        word="takeaway",
        span1=ExclusiveSpan(start=35, end=43),
        span2=ExclusiveSpan(start=28, end=36),
        label=True,
    ),
]

TOKENIZED_EXAMPLES = [
    TokenizedExample(
        guid="train-1",