Example #1
    def text_to_instance(self,
                         premise: str,
                         hypothesis: str,
                         label: str = None,
                         tag=None) -> Instance:
        premise_tokens = self._tokenizer.tokenize(premise)
        hypothesis_tokens = self._tokenizer.tokenize(hypothesis)

        if self.max_seq_length is not None:
            self._truncate_seq_pair(premise_tokens, hypothesis_tokens)

        premise_ids = self._tokenizer.convert_tokens_to_ids(premise_tokens)
        hypothesis_ids = self._tokenizer.convert_tokens_to_ids(
            hypothesis_tokens)

        input_ids = self._tokenizer.add_special_tokens_sentences_pair(
            premise_ids, hypothesis_ids)
        token_type_ids = self.get_token_type_ids(input_ids)
        attention_mask = [1] * len(input_ids)

        # Add padding if max_seq_length is defined
        if self.max_seq_length is not None:
            padding = [0] * (self.max_seq_length - len(input_ids))
            input_ids += padding
            attention_mask += padding
            token_type_ids += padding

        metadata = {
            'premise': premise,
            'hypothesis': hypothesis,
            'premise_tokens': premise_tokens,
            'hypothesis_tokens': hypothesis_tokens,
            'label': label,
            'tag': tag
        }

        fields = {
            'input_ids': ArrayField(np.array(input_ids), dtype=np.int64),
            'token_type_ids': ArrayField(np.array(token_type_ids),
                                         dtype=np.int64),
            'attention_mask': ArrayField(np.array(attention_mask),
                                         dtype=np.int64),
            'metadata': MetadataField(metadata)
        }

        if label is not None:
            fields['label'] = ArrayField(np.array(self._label_dict[label]),
                                         dtype=np.int64)

        return Instance(fields)
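Example #1 calls a `_truncate_seq_pair` helper that is not shown. Below is a hedged sketch of what such a helper typically looks like (the usual BERT-style pair truncation that trims the longer sequence first); the standalone signature and the number of reserved special tokens are assumptions, not the example's actual code.

# Hedged sketch of a typical pair-truncation helper; not the exact helper
# used above. It trims the longer of the two token lists in place until
# the pair fits, reserving room for the [CLS]/[SEP] special tokens
# (assumed to be 3 slots here).
def truncate_seq_pair(tokens_a, tokens_b, max_seq_length):
    while len(tokens_a) + len(tokens_b) > max_seq_length - 3:
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()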
Example #2
    def text_to_instance(self,
                         features: np.ndarray,
                         premise: str,
                         hypothesis: str,
                         label: str = None) -> Instance:
        fields: Dict[str, Field] = {}
        fields['features'] = ArrayField(features)
        metadata = {
            'premise': premise,
            'hypothesis': hypothesis,
        }
        fields['metadata'] = MetadataField(metadata)
        if label:
            fields['label'] = LabelField(label)
        return Instance(fields)
Example #3
    def text_to_instance(self,
                         premise: str,
                         hypothesis: str,
                         label: str = None) -> Instance:
        fields: Dict[str, Field] = {}
        premise_tokens = [x.text for x in self._tokenizer.tokenize(premise)]
        hypothesis_tokens = [
            x.text for x in self._tokenizer.tokenize(hypothesis)
        ]
        # n-grams from the premise
        prem_trigrams = set(skipgrams(premise_tokens, 3, 1))
        prem_bigrams = set(skipgrams(premise_tokens, 2, 1))
        prem_unigrams = set(ngrams(premise_tokens, 1))

        # n-grams from the hypothesis
        hyp_trigrams = set(skipgrams(hypothesis_tokens, 3, 1))
        hyp_bigrams = set(skipgrams(hypothesis_tokens, 2, 1))
        hyp_unigrams = set(ngrams(hypothesis_tokens, 1))

        # overlap proportions
        if hyp_trigrams:
            tri_overlap = len(
                prem_trigrams.intersection(hyp_trigrams)) / len(hyp_trigrams)
        else:
            tri_overlap = 0.0
        if hyp_bigrams:
            bi_overlap = len(
                prem_bigrams.intersection(hyp_bigrams)) / len(hyp_bigrams)
        else:
            bi_overlap = 0.0
        if hyp_unigrams:
            uni_overlap = len(
                prem_unigrams.intersection(hyp_unigrams)) / len(hyp_unigrams)
        else:
            uni_overlap = 0.0

        fields['features'] = FeaturesField(
            [tri_overlap, bi_overlap, uni_overlap])
        metadata = {
            'premise': premise,
            'hypothesis': hypothesis,
            'premise_tokens': premise_tokens,
            'hypothesis_tokens': hypothesis_tokens
        }
        fields['metadata'] = MetadataField(metadata)
        if label:
            fields['label'] = LabelField(label)
        return Instance(fields)
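The overlap features above rely on `ngrams` and `skipgrams`, which are presumably the helpers from `nltk.util`; that origin is an assumption. A small standalone illustration of what they yield:

# Standalone illustration (assuming nltk is the source of these helpers):
# skipgrams(tokens, n, k) yields n-grams that may skip up to k tokens.
from nltk.util import ngrams, skipgrams

tokens = ['a', 'man', 'is', 'walking']
print(set(ngrams(tokens, 1)))        # unigrams: ('a',), ('man',), ...
print(set(skipgrams(tokens, 2, 1)))  # bigrams with up to one skipped token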
Example #4
    def text_to_instance(self,
                         premise: str,
                         hypothesis: str,
                         hypothesis_structure: str,
                         label: str = None) -> Instance:
        fields: Dict[str, Field] = {}
        premise_tokens = self._tokenizer.tokenize(premise)[-self._max_tokens:]
        hypothesis_tokens = self._tokenizer.tokenize(
            hypothesis)[-self._max_tokens:]

        fields['premise'] = TextField(premise_tokens, self._token_indexers)
        fields['hypothesis'] = TextField(hypothesis_tokens,
                                         self._token_indexers)
        metadata = {
            'premise': premise,
            'hypothesis': hypothesis,
            'premise_tokens': [token.text for token in premise_tokens],
            'hypothesis_tokens': [token.text for token in hypothesis_tokens]
        }
        fields['metadata'] = MetadataField(metadata)
        self._add_structure_to_fields(hypothesis_structure, fields)
        if label:
            fields['label'] = LabelField(label)
        return Instance(fields)
Example #5
    def make_reading_comprehension_instance(
            self,
            question_tokens: List[Token],
            passage_tokens: List[Token],
            token_indexers: Dict[str, TokenIndexer],
            passage_text: str,
            token_spans: List[Tuple[int, int]] = None,
            answer_texts: List[str] = None,
            additional_metadata: Dict[str, Any] = None) -> Instance:
        """
        Converts a question, a passage, and an optional answer (or answers) to an ``Instance`` for use
        in a reading comprehension model.

        Creates an ``Instance`` with at least these fields: ``question`` and ``passage``, both
        ``TextFields``; and ``metadata``, a ``MetadataField``.  Additionally, if ``token_spans``
        is given, the ``Instance`` has ``span_start`` and ``span_end`` fields, which are both
        ``IndexFields``.

        Parameters
        ----------
        question_tokens : ``List[Token]``
            An already-tokenized question.
        passage_tokens : ``List[Token]``
            An already-tokenized passage that contains the answer to the given question.
        token_indexers : ``Dict[str, TokenIndexer]``
            Determines how the question and passage ``TextFields`` will be converted into tensors that
            get input to a model.  See :class:`TokenIndexer`.
        passage_text : ``str``
            The original passage text.  We need this so that we can recover the actual span from the
            original passage that the model predicts as the answer to the question.  This is used in
            official evaluation scripts.
        token_spans : ``List[Tuple[int, int]]``, optional
            Indices into ``passage_tokens`` to use as the answer to the question for training.  This is
            a list because there might be several possible correct answer spans in the passage.
            Currently, we just use the first span in this list; the caller is assumed to have sorted
            the spans (e.g., with the most frequently annotated span first, since SQuAD has multiple
            annotations on the dev set).
        answer_texts : ``List[str]``, optional
            All valid answer strings for the given question.  In SQuAD, e.g., the training set has
            exactly one answer per question, but the dev and test sets have several.  TriviaQA has many
            possible answers, which are the aliases for the known correct entity.  This is put into the
            metadata for use with official evaluation scripts, but not used anywhere else.
        additional_metadata : ``Dict[str, Any]``, optional
            The constructed ``metadata`` field will by default contain ``original_passage``,
            ``token_offsets``, ``question_tokens``, ``passage_tokens``, and ``answer_texts`` keys.  If
            you want any other metadata to be associated with each instance, you can pass that in here.
            This dictionary will get added to the ``metadata`` dictionary we already construct.
        """
        additional_metadata = additional_metadata or {}
        fields: Dict[str, Field] = {}

        # This is separate so we can reference it later with a known type.
        passage_field = TextField(passage_tokens, token_indexers)
        fields['passage'] = passage_field
        fields['question'] = TextField(question_tokens, token_indexers)
        metadata = {
            'original_passage': passage_text,
            'question_tokens': [token.text for token in question_tokens],
            'passage_tokens': [token.text for token in passage_tokens],
        }

        if answer_texts:
            metadata['answer_texts'] = answer_texts

        if token_spans:
            metadata["token_spans"] = token_spans

            # assume spans are sorted by some criteria
            span_start = token_spans[0][0]
            span_end = token_spans[0][1] - 1
            assert (span_start <= span_end)
            if span_end > len(passage_tokens) - 1:
                return None

            fields['span_start'] = IndexField(span_start, passage_field)
            fields['span_end'] = IndexField(span_end, passage_field)

        metadata.update(additional_metadata)
        fields['metadata'] = MetadataField(metadata)

        return Instance(fields)
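For readers unfamiliar with the field types used here, a minimal self-contained sketch (assuming the AllenNLP data API, which the class names suggest) of how the span fields index into the passage `TextField`:

# Minimal sketch (assumes AllenNLP); shows how span_start/span_end are
# IndexFields pointing at token positions inside the passage TextField.
from allennlp.data.tokenizers import Token
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.fields import TextField, IndexField

passage_tokens = [Token(t) for t in 'The cat sat on the mat'.split()]
passage_field = TextField(passage_tokens, {'tokens': SingleIdTokenIndexer()})

# An answer covering tokens 1..2 ('cat sat') becomes two IndexFields
# anchored to the passage field, as in the method above.
span_start = IndexField(1, passage_field)
span_end = IndexField(2, passage_field)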
Example #6
    def text_to_instance(self,
                         content: str,
                         candidates: List[List[str]],
                         ground_truths: List[str] = None,
                         real_count: int = 1) -> Iterable[Instance]:
        splits = re.split(r'#idiom#', content)
        assert real_count + 1 == len(splits)
        assert real_count == len(candidates)
        split_tokens = [self.tokenizer.tokenize(item) for item in splits]
        for index, current_candidates in enumerate(candidates):
            before_part_tokens = [Token(token) for token in split_tokens[0]]
            for before_part in split_tokens[1:index + 1]:
                before_part_tokens += [Token('[UNK]')] + [
                    Token(token) for token in before_part
                ]
            after_part_tokens = [
                Token(token) for token in split_tokens[index + 1]
            ]
            for after_part in split_tokens[index + 2:]:
                after_part_tokens += [Token('[UNK]')] + [
                    Token(token) for token in after_part
                ]

            # Mark the blank position with a [MASK] token
            content_tokens = before_part_tokens + [Token('[MASK]')
                                                   ] + after_part_tokens

            # Keep at most max_seq_length tokens of context around the blank as the input
            half_length = self.max_seq_length // 2
            if len(before_part_tokens) < half_length:
                start = 0
                end = min(
                    len(before_part_tokens) + 1 + len(after_part_tokens),
                    self.max_seq_length - 2)
            elif len(after_part_tokens) < half_length:
                end = len(before_part_tokens) + 1 + len(after_part_tokens)
                start = max(0, end - (self.max_seq_length - 2))
            else:
                start = len(before_part_tokens) + 3 - half_length
                end = len(before_part_tokens) + 1 + half_length

            content_tokens = content_tokens[start:end]

            # The cloze passage content
            content_field = TextField(content_tokens,
                                      self.content_token_indexer)

            # Position of the blank
            blank_index = content_tokens.index(Token("[MASK]"))
            blank_index_field = IndexField(blank_index, content_field)

            # Candidate idioms
            candidate_tokens = [
                self.idiom_list.index(option) for option in current_candidates
            ]
            candidate_tokens = np.array(candidate_tokens)
            candidate_field = ArrayField(candidate_tokens, dtype=np.int64)

            fields = {
                "content": content_field,
                "blank_indices": blank_index_field,
                "candidates": candidate_field,
            }

            if ground_truths:
                label = current_candidates.index(ground_truths[index])
                label_field = LabelField(label, skip_indexing=True)
                fields["answer"] = label_field

                # Metadata
                meta = {
                    "content":
                    '[UNK]'.join(splits[:index + 1]) + "[MASK]" +
                    '[UNK]'.join(splits[index + 1:]),
                    "candidates":
                    current_candidates,
                    "answer":
                    ground_truths[index]
                }
            else:
                meta = {
                    "content":
                    '[UNK]'.join(splits[:index + 1]) + "[MASK]" +
                    '[UNK]'.join(splits[index + 1:]),
                    "candidates":
                    current_candidates,
                }
            fields["meta"] = MetadataField(meta)

            yield Instance(fields)
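A quick plain-Python illustration of the invariant asserted at the top of Example #6: splitting on the `#idiom#` placeholder produces one more piece than there are blanks (the sentence below is made up for illustration):

# Plain-Python illustration of the assertion real_count + 1 == len(splits).
import re

content = 'He handles everything #idiom#, and never acts #idiom#.'
splits = re.split(r'#idiom#', content)
real_count = 2  # two idiom blanks in the content
assert real_count + 1 == len(splits)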
Example #7
    def text_to_instance(self,
                         premise: str,
                         hypothesis: str,
                         label: str = None,
                         tag=None) -> Instance:
        ####################
        ##### Tokenization and truncation
        ####################
        premise_tokens = self._tokenizer.tokenize(premise.strip())
        hypothesis_tokens = self._tokenizer.tokenize(hypothesis.strip())
        premise_tokens, hypothesis_tokens = self._truncate_input(
            premise_tokens, hypothesis_tokens)

        ####################
        ##### Create ids for encoder inputs, decoder inputs and decoder targets
        ####################

        ## Create encoder inputs
        src = []
        src.append(
            self._tokenizer.add_special_tokens_single_sentence(
                self._tokenizer.convert_tokens_to_ids(
                    [self._tokenizer.entail_token] + premise_tokens)))
        src.append(
            self._tokenizer.add_special_tokens_single_sentence(
                self._tokenizer.convert_tokens_to_ids(
                    [self._tokenizer.neutral_token] + premise_tokens)))
        src.append(
            self._tokenizer.add_special_tokens_single_sentence(
                self._tokenizer.convert_tokens_to_ids(
                    [self._tokenizer.contradict_token] + premise_tokens)))
        assert len(src[0]) == len(src[1]) == len(src[2])
        src_length = len(src[0])

        ## Create decoder inputs and targets
        # Targets of the decoder: [<s> A B C D E </s>]
        target = self._tokenizer.add_special_tokens_single_sentence(
            self._tokenizer.convert_tokens_to_ids(hypothesis_tokens))
        # Inputs of the decoder:  [</s> <s> A B C D E]
        prev_output_tokens = [self._tokenizer.eos_token_id] + target[:-1]
        target_length = len(target)

        ####################
        ##### Padding of the input
        ####################
        # Pad the premise ids (the source)
        if self.max_premise_length:
            encoder_padding = [self._tokenizer.pad_token_id
                               ] * (self.max_premise_length - src_length)
            src = [s + encoder_padding for s in src]

        # Pad the hypothesis ids (the target)
        if self.max_hypothesis_length:
            decoder_padding = [self._tokenizer.pad_token_id
                               ] * (self.max_hypothesis_length - target_length)
            target += decoder_padding
            prev_output_tokens += decoder_padding

        # Replicate `prev_output_tokens` and `src_lengths` three times
        prev_output_tokens = [prev_output_tokens] * 3
        src_length = [src_length] * 3

        ####################
        ##### Create instance
        ####################
        metadata = {
            'premise': premise,
            'hypothesis': hypothesis,
            'premise_tokens': premise_tokens,
            'hypothesis_tokens': hypothesis_tokens,
            'label': label,
            'tag': tag
        }

        fields = {
            'src':
            ArrayField(np.array(src), dtype=np.int64),
            'src_lengths':
            ArrayField(np.array(src_length), dtype=np.int64),
            'prev_output_tokens':
            ArrayField(np.array(prev_output_tokens), dtype=np.int64),
            'target':
            ArrayField(np.array(target), dtype=np.int64),
            'target_lengths':
            ArrayField(np.array(target_length), dtype=np.int64),
            'metadata':
            MetadataField(metadata)
        }

        if label is not None:
            fields['label'] = ArrayField(np.array(self._label_dict[label]),
                                         dtype=np.int64)

        return Instance(fields)
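A tiny standalone illustration of the decoder-input construction used in Example #7: `prev_output_tokens` is the target rotated right by one position, starting from the end-of-sequence id (the token ids below are made up, not the tokenizer's real ids):

# Made-up token ids, purely to illustrate the shift performed above.
bos_token_id, eos_token_id = 0, 2
target = [bos_token_id, 11, 12, 13, eos_token_id]   # <s> A B C </s>
prev_output_tokens = [eos_token_id] + target[:-1]   # </s> <s> A B C
assert len(prev_output_tokens) == len(target)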