Example #1
    def _convert_examples_to_records(
            self, examples: List[InputExample]) -> List[dict]:
        """
        Converts all examples to records which the model needs.
        Args:
            examples(obj:`List[InputExample]`): All data examples returned by _read_file.
        Returns:
            records(:obj:`List[dict]`): All records which the model needs.
        """
        records = []
        for example in examples:
            if isinstance(self.tokenizer, PretrainedTokenizer):
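                # paddlenlp changed the tokenizer API after 2.0.0rc2: older
                # releases expose encode(), newer releases make the tokenizer
                # object callable directly.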
                if Version(paddlenlp.__version__) <= Version('2.0.0rc2'):
                    record = self.tokenizer.encode(
                        text=example.text_a,
                        text_pair=example.text_b,
                        max_seq_len=self.max_seq_len)
                else:
                    record = self.tokenizer(text=example.text_a,
                                            text_pair=example.text_b,
                                            max_seq_len=self.max_seq_len,
                                            pad_to_max_seq_len=True,
                                            return_length=True)
            elif isinstance(self.tokenizer, JiebaTokenizer):
                pad_token = self.tokenizer.vocab.pad_token

                ids = self.tokenizer.encode(sentence=example.text_a)
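                # seq_len is the number of real (non-padding) tokens kept
                # after truncation; downstream code can use it to mask pads.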
                seq_len = min(len(ids), self.max_seq_len)
                if len(ids) > self.max_seq_len:
                    ids = trunc_sequence(ids, self.max_seq_len)
                else:
                    pad_token_id = self.tokenizer.vocab.to_indices(pad_token)
                    ids = pad_sequence(ids, self.max_seq_len, pad_token_id)
                record = {'text': ids, 'seq_len': seq_len}
            else:
                raise RuntimeError(
                    "Unknown type of self.tokenizer: {}, it must be an instance of PretrainedTokenizer or JiebaTokenizer"
                    .format(type(self.tokenizer)))

            if not record:
                logger.info(
                    "The text %s has been dropped as it has no words in the vocab after tokenization."
                    % example.text_a)
                continue
            if example.label:
                record['label'] = self.label_map[example.label]
            records.append(record)
        return records
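
Note: the snippet calls trunc_sequence and pad_sequence, two helpers it
imports but does not define. A minimal sketch of the behavior the call
sites imply, assuming head truncation and right padding (the real
PaddleHub helpers may differ):

from typing import List

def trunc_sequence(ids: List[int], max_seq_len: int) -> List[int]:
    # Assumed behavior: keep only the first max_seq_len ids.
    return ids[:max_seq_len]

def pad_sequence(ids: List[int], max_seq_len: int,
                 pad_token_id: int) -> List[int]:
    # Assumed behavior: right-pad with pad_token_id up to max_seq_len.
    return ids + [pad_token_id] * (max_seq_len - len(ids))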
Example #2
    def _convert_examples_to_records(
            self, examples: List[InputExample]) -> List[dict]:
        """
        Returns a list[dict] including all the input information what the model need.
        Args:
            examples (list): the data examples, returned by _read_file.
        Returns:
            a list with all the examples record.
        """
        records = []
        for example in examples:
            tokens = example.text_a.split(self.split_char)
            labels = example.label.split(self.split_char)

            # convert tokens into record
            if isinstance(self.tokenizer, PretrainedTokenizer):
                pad_token = self.tokenizer.pad_token

                # Re-segment the split tokens with the subword tokenizer and
                # keep the label sequence aligned with the new tokens.
                tokens, labels = reseg_token_label(tokenizer=self.tokenizer,
                                                   tokens=tokens,
                                                   labels=labels)
                record = self.tokenizer.encode(text=tokens,
                                               max_seq_len=self.max_seq_len)
            elif isinstance(self.tokenizer, JiebaTokenizer):
                pad_token = self.tokenizer.vocab.pad_token

                ids = [
                    self.tokenizer.vocab.to_indices(token) for token in tokens
                ]
                seq_len = min(len(ids), self.max_seq_len)
                if len(ids) > self.max_seq_len:
                    ids = trunc_sequence(ids, self.max_seq_len)
                else:
                    pad_token_id = self.tokenizer.vocab.to_indices(pad_token)
                    ids = pad_sequence(ids, self.max_seq_len, pad_token_id)

                record = {'text': ids, 'seq_len': seq_len}
            else:
                raise RuntimeError(
                    "Unknown type of self.tokenizer: {}, it must be an instance of PretrainedTokenizer or JiebaTokenizer"
                    .format(type(self.tokenizer)))

            if not record:
                logger.info(
                    "The text %s has been dropped as it has no words in the vocab after tokenization."
                    % example.text_a)
                continue

            # convert labels into record
            if labels:
                record["label"] = []
                if isinstance(self.tokenizer, PretrainedTokenizer):
                    tokens_with_special_token = self.tokenizer.convert_ids_to_tokens(
                        record['input_ids'])
                elif isinstance(self.tokenizer, JiebaTokenizer):
                    tokens_with_special_token = [
                        self.tokenizer.vocab.to_tokens(id_)
                        for id_ in record['text']
                    ]
                else:
                    raise RuntimeError(
                        "Unknown type of self.tokenizer: {}, it must be an instance of PretrainedTokenizer or JiebaTokenizer"
                        .format(type(self.tokenizer)))

                # Walk the tokenizer output in order: a real token consumes
                # the next gold label, a padding token is ignored in the
                # loss, and any other special token (e.g. [CLS]/[SEP]) gets
                # the no-entity label.
                tokens_index = 0
                for token in tokens_with_special_token:
                    if tokens_index < len(
                            tokens) and token == tokens[tokens_index]:
                        record["label"].append(
                            self.label_list.index(labels[tokens_index]))
                        tokens_index += 1
                    elif token == pad_token:
                        record["label"].append(
                            self.ignore_label)  # label of padding token
                    else:
                        record["label"].append(
                            self.label_list.index(self.no_entity_label))
            records.append(record)
        return records
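
Note: Example #2 depends on reseg_token_label, which re-segments the
word-level tokens with the subword tokenizer while keeping the label
sequence aligned. Below is a self-contained sketch of one common alignment
scheme; align_labels and the toy sub-tokenizer are illustrative stand-ins,
not the PaddleHub implementation, and the real helper may label sub-tokens
differently:

def align_labels(words, labels, subtokenize, no_entity_label="O"):
    # One common scheme: the first sub-token of a word keeps the word's
    # label and the remaining sub-tokens fall back to the no-entity label,
    # so the token and label sequences stay the same length.
    out_tokens, out_labels = [], []
    for word, label in zip(words, labels):
        pieces = subtokenize(word) or [word]
        out_tokens.extend(pieces)
        out_labels.append(label)
        out_labels.extend([no_entity_label] * (len(pieces) - 1))
    return out_tokens, out_labels

# Toy sub-tokenizer that splits a word into two-character pieces.
tokens, labels = align_labels(
    ["PaddleNLP", "rocks"], ["B-ORG", "O"],
    subtokenize=lambda w: [w[i:i + 2] for i in range(0, len(w), 2)])
print(tokens)  # ['Pa', 'dd', 'le', 'NL', 'P', 'ro', 'ck', 's']
print(labels)  # ['B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O']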