Example #1
    def _convert_text_to_input(self, tokenizer, text: List[str],
                               max_seq_len: int, split_char: str):
        pad_to_max_seq_len = self.task is not None
        if self.task == 'token-cls':  # Extra preprocessing for the token-cls task
            tokens = text[0].split(split_char)
            text[0], _ = reseg_token_label(tokenizer=tokenizer, tokens=tokens)
            is_split_into_words = True
        else:
            is_split_into_words = False

        if len(text) == 1:
            # Older paddlenlp (<= 2.0.0rc2) uses tokenizer.encode(); newer
            # versions support calling the tokenizer directly.
            if Version(paddlenlp.__version__) <= Version('2.0.0rc2'):
                encoded_inputs = tokenizer.encode(
                    text[0],
                    text_pair=None,
                    max_seq_len=max_seq_len,
                    pad_to_max_seq_len=pad_to_max_seq_len)
            else:
                encoded_inputs = tokenizer(
                    text=text[0],
                    max_seq_len=max_seq_len,
                    pad_to_max_seq_len=True,
                    is_split_into_words=is_split_into_words,
                    return_length=True)
        elif len(text) == 2:
            if Version(paddlenlp.__version__) <= Version('2.0.0rc2'):
                encoded_inputs = tokenizer.encode(
                    text[0],
                    text_pair=text[1],
                    max_seq_len=max_seq_len,
                    pad_to_max_seq_len=pad_to_max_seq_len)
            else:
                encoded_inputs = tokenizer(
                    text=text[0],
                    text_pair=text[1],
                    max_seq_len=max_seq_len,
                    pad_to_max_seq_len=True,
                    is_split_into_words=is_split_into_words,
                    return_length=True)
        else:
            raise RuntimeError(
                'The input text must have one or two sequences, but got %d. Please check your inputs.'
                % len(text))
        return encoded_inputs
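
The method above branches on the installed paddlenlp version. Below is a minimal sketch of that dispatch outside the class, assuming paddlenlp with an ERNIE tokenizer is installed, that Version comes from packaging.version (the original module's import is not shown), and that the keyword names match the paddlenlp ~2.0.x API used in the snippet; later releases may rename them (e.g. max_length).

from packaging.version import Version

import paddlenlp
from paddlenlp.transformers import ErnieTokenizer

# 'ernie-1.0' is an illustrative pretrained name, not taken from the snippet.
tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0')
text = ['这家餐厅的服务很不错']
max_seq_len = 128

if Version(paddlenlp.__version__) <= Version('2.0.0rc2'):
    # Older paddlenlp: encode() takes max_seq_len / pad_to_max_seq_len directly.
    encoded_inputs = tokenizer.encode(
        text[0], text_pair=None, max_seq_len=max_seq_len, pad_to_max_seq_len=True)
else:
    # Newer paddlenlp: the tokenizer is callable and can also return the length.
    encoded_inputs = tokenizer(
        text=text[0], max_seq_len=max_seq_len, pad_to_max_seq_len=True,
        is_split_into_words=False, return_length=True)

print(sorted(encoded_inputs.keys()))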
Example #2
 def _convert_examples_to_records(
         self, examples: List[InputExample]) -> List[dict]:
     """
     Returns a list[dict] containing all the input information the model needs.
     Args:
         examples (list): the data examples, returned by _read_file.
     Returns:
         a list of records, one for each example.
     """
     records = []
     for example in examples:
         tokens, labels = reseg_token_label(
             tokenizer=self.tokenizer,
             tokens=example.text_a.split(self.split_char),
             labels=example.label.split(self.split_char))
         record = self.tokenizer.encode(text=tokens,
                                        max_seq_len=self.max_seq_len)
         # CustomTokenizer tokenizes the text first and then looks up each word in the vocab.
         # If none of the words are found in the vocab, the text is dropped.
         if not record:
             logger.info(
                 "The text %s has been dropped as it has no words in the vocab after tokenization."
                 % example.text_a)
             continue
         if labels:
             record["label"] = []
             tokens_with_special_tokens = self.tokenizer.convert_ids_to_tokens(
                 record['input_ids'])
             tokens_index = 0
             for token in tokens_with_special_tokens:
                 if tokens_index < len(
                         tokens) and token == tokens[tokens_index]:
                     record["label"].append(
                         self.label_list.index(labels[tokens_index]))
                     tokens_index += 1
                 elif token in [self.tokenizer.pad_token]:
                     record["label"].append(
                         self.ignore_label)  # label of special token
                 else:
                     record["label"].append(
                         self.label_list.index(self.no_entity_label))
         records.append(record)
     return records
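
Both snippets so far call reseg_token_label to keep tokens and labels aligned after wordpiece re-tokenization, but the helper itself is not shown. Below is a minimal sketch reconstructed from its call sites, assuming the common convention of repeating a label over a token's sub-tokens (turning a leading B- tag into I- for continuations); the actual PaddleHub helper may differ.

from typing import List, Optional, Tuple


def reseg_token_label_sketch(
        tokenizer,
        tokens: List[str],
        labels: Optional[List[str]] = None
) -> Tuple[List[str], Optional[List[str]]]:
    # Illustrative stand-in for reseg_token_label; not the PaddleHub implementation.
    if labels is not None:
        assert len(tokens) == len(labels)
    ret_tokens: List[str] = []
    ret_labels: Optional[List[str]] = [] if labels is not None else None
    for i, token in enumerate(tokens):
        sub_tokens = tokenizer.tokenize(token)
        if not sub_tokens:
            continue
        ret_tokens.extend(sub_tokens)
        if labels is None:
            continue
        label = labels[i]
        ret_labels.append(label)
        # Continuation sub-tokens of a B-XXX span get the matching I-XXX tag.
        cont = 'I-' + label[2:] if label.startswith('B-') else label
        ret_labels.extend([cont] * (len(sub_tokens) - 1))
    return ret_tokens, ret_labels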
Example #3
    def _convert_text_to_input(self, tokenizer, text: List[str],
                               max_seq_len: int, split_char: str):
        pad_to_max_seq_len = self.task is not None
        if self.task == 'token-cls':  # Extra preprocessing for the token-cls task
            tokens = text[0].split(split_char)
            text[0], _ = reseg_token_label(tokenizer=tokenizer, tokens=tokens)

        if len(text) == 1:
            encoded_inputs = tokenizer.encode(
                text[0],
                text_pair=None,
                max_seq_len=max_seq_len,
                pad_to_max_seq_len=pad_to_max_seq_len)
        elif len(text) == 2:
            encoded_inputs = tokenizer.encode(
                text[0],
                text_pair=text[1],
                max_seq_len=max_seq_len,
                pad_to_max_seq_len=pad_to_max_seq_len)
        else:
            raise RuntimeError(
                'The input text must have one or two sequences, but got %d. Please check your inputs.'
                % len(text))
        return encoded_inputs
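
A usage sketch for the method above. The owning class is not shown, so the ernie_tiny module, its get_tokenizer() call, and the texts below are assumptions about how the method is typically driven rather than part of the snippet.

import paddlehub as hub

# Hypothetical setup: assumes a paddlehub 2.x transformer module that owns
# _convert_text_to_input and exposes get_tokenizer(); texts are illustrative.
module = hub.Module(name='ernie_tiny', task=None)
tokenizer = module.get_tokenizer()

single_text = ['这部电影拍得非常好']                      # exercises the len(text) == 1 branch
text_pair = ['这部电影拍得非常好', '剧情紧凑，值得一看']    # exercises the len(text) == 2 branch

for text in (single_text, text_pair):
    encoded_inputs = module._convert_text_to_input(
        tokenizer=tokenizer,
        text=list(text),   # pass a copy: the token-cls branch mutates text[0]
        max_seq_len=128,
        split_char=' ')
    print(len(encoded_inputs['input_ids']))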
Example #4
    def _convert_examples_to_records(
            self, examples: List[InputExample]) -> List[dict]:
        """
        Returns a list[dict] containing all the input information the model needs.
        Args:
            examples (list): the data examples, returned by _read_file.
        Returns:
            a list of records, one for each example.
        """
        records = []
        for example in examples:
            tokens = example.text_a.split(self.split_char)
            labels = example.label.split(self.split_char)

            # convert tokens into record
            if isinstance(self.tokenizer, PretrainedTokenizer):
                pad_token = self.tokenizer.pad_token

                tokens, labels = reseg_token_label(tokenizer=self.tokenizer,
                                                   tokens=tokens,
                                                   labels=labels)
                record = self.tokenizer.encode(text=tokens,
                                               max_seq_len=self.max_seq_len)
            elif isinstance(self.tokenizer, JiebaTokenizer):
                pad_token = self.tokenizer.vocab.pad_token

                ids = [
                    self.tokenizer.vocab.to_indices(token) for token in tokens
                ]
                seq_len = min(len(ids), self.max_seq_len)
                if len(ids) > self.max_seq_len:
                    ids = trunc_sequence(ids, self.max_seq_len)
                else:
                    pad_token_id = self.tokenizer.vocab.to_indices(pad_token)
                    ids = pad_sequence(ids, self.max_seq_len, pad_token_id)

                record = {'text': ids, 'seq_len': seq_len}
            else:
                raise RuntimeError(
                    "Unknown type of self.tokenizer: {}, it must be an instance of  PretrainedTokenizer or JiebaTokenizer"
                    .format(type(self.tokenizer)))

            if not record:
                logger.info(
                    "The text %s has been dropped as it has no words in the vocab after tokenization."
                    % example.text_a)
                continue

            # convert labels into record
            if labels:
                record["label"] = []
                if isinstance(self.tokenizer, PretrainedTokenizer):
                    tokens_with_special_tokens = self.tokenizer.convert_ids_to_tokens(
                        record['input_ids'])
                elif isinstance(self.tokenizer, JiebaTokenizer):
                    tokens_with_special_tokens = [
                        self.tokenizer.vocab.to_tokens(id_)
                        for id_ in record['text']
                    ]
                else:
                    raise RuntimeError(
                        "Unknown type of self.tokenizer: {}, it must be an instance of  PretrainedTokenizer or JiebaTokenizer"
                        .format(type(self.tokenizer)))

                tokens_index = 0
                for token in tokens_with_special_tokens:
                    if tokens_index < len(
                            tokens) and token == tokens[tokens_index]:
                        record["label"].append(
                            self.label_list.index(labels[tokens_index]))
                        tokens_index += 1
                    elif token in [pad_token]:
                        record["label"].append(
                            self.ignore_label)  # label of special token
                    else:
                        record["label"].append(
                            self.label_list.index(self.no_entity_label))
            records.append(record)
        return records
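
The JiebaTokenizer branch relies on two helpers, trunc_sequence and pad_sequence, that are not shown. The following are minimal sketches reconstructed from how they are used above (truncate to max_seq_len, or right-pad with the pad token id); the actual PaddleHub utilities may differ.

from typing import List


def trunc_sequence_sketch(ids: List[int], max_seq_len: int) -> List[int]:
    # Illustrative stand-in for trunc_sequence: keep the first max_seq_len ids.
    return ids[:max_seq_len]


def pad_sequence_sketch(ids: List[int], max_seq_len: int,
                        pad_token_id: int) -> List[int]:
    # Illustrative stand-in for pad_sequence: right-pad up to max_seq_len.
    return ids + [pad_token_id] * (max_seq_len - len(ids))


# Tiny self-check with toy ids (pad id 0 chosen arbitrarily).
assert trunc_sequence_sketch([1, 2, 3, 4], 3) == [1, 2, 3]
assert pad_sequence_sketch([1, 2], 4, 0) == [1, 2, 0, 0]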