def _convert_examples_to_records(
        self, examples: List[InputExample]) -> List[dict]:
    """
    Converts all examples to records which the model needs.

    Args:
        examples (obj:`List[InputExample]`): All data examples returned by _read_file.

    Returns:
        records (obj:`List[dict]`): All records which the model needs.
    """
    records = []
    for example in examples:
        if isinstance(self.tokenizer, PretrainedTokenizer):
            if Version(paddlenlp.__version__) <= Version('2.0.0rc2'):
                record = self.tokenizer.encode(
                    text=example.text_a,
                    text_pair=example.text_b,
                    max_seq_len=self.max_seq_len)
            else:
                record = self.tokenizer(
                    text=example.text_a,
                    text_pair=example.text_b,
                    max_seq_len=self.max_seq_len,
                    pad_to_max_seq_len=True,
                    return_length=True)
        elif isinstance(self.tokenizer, JiebaTokenizer):
            pad_token = self.tokenizer.vocab.pad_token

            ids = self.tokenizer.encode(sentence=example.text_a)
            seq_len = min(len(ids), self.max_seq_len)
            if len(ids) > self.max_seq_len:
                ids = trunc_sequence(ids, self.max_seq_len)
            else:
                pad_token_id = self.tokenizer.vocab.to_indices(pad_token)
                ids = pad_sequence(ids, self.max_seq_len, pad_token_id)

            record = {'text': ids, 'seq_len': seq_len}
        else:
            raise RuntimeError(
                "Unknown type of self.tokenizer: {}, it must be an instance of "
                "PretrainedTokenizer or JiebaTokenizer".format(
                    type(self.tokenizer)))

        if not record:
            logger.info(
                "The text %s has been dropped as it has no words in the vocab after tokenization."
                % example.text_a)
            continue
        if example.label:
            record['label'] = self.label_map[example.label]
        records.append(record)
    return records
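
# Illustrative sketch: trunc_sequence / pad_sequence are used above to force every
# JiebaTokenizer id sequence to exactly max_seq_len. Minimal stand-ins with that
# assumed behaviour could look like the following; the signatures mirror the call
# sites above, not necessarily the real helpers imported elsewhere in the module.


def trunc_sequence(ids, max_seq_len):
    # Keep only the first max_seq_len ids.
    return ids[:max_seq_len]


def pad_sequence(ids, max_seq_len, pad_token_id):
    # Right-pad with pad_token_id until the sequence reaches max_seq_len.
    return ids + [pad_token_id] * (max_seq_len - len(ids))
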
def _convert_examples_to_records(
        self, examples: List[InputExample]) -> List[dict]:
    """
    Converts all examples to records which the model needs.

    Args:
        examples (obj:`List[InputExample]`): All data examples returned by _read_file.

    Returns:
        records (obj:`List[dict]`): All records which the model needs.
    """
    records = []
    for example in examples:
        tokens = example.text_a.split(self.split_char)
        labels = example.label.split(self.split_char)

        # convert tokens into record
        if isinstance(self.tokenizer, PretrainedTokenizer):
            pad_token = self.tokenizer.pad_token

            tokens, labels = reseg_token_label(
                tokenizer=self.tokenizer, tokens=tokens, labels=labels)
            record = self.tokenizer.encode(
                text=tokens, max_seq_len=self.max_seq_len)
        elif isinstance(self.tokenizer, JiebaTokenizer):
            pad_token = self.tokenizer.vocab.pad_token

            ids = [self.tokenizer.vocab.to_indices(token) for token in tokens]
            seq_len = min(len(ids), self.max_seq_len)
            if len(ids) > self.max_seq_len:
                ids = trunc_sequence(ids, self.max_seq_len)
            else:
                pad_token_id = self.tokenizer.vocab.to_indices(pad_token)
                ids = pad_sequence(ids, self.max_seq_len, pad_token_id)

            record = {'text': ids, 'seq_len': seq_len}
        else:
            raise RuntimeError(
                "Unknown type of self.tokenizer: {}, it must be an instance of "
                "PretrainedTokenizer or JiebaTokenizer".format(
                    type(self.tokenizer)))

        if not record:
            logger.info(
                "The text %s has been dropped as it has no words in the vocab after tokenization."
                % example.text_a)
            continue

        # convert labels into record
        if labels:
            record["label"] = []
            if isinstance(self.tokenizer, PretrainedTokenizer):
                tokens_with_special_token = self.tokenizer.convert_ids_to_tokens(
                    record['input_ids'])
            elif isinstance(self.tokenizer, JiebaTokenizer):
                tokens_with_special_token = [
                    self.tokenizer.vocab.to_tokens(id_)
                    for id_ in record['text']
                ]
            else:
                raise RuntimeError(
                    "Unknown type of self.tokenizer: {}, it must be an instance of "
                    "PretrainedTokenizer or JiebaTokenizer".format(
                        type(self.tokenizer)))

            tokens_index = 0
            for token in tokens_with_special_token:
                if tokens_index < len(tokens) and token == tokens[tokens_index]:
                    record["label"].append(
                        self.label_list.index(labels[tokens_index]))
                    tokens_index += 1
                elif token in [pad_token]:
                    # label of special token
                    record["label"].append(self.ignore_label)
                else:
                    record["label"].append(
                        self.label_list.index(self.no_entity_label))
        records.append(record)
    return records
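
# Illustrative sketch: the alignment loop above assigns the original label where a
# tokenized token matches the next raw token, the ignore label to padding tokens,
# and the no-entity label to everything else (special tokens and extra sub-word
# pieces). A self-contained toy version of that idea follows; every name here is
# an assumption for illustration, not a PaddleHub API.


def _align_labels_sketch(tokens_with_special, raw_tokens, raw_labels, label_list,
                         pad_token, ignore_label, no_entity_label):
    aligned = []
    index = 0
    for token in tokens_with_special:
        if index < len(raw_tokens) and token == raw_tokens[index]:
            # Token matches the next raw token: reuse its original label.
            aligned.append(label_list.index(raw_labels[index]))
            index += 1
        elif token == pad_token:
            # Padding token: mark with the ignore label so it can be skipped downstream.
            aligned.append(ignore_label)
        else:
            # Special tokens and extra sub-word pieces get the no-entity label.
            aligned.append(label_list.index(no_entity_label))
    return aligned


# _align_labels_sketch(['[CLS]', '华', '为', '[SEP]', '[PAD]'],
#                      ['华', '为'], ['B-ORG', 'I-ORG'],
#                      ['B-ORG', 'I-ORG', 'O'], '[PAD]', -100, 'O')
# -> [2, 0, 1, 2, -100]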