def _convert_text_to_input(self, tokenizer, text: List[str], max_seq_len: int, split_char: str):
    pad_to_max_seq_len = False if self.task is None else True
    if self.task == 'token-cls':  # Extra processing for the token-cls task
        tokens = text[0].split(split_char)
        text[0], _ = reseg_token_label(tokenizer=tokenizer, tokens=tokens)
        is_split_into_words = True
    else:
        is_split_into_words = False

    if len(text) == 1:
        if Version(paddlenlp.__version__) <= Version('2.0.0rc2'):
            encoded_inputs = tokenizer.encode(
                text[0],
                text_pair=None,
                max_seq_len=max_seq_len,
                pad_to_max_seq_len=pad_to_max_seq_len)
        else:
            encoded_inputs = tokenizer(
                text=text[0],
                max_seq_len=max_seq_len,
                pad_to_max_seq_len=True,
                is_split_into_words=is_split_into_words,
                return_length=True)
    elif len(text) == 2:
        if Version(paddlenlp.__version__) <= Version('2.0.0rc2'):
            encoded_inputs = tokenizer.encode(
                text[0],
                text_pair=text[1],
                max_seq_len=max_seq_len,
                pad_to_max_seq_len=pad_to_max_seq_len)
        else:
            encoded_inputs = tokenizer(
                text=text[0],
                text_pair=text[1],
                max_seq_len=max_seq_len,
                pad_to_max_seq_len=True,
                is_split_into_words=is_split_into_words,
                return_length=True)
    else:
        raise RuntimeError(
            'The input text must have one or two sequences, but got %d. Please check your inputs.'
            % len(text))
    return encoded_inputs
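# Minimal, self-contained sketch (not part of the module above) of how the
# `Version(paddlenlp.__version__) <= Version('2.0.0rc2')` gate picks the legacy
# `tokenizer.encode(...)` path versus the callable-tokenizer path. It assumes `Version`
# comes from `packaging.version`; paddlenlp is not imported and the version strings
# below are examples, not values read from any environment.
from packaging.version import Version


def uses_legacy_encode(paddlenlp_version: str) -> bool:
    # Builds up to and including 2.0.0rc2 are handled by the old `tokenizer.encode` API.
    return Version(paddlenlp_version) <= Version('2.0.0rc2')


assert uses_legacy_encode('2.0.0rc2') is True
assert uses_legacy_encode('2.0.0') is False   # 2.0.0rc2 is a pre-release of 2.0.0
assert uses_legacy_encode('2.1.1') is False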
def _convert_examples_to_records(self, examples: List[InputExample]) -> List[dict]:
    """
    Returns a list[dict] containing all the input information that the model needs.

    Args:
        examples (list): the data examples, returned by _read_file.

    Returns:
        a list of records, one for each example.
    """
    records = []
    for example in examples:
        tokens, labels = reseg_token_label(
            tokenizer=self.tokenizer,
            tokens=example.text_a.split(self.split_char),
            labels=example.label.split(self.split_char))
        record = self.tokenizer.encode(text=tokens, max_seq_len=self.max_seq_len)
        # CustomTokenizer tokenizes the text first and then looks the words up in the vocab.
        # When none of the words are found in the vocab, the text is dropped.
        if not record:
            logger.info(
                "The text %s has been dropped as it has no words in the vocab after tokenization."
                % example.text_a)
            continue
        if labels:
            record["label"] = []
            tokens_with_specical_token = self.tokenizer.convert_ids_to_tokens(record['input_ids'])
            tokens_index = 0
            for token in tokens_with_specical_token:
                if tokens_index < len(tokens) and token == tokens[tokens_index]:
                    record["label"].append(self.label_list.index(labels[tokens_index]))
                    tokens_index += 1
                elif token in [self.tokenizer.pad_token]:
                    # label of special token
                    record["label"].append(self.ignore_label)
                else:
                    record["label"].append(self.label_list.index(self.no_entity_label))
        records.append(record)
    return records
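# Illustrative, standalone sketch (with made-up toy data) of the label-alignment loop
# above: a wordpiece that matches the next original token keeps that token's label index,
# the pad token gets an ignore label, and anything else (e.g. [CLS]/[SEP] or split
# sub-tokens) falls back to the "no entity" label. The function name, the toy tokens and
# the -100 ignore value are assumptions, not values from the real module.
def align_labels(wordpieces, tokens, labels, label_list,
                 pad_token='[PAD]', no_entity_label='O', ignore_label=-100):
    aligned = []
    tokens_index = 0
    for token in wordpieces:
        if tokens_index < len(tokens) and token == tokens[tokens_index]:
            aligned.append(label_list.index(labels[tokens_index]))
            tokens_index += 1
        elif token == pad_token:
            aligned.append(ignore_label)  # padding positions are ignored by the loss
        else:
            aligned.append(label_list.index(no_entity_label))
    return aligned


label_list = ['B-PER', 'I-PER', 'O']
wordpieces = ['[CLS]', 'Jim', 'Hen', 'son', '[SEP]', '[PAD]']
tokens = ['Jim', 'Hen', 'son']   # already re-segmented to match the wordpieces
labels = ['B-PER', 'I-PER', 'I-PER']
print(align_labels(wordpieces, tokens, labels, label_list))
# -> [2, 0, 1, 1, 2, -100]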
def _convert_text_to_input(self, tokenizer, text: List[str], max_seq_len: int, split_char: str):
    pad_to_max_seq_len = False if self.task is None else True
    if self.task == 'token-cls':  # Extra processing for the token-cls task
        tokens = text[0].split(split_char)
        text[0], _ = reseg_token_label(tokenizer=tokenizer, tokens=tokens)

    if len(text) == 1:
        encoded_inputs = tokenizer.encode(
            text[0],
            text_pair=None,
            max_seq_len=max_seq_len,
            pad_to_max_seq_len=pad_to_max_seq_len)
    elif len(text) == 2:
        encoded_inputs = tokenizer.encode(
            text[0],
            text_pair=text[1],
            max_seq_len=max_seq_len,
            pad_to_max_seq_len=pad_to_max_seq_len)
    else:
        raise RuntimeError(
            'The input text must have one or two sequences, but got %d. Please check your inputs.'
            % len(text))
    return encoded_inputs
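# Hedged sketch (not PaddleHub's actual implementation) of what a re-segmentation helper
# in the spirit of `reseg_token_label` might do: split each pre-split token into
# sub-tokens and expand its label so that a "B-" label continues as "I-" on the extra
# pieces. The toy `subword_tokenize` stand-in and all token/label values below are
# assumptions for illustration only.
def subword_tokenize(token):
    # toy stand-in: split long tokens in two to emulate word-piece segmentation
    return [token] if len(token) <= 3 else [token[:3], '##' + token[3:]]


def reseg(tokens, labels):
    ret_tokens, ret_labels = [], []
    for token, label in zip(tokens, labels):
        pieces = subword_tokenize(token)
        if not pieces:
            continue
        ret_tokens.extend(pieces)
        ret_labels.append(label)
        # continuation pieces of a "B-XXX" token are labelled "I-XXX"
        cont = 'I-' + label[2:] if label.startswith('B-') else label
        ret_labels.extend([cont] * (len(pieces) - 1))
    return ret_tokens, ret_labels


print(reseg(['Beijing', 'is', 'big'], ['B-LOC', 'O', 'O']))
# -> (['Bei', '##jing', 'is', 'big'], ['B-LOC', 'I-LOC', 'O', 'O'])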
def _convert_examples_to_records(self, examples: List[InputExample]) -> List[dict]:
    """
    Returns a list[dict] containing all the input information that the model needs.

    Args:
        examples (list): the data examples, returned by _read_file.

    Returns:
        a list of records, one for each example.
    """
    records = []
    for example in examples:
        tokens = example.text_a.split(self.split_char)
        labels = example.label.split(self.split_char)

        # convert tokens into record
        if isinstance(self.tokenizer, PretrainedTokenizer):
            pad_token = self.tokenizer.pad_token
            tokens, labels = reseg_token_label(
                tokenizer=self.tokenizer, tokens=tokens, labels=labels)
            record = self.tokenizer.encode(text=tokens, max_seq_len=self.max_seq_len)
        elif isinstance(self.tokenizer, JiebaTokenizer):
            pad_token = self.tokenizer.vocab.pad_token
            ids = [self.tokenizer.vocab.to_indices(token) for token in tokens]
            seq_len = min(len(ids), self.max_seq_len)
            if len(ids) > self.max_seq_len:
                ids = trunc_sequence(ids, self.max_seq_len)
            else:
                pad_token_id = self.tokenizer.vocab.to_indices(pad_token)
                ids = pad_sequence(ids, self.max_seq_len, pad_token_id)
            record = {'text': ids, 'seq_len': seq_len}
        else:
            raise RuntimeError(
                "Unknown type of self.tokenizer: {}, it must be an instance of PretrainedTokenizer or JiebaTokenizer"
                .format(type(self.tokenizer)))

        if not record:
            logger.info(
                "The text %s has been dropped as it has no words in the vocab after tokenization."
                % example.text_a)
            continue

        # convert labels into record
        if labels:
            record["label"] = []
            if isinstance(self.tokenizer, PretrainedTokenizer):
                tokens_with_specical_token = self.tokenizer.convert_ids_to_tokens(record['input_ids'])
            elif isinstance(self.tokenizer, JiebaTokenizer):
                tokens_with_specical_token = [
                    self.tokenizer.vocab.to_tokens(id_) for id_ in record['text']
                ]
            else:
                raise RuntimeError(
                    "Unknown type of self.tokenizer: {}, it must be an instance of PretrainedTokenizer or JiebaTokenizer"
                    .format(type(self.tokenizer)))

            tokens_index = 0
            for token in tokens_with_specical_token:
                if tokens_index < len(tokens) and token == tokens[tokens_index]:
                    record["label"].append(self.label_list.index(labels[tokens_index]))
                    tokens_index += 1
                elif token in [pad_token]:
                    # label of special token
                    record["label"].append(self.ignore_label)
                else:
                    record["label"].append(self.label_list.index(self.no_entity_label))
        records.append(record)
    return records
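# Standalone sketch of the non-pretrained (JiebaTokenizer-style) path above: look tokens
# up in a vocab, then either truncate or pad the id sequence to a fixed length and store
# it as {'text': ids, 'seq_len': seq_len}. The tiny vocab, the `truncate`/`pad` helpers
# and the token values are illustrative assumptions; the real module relies on
# JiebaTokenizer's vocab plus `trunc_sequence`/`pad_sequence`.
def truncate(ids, max_len):
    return ids[:max_len]


def pad(ids, max_len, pad_id):
    return ids + [pad_id] * (max_len - len(ids))


vocab = {'[PAD]': 0, '[UNK]': 1, '今天': 2, '天气': 3, '很好': 4}
tokens = ['今天', '天气', '很好']
max_seq_len = 5

ids = [vocab.get(token, vocab['[UNK]']) for token in tokens]
seq_len = min(len(ids), max_seq_len)
ids = truncate(ids, max_seq_len) if len(ids) > max_seq_len else pad(ids, max_seq_len, vocab['[PAD]'])
record = {'text': ids, 'seq_len': seq_len}
print(record)   # -> {'text': [2, 3, 4, 0, 0], 'seq_len': 3}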