def _load_checkpoint(self):
    '''
    Resume from the newest ``epoch_<N>`` entry found in ``self.checkpoint_dir``.

    Directory entries are considered only when their name starts with
    ``epoch_`` and the part after the last underscore is numeric. If no entry
    qualifies, a warning is logged on rank 0 and nothing else happens.
    Otherwise the best metrics are restored through ``self._load_metrics()``,
    ``self.current_epoch`` is set to the largest epoch number found, and
    ``self.load_model`` is called with that checkpoint path.
    '''
    found_epochs = []
    for entry in os.listdir(self.checkpoint_dir):
        suffix = entry.split('_')[-1]
        if entry.startswith('epoch_') and suffix.isdigit():
            found_epochs.append(int(suffix))

    if not found_epochs:
        if self.local_rank == 0:
            logger.warning('PaddleHub model checkpoint not found, start from scratch...')
        return

    # load best metrics
    self._load_metrics()
    self.current_epoch = max(found_epochs)

    # The summary is built on every rank; only rank 0 reports it.
    metric_msg = ' '.join(
        '{}={:.4f}'.format(metric, value) for metric, value in self.best_metrics.items())
    if self.local_rank == 0:
        logger.info('PaddleHub model checkpoint loaded. current_epoch={} [{}]'.format(
            self.current_epoch, metric_msg))

    self.load_model(os.path.join(self.checkpoint_dir, 'epoch_{}'.format(self.current_epoch)))
def __init__(
        self,
        task=None,
        load_checkpoint=None,
        label_map=None,
):
    """
    Build the ``bert-large-cased`` backbone, optionally with a classification head.

    Args:
        task: ``'sequence_classification'`` to build
            ``BertForSequenceClassification`` together with a cross-entropy
            criterion and an accuracy metric, or ``None`` for the bare
            ``BertModel``. Any other value raises ``RuntimeError``.
        load_checkpoint: Path of a parameter file. It is loaded only when it
            is not ``None`` and points at an existing file.
        label_map: Stored as-is on ``self.label_map``.
    """
    super(Bert, self).__init__()
    pretrained_name = 'bert-large-cased'

    # TODO(zhangxuefei): add token_classification task
    if task is None:
        self.model = BertModel.from_pretrained(pretrained_model_name_or_path=pretrained_name)
    elif task == 'sequence_classification':
        self.model = BertForSequenceClassification.from_pretrained(
            pretrained_model_name_or_path=pretrained_name)
        self.criterion = paddle.nn.loss.CrossEntropyLoss()
        self.metric = paddle.metric.Accuracy(name='acc_accumulation')
    else:
        raise RuntimeError("Unknown task %s, task should be sequence_classification" % task)

    self.task = task
    self.label_map = label_map

    # A missing or non-file checkpoint path is skipped without an error.
    if load_checkpoint is None or not os.path.isfile(load_checkpoint):
        return
    self.set_state_dict(paddle.load(load_checkpoint))
    logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint))
def __init__(
        self,
        task: str,
        num_class: int = None,
        label_map: Dict = None,
        load_checkpoint: str = None,
        **kwargs,
):
    """
    Build the CNN6 audio backbone, optionally with a sound-classification head.

    Args:
        task: ``'sound-cls'`` builds an embedding-extracting CNN6 followed by
            dropout and a linear classifier, plus a cross-entropy criterion
            and an accuracy metric. Any other value builds CNN6 with
            ``extract_embedding=False`` and no head.
        num_class: Number of output classes; used only when ``label_map`` is
            empty or ``None``.
        label_map: When truthy it is stored on ``self.label_map`` and its
            length becomes the number of classes.
        load_checkpoint: Path of a parameter file. It is loaded only when it
            is not ``None`` and points at an existing file.
        **kwargs: Accepted and ignored by this constructor.
    """
    super(PANN, self).__init__()
    if label_map:
        self.label_map = label_map
        self.num_class = len(label_map)
    else:
        self.num_class = num_class

    cnn6_checkpoint = os.path.join(MODULE_HOME, 'panns_cnn6', 'cnn6.pdparams')
    if task == 'sound-cls':
        self.cnn6 = CNN6(extract_embedding=True, checkpoint=cnn6_checkpoint)
        self.dropout = nn.Dropout(0.1)
        # Size the head from the resolved class count. The raw `num_class`
        # argument is None when the count comes from `label_map`.
        self.fc = nn.Linear(self.cnn6.emb_size, self.num_class)
        self.criterion = paddle.nn.loss.CrossEntropyLoss()
        self.metric = paddle.metric.Accuracy()
    else:
        self.cnn6 = CNN6(extract_embedding=False, checkpoint=cnn6_checkpoint)

    self.task = task
    # A missing or non-file checkpoint path is skipped without an error.
    if load_checkpoint is not None and os.path.isfile(load_checkpoint):
        state_dict = paddle.load(load_checkpoint)
        self.set_state_dict(state_dict)
        logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint))
def _convert_examples_to_records(
        self, examples: List[InputExample]) -> List[dict]:
    """
    Encode every example with ``self.tokenizer`` and attach its label id.

    Args:
        examples(obj:`List[InputExample]`): All data examples returned by _read_file.

    Returns:
        records(:obj:`List[dict]`): One encoded record per example, in input
            order. Examples whose encoding is empty are skipped.
    """
    converted = []
    for item in examples:
        encoded = self.tokenizer.encode(
            text=item.text_a, text_pair=item.text_b, max_seq_len=self.max_seq_len)
        if encoded:
            # Only examples with a truthy label get a 'label' entry.
            if item.label:
                encoded['label'] = self.label_map[item.label]
            converted.append(encoded)
        else:
            # CustomTokenizer tokenizes the text first and then looks the words
            # up in the vocab; a text with no known word encodes to nothing.
            logger.info(
                "The text %s has been dropped as it has no words in the vocab after tokenization." %
                item.text_a)
    return converted
def __init__(
        self,
        task: str = None,
        load_checkpoint: str = None,
        label_map: Dict = None,
        num_classes: int = 2,
        **kwargs,
):
    """
    Build the ``electra-small`` backbone with the head matching ``task``.

    Args:
        task: One of ``'seq-cls'``, ``'token-cls'``, ``'text-matching'`` or
            ``None`` (bare ``ElectraModel``). The legacy name
            ``'sequence_classification'`` is mapped to ``'seq-cls'`` with a
            deprecation warning. Any other value raises ``RuntimeError``.
        load_checkpoint: Path of a parameter file. It is loaded only when it
            is not ``None`` and points at an existing file.
        label_map: When truthy it is stored on ``self.label_map`` and its
            length overrides ``num_classes``.
        num_classes: Number of classes used when ``label_map`` is empty.
        **kwargs: Forwarded to the ``from_pretrained`` call of the chosen model.
    """
    super(Electra, self).__init__()
    if label_map:
        self.label_map = label_map
        self.num_classes = len(label_map)
    else:
        self.num_classes = num_classes

    pretrained_name = 'electra-small'

    if task == 'sequence_classification':
        task = 'seq-cls'
        logger.warning(
            "current task name 'sequence_classification' was renamed to 'seq-cls', "
            "'sequence_classification' has been deprecated and will be removed in the future.", )

    if task is None:
        self.model = ElectraModel.from_pretrained(
            pretrained_model_name_or_path=pretrained_name, **kwargs)
    elif task == 'seq-cls':
        self.model = ElectraForSequenceClassification.from_pretrained(
            pretrained_model_name_or_path=pretrained_name, num_classes=self.num_classes, **kwargs)
        self.criterion = paddle.nn.loss.CrossEntropyLoss()
        self.metric = paddle.metric.Accuracy()
    elif task == 'token-cls':
        self.model = ElectraForTokenClassification.from_pretrained(
            pretrained_model_name_or_path=pretrained_name, num_classes=self.num_classes, **kwargs)
        self.criterion = paddle.nn.loss.CrossEntropyLoss()
        # Label names are listed in the order of the sorted label_map keys.
        ordered_labels = [self.label_map[key] for key in sorted(self.label_map.keys())]
        self.metric = ChunkEvaluator(label_list=ordered_labels)
    elif task == 'text-matching':
        self.model = ElectraModel.from_pretrained(
            pretrained_model_name_or_path=pretrained_name, **kwargs)
        self.dropout = paddle.nn.Dropout(0.1)
        self.classifier = paddle.nn.Linear(self.model.config['hidden_size'] * 3, 2)
        self.criterion = paddle.nn.loss.CrossEntropyLoss()
        self.metric = paddle.metric.Accuracy()
    else:
        raise RuntimeError("Unknown task {}, task should be one in {}".format(
            task, self._tasks_supported))

    self.task = task

    # A missing or non-file checkpoint path is skipped without an error.
    if load_checkpoint is None or not os.path.isfile(load_checkpoint):
        return
    self.set_state_dict(paddle.load(load_checkpoint))
    logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint))
def _convert_examples_to_records(
        self, examples: List[InputExample]) -> List[dict]:
    """
    Tokenize both texts of every example separately and attach the label id.

    Args:
        examples(obj:`List[InputExample]`): All data examples returned by _read_file.

    Returns:
        records(:obj:`List[dict]`): One dict per example with the keys
            ``'text_a'`` and ``'text_b'`` (the tokenizer outputs) and, when
            the example has a truthy label, ``'label'``.

    Raises:
        RuntimeError: If ``self.tokenizer`` is not a ``PretrainedTokenizer``.
            The check runs per example, so an empty input never raises.
    """
    converted = []
    for item in examples:
        if not isinstance(self.tokenizer, PretrainedTokenizer):
            raise RuntimeError(
                "Unknown type of self.tokenizer: {}, it must be an instance of PretrainedTokenizer".
                format(type(self.tokenizer)))

        encoded_a = self.tokenizer(
            text=item.text_a,
            max_seq_len=self.max_seq_len,
            pad_to_max_seq_len=True,
            return_length=True)
        encoded_b = self.tokenizer(
            text=item.text_b,
            max_seq_len=self.max_seq_len,
            pad_to_max_seq_len=True,
            return_length=True)
        record = {'text_a': encoded_a, 'text_b': encoded_b}

        # The two-key dict above is always truthy; the check is kept so the
        # control flow matches the sibling converters.
        if not record:
            logger.info(
                "The text %s has been dropped as it has no words in the vocab after tokenization." %
                item.text_a)
            continue

        if item.label:
            record['label'] = self.label_map[item.label]
        converted.append(record)
    return converted
def _convert_examples_to_records(self, examples, phase):
    '''
    Encode every example with ``self.tokenizer`` while showing a progress bar.

    Args:
        examples (list): the data examples, returned by _read_file.
        phase (str): the processing phase, can be 'train' 'dev' 'test' or 'predict'.
            It is not used by this implementation.

    Returns:
        a list with one record (dict) per kept example. Examples whose
        encoding is empty are dropped.
    '''
    records = []
    with tqdm(total=len(examples)) as process_bar:
        for example in examples:
            record = self.tokenizer.encode(
                text=example.text_a, text_pair=example.text_b, max_seq_len=self.max_seq_len)
            if record:
                if example.label:
                    # With a label list the label becomes its index in that
                    # list; otherwise it is parsed as a float.
                    record['label'] = self.label_list.index(
                        example.label) if self.label_list else float(example.label)
                records.append(record)
            else:
                # CustomTokenizer will tokenize the text firstly and then lookup words in the vocab
                # When all words are not found in the vocab, the text will be dropped.
                logger.info(
                    'The text %s has been dropped as it has no words in the vocab after tokenization.' %
                    example.text_a)
            # Advance for dropped examples as well, so the bar reaches its total.
            process_bar.update(1)
    return records
def delete_hook(self, hook_type: str, name: str):
    '''
    Remove a named handler from a specific event and log the removal.

    Args:
        hook_type (str): the event name
        name (str): the handler function name
    '''
    self._hooks.delete(hook_type, name)
    message = 'Delete hook {}:{} successfully'.format(hook_type, name)
    logger.info(message)
def test_records(self):
    """
    Return the converted test-set records, building and caching them on first use.

    An empty list is returned (and nothing is cached) when there is no
    tokenizer or there are no test examples.
    """
    if self._test_records:
        return self._test_records

    examples = self.test_examples
    if not (self.tokenizer and examples):
        return []

    logger.info('Processing the test set...')
    self._test_records = self._convert_examples_to_records(examples, phase='test')
    return self._test_records
def predict_records(self):
    """
    Return the converted predict-set records, building and caching them on first use.

    An empty list is returned (and nothing is cached) when there is no
    tokenizer or there are no predict examples.
    """
    if self._predict_records:
        return self._predict_records

    examples = self.predict_examples
    if not (self.tokenizer and examples):
        return []

    logger.info('Processing the predict set...')
    self._predict_records = self._convert_examples_to_records(examples, phase='predict')
    return self._predict_records
def dev_records(self):
    """
    Return the converted dev-set records, building and caching them on first use.

    An empty list is returned (and nothing is cached) when there is no
    tokenizer or there are no dev examples.
    """
    if self._dev_records:
        return self._dev_records

    examples = self.dev_examples
    if not (self.tokenizer and examples):
        return []

    logger.info('Processing the dev set...')
    self._dev_records = self._convert_examples_to_records(examples, phase='dev')
    return self._dev_records
def set_speaker_embedding(self, speaker_audio: str):
    """
    Compute the speaker embedding from an audio file and cache it on the instance.

    The file is preprocessed and turned into mel partials by
    ``self.speaker_processor``, converted to a tensor, and embedded by
    ``self.speaker_encoder``; the result is stored in
    ``self._speaker_embedding``.

    Args:
        speaker_audio (str): path of the reference audio file; it must exist
            (checked with an ``assert``).
    """
    assert os.path.exists(speaker_audio), f'Speaker audio file: {speaker_audio} does not exists.'

    processor = self.speaker_processor
    waveform = processor.preprocess_wav(speaker_audio)
    mel_sequences = processor.extract_mel_partials(waveform)

    mel_tensor = paddle.to_tensor(mel_sequences)
    self._speaker_embedding = self.speaker_encoder.embed_utterance(mel_tensor)

    logger.info(f'Speaker embedding has been set from file: {speaker_audio}')
def modify_hook(self, hook_type: str, name: str, func: Callable):
    '''
    Replace the handler registered under ``name`` for a specific event and log it.

    Args:
        hook_type (str): the event name
        name (str): the handler function name
        func (func): the new handler function
    '''
    self._hooks.modify(hook_type, name, func)
    message = 'Modify hook {}:{} successfully'.format(hook_type, name)
    logger.info(message)
def read_images(self, paths=None):
    """
    Load the images at ``paths`` with OpenCV and reverse their channel order.

    Args:
        paths (list[str] | None): image file paths. ``None`` (the default) and
            an empty sequence both yield an empty result.

    Returns:
        list: one array per readable image, in input order. Files that
        ``cv2.imread`` cannot decode are logged and skipped.

    Raises:
        AssertionError: if a path does not point at an existing file.
    """
    # `None` replaces the former mutable `[]` default.
    if paths is None:
        return []

    images = []
    for img_path in paths:
        assert os.path.isfile(img_path), "The {} isn't a valid file.".format(img_path)
        img = cv2.imread(img_path)
        if img is None:
            logger.info("error in loading image:{}".format(img_path))
            continue
        # Reverse the last (channel) axis; OpenCV decodes to BGR, so this
        # yields RGB order.
        images.append(img[:, :, ::-1])
    return images
def init_if_load_best_model(self):
    """
    Load ``<checkpoint_dir>/best_model`` once, falling back to normal initialisation.

    When the best model is already loaded only a log line is written. When the
    ``best_model`` path exists its parameters are loaded,
    ``is_checkpoint_loaded`` is cleared and ``is_best_model_loaded`` is set.
    Otherwise ``self.init_if_necessary()`` is called instead.
    """
    if self.is_best_model_loaded:
        logger.info("The best model has been loaded")
        return

    best_model_path = os.path.join(self.config.checkpoint_dir, "best_model")
    # Logged before the existence check, so it also appears when nothing is loaded.
    logger.info("Load the best model from %s" % best_model_path)

    if not os.path.exists(best_model_path):
        self.init_if_necessary()
        return

    self.load_parameters(best_model_path)
    self.is_checkpoint_loaded = False
    self.is_best_model_loaded = True
def add_hook(self, hook_type: str, name: str = None, func: Callable = None):
    '''
    Register a handler function for a specific event and log the registration.

    Args:
        hook_type (str): the event name
        name (str): the handler function name, default None. When None, the
            name ``hook_<id(func)>`` is generated.
        func (func): the handler function, default None
    '''
    # Identity check: `== None` would defer to a custom __eq__ on `name`.
    if name is None:
        name = 'hook_{}'.format(id(func))

    self._hooks.add(hook_type, name=name, func=func)
    logger.info('Add hook {}:{} successfully'.format(hook_type, name))
def _download_and_uncompress_dataset(self, destination: str, url: str):
    """
    Fetch the dataset package unless ``destination`` already exists.

    The package is downloaded into ``DATA_HOME`` and, when it is recognised
    as an archive, unpacked there as well.

    Args:
        destination (:obj:`str`): The dataset cached directory.
        url (:obj: str): The link to be downloaded a dataset.
    """
    if os.path.exists(destination):
        logger.info("Dataset {} already cached.".format(destination))
        return

    package_path = download(url=url, path=DATA_HOME)
    if is_xarfile(package_path):
        unarchive(package_path, DATA_HOME)
def __init__(
        self,
        task=None,
        load_checkpoint=None,
        label_map=None,
        num_classes=2,
        **kwargs,
):
    """
    Build the ``ernie-tiny`` backbone with the head matching ``task``.

    Args:
        task: ``'seq-cls'``, ``'token-cls'`` or ``None`` (bare ``ErnieModel``).
            The legacy name ``'sequence_classification'`` is mapped to
            ``'seq-cls'`` with a deprecation warning. Any other value raises
            ``RuntimeError``.
        load_checkpoint: Path of a parameter file. It is loaded only when it
            is not ``None`` and points at an existing file.
        label_map: Stored on ``self.label_map``; when truthy its length
            overrides ``num_classes``.
        num_classes: Number of classes used when ``label_map`` is empty.
        **kwargs: Forwarded to the ``from_pretrained`` call of the chosen model.
    """
    super(ErnieTiny, self).__init__()
    if label_map:
        self.num_classes = len(label_map)
    else:
        self.num_classes = num_classes

    if task == 'sequence_classification':
        task = 'seq-cls'
        # Wording fixed ("removed in the future"); the "in" was missing.
        logger.warning(
            "current task name 'sequence_classification' was renamed to 'seq-cls', "
            "'sequence_classification' has been deprecated and will be removed in the future.", )

    if task == 'seq-cls':
        self.model = ErnieForSequenceClassification.from_pretrained(
            pretrained_model_name_or_path='ernie-tiny', num_classes=self.num_classes, **kwargs)
        self.criterion = paddle.nn.loss.CrossEntropyLoss()
        self.metric = paddle.metric.Accuracy()
    elif task == 'token-cls':
        self.model = ErnieForTokenClassification.from_pretrained(
            pretrained_model_name_or_path='ernie-tiny', num_classes=self.num_classes, **kwargs)
        self.criterion = paddle.nn.loss.CrossEntropyLoss()
        self.metric = paddle.metric.Accuracy()
    elif task is None:
        self.model = ErnieModel.from_pretrained(pretrained_model_name_or_path='ernie-tiny', **kwargs)
    else:
        raise RuntimeError("Unknown task {}, task should be one in {}".format(
            task, self._tasks_supported))

    self.task = task
    self.label_map = label_map

    # A missing or non-file checkpoint path is skipped without an error.
    if load_checkpoint is not None and os.path.isfile(load_checkpoint):
        state_dict = paddle.load(load_checkpoint)
        self.set_state_dict(state_dict)
        logger.info('Loaded parameters from %s' % os.path.abspath(load_checkpoint))
def _convert_examples_to_records(
        self, examples: List[InputExample]) -> List[dict]:
    """
    Encode every example with whichever supported tokenizer is configured.

    A ``PretrainedTokenizer`` is called through ``encode`` for paddlenlp
    versions up to ``2.0.0rc2`` and through ``__call__`` (with padding and
    length output) otherwise. A ``JiebaTokenizer`` encodes ``text_a`` only;
    the ids are truncated or padded to ``self.max_seq_len`` and stored with
    the unpadded length capped at ``self.max_seq_len``.

    Args:
        examples(obj:`List[InputExample]`): All data examples returned by _read_file.

    Returns:
        records(:obj:`List[dict]`): All records which the model needs.

    Raises:
        RuntimeError: If the tokenizer is neither a ``PretrainedTokenizer``
            nor a ``JiebaTokenizer``.
    """
    converted = []
    for item in examples:
        tokenizer = self.tokenizer
        if isinstance(tokenizer, PretrainedTokenizer):
            legacy_api = Version(paddlenlp.__version__) <= Version('2.0.0rc2')
            if legacy_api:
                record = tokenizer.encode(
                    text=item.text_a, text_pair=item.text_b, max_seq_len=self.max_seq_len)
            else:
                record = tokenizer(
                    text=item.text_a,
                    text_pair=item.text_b,
                    max_seq_len=self.max_seq_len,
                    pad_to_max_seq_len=True,
                    return_length=True)
        elif isinstance(tokenizer, JiebaTokenizer):
            pad_token = tokenizer.vocab.pad_token
            ids = tokenizer.encode(sentence=item.text_a)
            seq_len = min(len(ids), self.max_seq_len)
            if len(ids) > self.max_seq_len:
                ids = trunc_sequence(ids, self.max_seq_len)
            else:
                ids = pad_sequence(ids, self.max_seq_len, tokenizer.vocab.to_indices(pad_token))
            record = {'text': ids, 'seq_len': seq_len}
        else:
            raise RuntimeError(
                "Unknown type of self.tokenizer: {}, it must be an instance of PretrainedTokenizer or JiebaTokenizer"
                .format(type(self.tokenizer)))

        if not record:
            logger.info(
                "The text %s has been dropped as it has no words in the vocab after tokenization." %
                item.text_a)
            continue

        if item.label:
            record['label'] = self.label_map[item.label]
        converted.append(record)
    return converted
def __init__(self, dataset: Generic, random_seed: int = None):
    """
    Store the dataset, seed NumPy's global RNG and derive the label map.

    Args:
        dataset: object expected to provide ``get_labels()``. It may be
            ``None``, or its ``get_labels()`` may return ``None``; in both
            cases the label map stays empty.
        random_seed (int): value passed to ``np.random.seed``.
    """
    self.dataset = dataset
    self.num_examples = {'train': -1, 'dev': -1, 'test': -1}
    np.random.seed(random_seed)

    # generate label map
    self.label_map = {}
    try:
        # Build the full mapping before assigning it, so a failure part-way
        # through leaves the label map empty, matching the log line below.
        self.label_map = {
            label: index
            for index, label in enumerate(self.dataset.get_labels())
        }
        logger.info('Dataset label map = {}'.format(self.label_map))
    except Exception:
        # some dataset like squad, its label_list=None
        # `Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        logger.info('Dataset is None or it has not any labels, label map = {}'.format(
            self.label_map))
def _convert_examples_to_records(
        self, examples: List[InputExample]) -> List[dict]:
    """
    Encode token sequences and align one label id to every encoded token.

    Text and label of each example are split on ``self.split_char``,
    re-segmented with ``reseg_token_label`` and encoded. The encoded ids are
    then mapped back to tokens and walked in order: a token equal to the next
    unmatched source token takes that token's label, the pad token takes
    ``self.ignore_label``, and anything else takes the index of
    ``self.no_entity_label``.

    Args:
        examples (list): the data examples, returned by _read_file.

    Returns:
        a list with all the examples record.
    """
    converted = []
    for item in examples:
        tokens, labels = reseg_token_label(
            tokenizer=self.tokenizer,
            tokens=item.text_a.split(self.split_char),
            labels=item.label.split(self.split_char))
        record = self.tokenizer.encode(text=tokens, max_seq_len=self.max_seq_len)

        # CustomTokenizer will tokenize the text firstly and then lookup words in the vocab
        # When all words are not found in the vocab, the text will be dropped.
        if not record:
            logger.info(
                "The text %s has been dropped as it has no words in the vocab after tokenization." %
                item.text_a)
            continue

        if labels:
            label_ids = record["label"] = []
            encoded_tokens = self.tokenizer.convert_ids_to_tokens(record['input_ids'])
            cursor = 0  # index of the next source token still waiting for a match
            for token in encoded_tokens:
                if cursor < len(tokens) and token == tokens[cursor]:
                    label_ids.append(self.label_list.index(labels[cursor]))
                    cursor += 1
                elif token in (self.tokenizer.pad_token, ):
                    label_ids.append(self.ignore_label)  # label of special token
                else:
                    label_ids.append(self.label_list.index(self.no_entity_label))
        converted.append(record)
    return converted
def load_checkpoint(
        checkpoint_dir: str, exe: paddle.static.Executor,
        main_program: paddle.static.Program) -> Tuple[bool, int, int, float]:
    """
    Restore program state from the checkpoint described in ``checkpoint_dir``.

    The checkpoint meta file (``CKPT_FILE_NAME`` inside ``checkpoint_dir``) is
    parsed when it exists. When it names a latest model directory, that model
    is loaded into ``main_program`` through ``exe``.

    Args:
        checkpoint_dir (str): directory holding the checkpoint meta file.
        exe: executor passed to ``paddle.static.load``.
        main_program: program passed to ``paddle.static.load``.

    Returns:
        ``(loaded, current_epoch, global_step, best_score)``. When nothing is
        loaded the tuple is ``(False, 1, 0, -999)``.
    """
    ckpt_meta_path = os.path.join(checkpoint_dir, CKPT_FILE_NAME)
    ckpt = checkpoint_pb2.CheckPoint()
    logger.info('Try loading checkpoint from {}'.format(ckpt_meta_path))
    if os.path.exists(ckpt_meta_path):
        with open(ckpt_meta_path, 'rb') as f:
            ckpt.ParseFromString(f.read())

    current_epoch = 1
    global_step = 0
    best_score = -999

    if ckpt.latest_model_dir:
        paddle.static.load(executor=exe, model_path=ckpt.latest_model_dir, program=main_program)

        # Compatible with older versions without best_score in checkpoint_pb2:
        # only a missing attribute falls back to the default. This replaces a
        # bare `except:` that hid every other error too.
        best_score = getattr(ckpt, 'best_score', -999)

        logger.info('PaddleHub model checkpoint loaded. current_epoch={}, '
                    'global_step={}, best_score={:.5f}'.format(ckpt.current_epoch, ckpt.global_step,
                                                               best_score))
        return True, ckpt.current_epoch, ckpt.global_step, best_score

    logger.info('PaddleHub model checkpoint not found, start from scratch...')
    return False, current_epoch, global_step, best_score
def predict(self, images=None, paths=None, top_k=1):
    """
    Run the predictor on in-memory images or on image files (exactly one of the two).

    Args:
        images (list(numpy.ndarray)): images data, shape of each is [H, W, C].
            Give this or ``paths``, not both.
        paths (list[str]): The paths of images. Give this or ``images``, not both.
        top_k (int): stored on ``self.args.top_k`` before every prediction.

    Returns:
        res (list): one entry per input image, in order. Each entry is
        ``[classes, scores, elapsed_seconds]`` (the first two as lists), or
        an empty list when the image is ``None``.

    Raises:
        TypeError: when neither or both of ``images`` / ``paths`` are given,
            or the given one is not a list.
        AssertionError: when there is no image left to predict.
    """
    # `None` replaces the former mutable `[]` defaults.
    images = [] if images is None else images
    paths = [] if paths is None else paths

    if images != [] and isinstance(images, list) and paths == []:
        predicted_data = images
    elif images == [] and isinstance(paths, list) and paths != []:
        predicted_data = self.read_images(paths)
    else:
        raise TypeError("The input data is inconsistent with expectations.")

    assert predicted_data != [], "There is not any image to be predicted. Please check the input data."

    all_results = []
    for img in predicted_data:
        if img is None:
            logger.info("error in loading image")
            all_results.append([])
            continue

        self.args.image_file = img
        self.args.top_k = top_k
        starttime = time.time()
        classes, scores = paddle_predict.predict(self.args, self.predictor)
        elapse = time.time() - starttime
        logger.info("Predict time: {}".format(elapse))
        all_results.append([classes.tolist(), scores.tolist(), elapse])
    return all_results
def __init__(self, extract_embedding: bool = True, checkpoint: str = None):
    """
    Assemble the CNN14 layers and optionally load pretrained parameters.

    Args:
        extract_embedding (bool): stored on ``self.extract_embedding``.
        checkpoint (str): path of a parameter file. When it is ``None`` or
            not an existing file an error is logged and the freshly
            initialised weights are kept.
    """
    super(CNN14, self).__init__()
    self.bn0 = nn.BatchNorm2D(64)

    # Six conv blocks, registered in order as conv_block1 .. conv_block6;
    # each block's output width is the next block's input width.
    widths = (1, 64, 128, 256, 512, 1024, 2048)
    for position in range(1, len(widths)):
        setattr(self, 'conv_block{}'.format(position),
                ConvBlock(in_channels=widths[position - 1], out_channels=widths[position]))

    self.fc1 = nn.Linear(2048, self.emb_size)
    self.fc_audioset = nn.Linear(self.emb_size, 527)

    if checkpoint is None or not os.path.isfile(checkpoint):
        logger.error('No valid checkpoints for CNN14. Start training from scratch.')
    else:
        self.set_state_dict(paddle.load(checkpoint))
        logger.info(f'Loaded CNN14 pretrained parameters from: {checkpoint}')

    self.extract_embedding = extract_embedding
def _default_predict_end_event(self, run_states: List[RunState]):
    """
    Default handler for the end of prediction: log that it finished.

    Args:
        run_states: accepted but not used by this default handler.
    """
    message = 'PaddleHub predict finished.'
    logger.info(message)
def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
    """
    Instantiate an instance of `PretrainedModel` from a predefined
    model specified by name or path.

    The resource files are resolved (used in place, taken from the cache
    under ``DATA_HOME``, or downloaded), the model is built from the saved or
    predefined init configuration, and the weight file is loaded into it.

    Args:
        pretrained_model_name_or_path (str): A name of or a file path to a
            pretrained model.
        *args (tuple): position arguments for `__init__`. If provide, use
            this as position argument values for model initialization.
        **kwargs (dict): keyword arguments for `__init__`. If provide, use
            this to update pre-defined keyword argument values for model
            initialization.

    Returns:
        PretrainedModel: An instance of PretrainedModel when paddle is in
        dynamic mode; otherwise the tuple ``(model, state_to_load)``.

    Raises:
        ValueError: If the argument is neither a known model identifier nor
            an existing directory.
    """
    pretrained_models = list(cls.pretrained_init_configuration.keys())
    resource_files = {}
    init_configuration = {}
    if pretrained_model_name_or_path in pretrained_models:
        for file_id, map_list in cls.pretrained_resource_files_map.items():
            resource_files[file_id] = map_list[pretrained_model_name_or_path]
        init_configuration = copy.deepcopy(
            cls.pretrained_init_configuration[pretrained_model_name_or_path])
    else:
        if os.path.isdir(pretrained_model_name_or_path):
            for file_id, file_name in cls.resource_files_names.items():
                full_file_name = os.path.join(pretrained_model_name_or_path, file_name)
                resource_files[file_id] = full_file_name
            resource_files["model_config_file"] = os.path.join(pretrained_model_name_or_path,
                                                               cls.model_config_file)
        else:
            raise ValueError("Calling {}.from_pretrained() with a model identifier or the "
                             "path to a directory instead. The supported model "
                             "identifiers are as follows: {}".format(
                                 cls.__name__, cls.pretrained_init_configuration.keys()))

    # FIXME(chenzeyu01): We should use another data path for storing model
    default_root = os.path.join(DATA_HOME, pretrained_model_name_or_path)
    resolved_resource_files = {}
    for file_id, file_path in resource_files.items():
        # Test for None BEFORE deriving the cache path: calling
        # file_path.split() first raised AttributeError on a None resource.
        if file_path is None or os.path.isfile(file_path):
            resolved_resource_files[file_id] = file_path
            continue

        path = os.path.join(default_root, file_path.split('/')[-1])
        if os.path.exists(path):
            logger.info("Already cached %s" % path)
            resolved_resource_files[file_id] = path
        else:
            logger.info("Downloading %s and saved to %s" % (file_path, default_root))
            resolved_resource_files[file_id] = get_path_from_url(file_path, default_root)

    # Prepare model initialization kwargs
    # Did we saved some inputs and kwargs to reload ?
    model_config_file = resolved_resource_files.pop("model_config_file", None)
    if model_config_file is not None:
        with io.open(model_config_file, encoding="utf-8") as f:
            init_kwargs = json.load(f)
    else:
        init_kwargs = init_configuration
    # position args are stored in kwargs, maybe better not include
    init_args = init_kwargs.pop("init_args", ())
    # class name corresponds to this configuration
    init_class = init_kwargs.pop("init_class", cls.base_model_class.__name__)

    # Check if the loaded config matches the current model class's __init__
    # arguments. If not match, the loaded config is for the base model class.
    if init_class == cls.base_model_class.__name__:
        base_args = init_args
        base_kwargs = init_kwargs
        derived_args = ()
        derived_kwargs = {}
        base_arg_index = None
    else:  # extract config for base model
        derived_args = list(init_args)
        derived_kwargs = init_kwargs
        # NOTE(review): `base_arg_index` and `arg` are only bound when a
        # nested base-model config is found in the loops below; a config
        # without one fails further down. Left unchanged.
        for i, arg in enumerate(init_args):
            if isinstance(arg, dict) and "init_class" in arg:
                assert arg.pop("init_class") == cls.base_model_class.__name__, (
                    "pretrained base model should be {}").format(cls.base_model_class.__name__)
                base_arg_index = i
                break
        for arg_name, arg in init_kwargs.items():
            if isinstance(arg, dict) and "init_class" in arg:
                assert arg.pop("init_class") == cls.base_model_class.__name__, (
                    "pretrained base model should be {}").format(cls.base_model_class.__name__)
                base_arg_index = arg_name
                break
        base_args = arg.pop("init_args", ())
        base_kwargs = arg

    if cls == cls.base_model_class:
        # Update with newly provided args and kwargs for base model
        base_args = base_args if not args else args
        base_kwargs.update(kwargs)
        model = cls(*base_args, **base_kwargs)
    else:
        # Update with newly provided args and kwargs for derived model
        base_model = cls.base_model_class(*base_args, **base_kwargs)
        if base_arg_index is not None:
            # NOTE(review): when the base config was found among the keyword
            # arguments, base_arg_index is a name, not a list position —
            # confirm this assignment is intended for that case.
            derived_args[base_arg_index] = base_model
        else:
            derived_args = (base_model, )  # assume at the first position
        derived_args = derived_args if not args else args
        derived_kwargs.update(kwargs)
        model = cls(*derived_args, **derived_kwargs)

    # Maybe need more ways to load resources.
    # The first remaining resolved resource is taken to be the weight file.
    weight_path = list(resolved_resource_files.values())[0]
    assert weight_path.endswith(".pdparams"), "suffix of weight must be .pdparams"

    state_dict = paddle.load(weight_path)

    # Make sure we are able to load base models as well as derived models
    # (with heads)
    start_prefix = ""
    model_to_load = model
    state_to_load = state_dict
    unexpected_keys = []
    missing_keys = []
    if not hasattr(model, cls.base_model_prefix) and any(
            s.startswith(cls.base_model_prefix) for s in state_dict.keys()):
        # base model: strip the prefix from matching keys, report the rest
        state_to_load = {}
        start_prefix = cls.base_model_prefix + "."
        for k, v in state_dict.items():
            if k.startswith(cls.base_model_prefix):
                state_to_load[k[len(start_prefix):]] = v
            else:
                unexpected_keys.append(k)
    if hasattr(model, cls.base_model_prefix) and not any(
            s.startswith(cls.base_model_prefix) for s in state_dict.keys()):
        # derived model (base model with heads): load into the base sub-model
        model_to_load = getattr(model, cls.base_model_prefix)
        for k in model.state_dict().keys():
            if not k.startswith(cls.base_model_prefix):
                missing_keys.append(k)
    if len(missing_keys) > 0:
        logger.info("Weights of {} not initialized from pretrained model: {}".format(
            model.__class__.__name__, missing_keys))
    if len(unexpected_keys) > 0:
        logger.info("Weights from pretrained model not used in {}: {}".format(
            model.__class__.__name__, unexpected_keys))
    model_to_load.set_state_dict(state_to_load)
    if paddle.in_dynamic_mode():
        return model
    return model, state_to_load
def _save_checkpoint(self):
    '''
    Save the model for the current epoch under ``<checkpoint_dir>/epoch_<N>``.

    The target path is logged before ``self.save_model`` is called with it.
    '''
    epoch_dir = 'epoch_{}'.format(self.current_epoch)
    model_path = os.path.join(self.checkpoint_dir, epoch_dir)
    logger.info('Saving model checkpoint to {}'.format(model_path))
    self.save_model(model_path)
def _default_eval_start_event(self):
    """Default handler for the start of evaluation: log the phase being evaluated."""
    message = 'Evaluation on {} dataset start'.format(self.phase)
    logger.info(message)
def _default_predict_start_event(self):
    """Default handler for the start of prediction: log that it started."""
    message = 'PaddleHub predict start'
    logger.info(message)
def _convert_examples_to_records(
        self, examples: List[InputExample]) -> List[dict]:
    """
    Encode token sequences with the configured tokenizer and align label ids.

    Text and label of each example are split on ``self.split_char``. With a
    ``PretrainedTokenizer`` the tokens are re-segmented and encoded; with a
    ``JiebaTokenizer`` they are looked up in the vocab and truncated or padded
    to ``self.max_seq_len``. The encoded ids are then mapped back to tokens
    and walked in order: a token equal to the next unmatched source token
    takes that token's label, the pad token takes ``self.ignore_label``, and
    anything else takes the index of ``self.no_entity_label``.

    Args:
        examples (list): the data examples, returned by _read_file.

    Returns:
        a list with all the examples record.

    Raises:
        RuntimeError: If the tokenizer is neither a ``PretrainedTokenizer``
            nor a ``JiebaTokenizer``.
    """
    unknown_tokenizer = (
        "Unknown type of self.tokenizer: {}, it must be an instance of PretrainedTokenizer or JiebaTokenizer"
    )

    converted = []
    for item in examples:
        tokens = item.text_a.split(self.split_char)
        labels = item.label.split(self.split_char)

        # convert tokens into record
        if isinstance(self.tokenizer, PretrainedTokenizer):
            pad_token = self.tokenizer.pad_token
            tokens, labels = reseg_token_label(
                tokenizer=self.tokenizer, tokens=tokens, labels=labels)
            record = self.tokenizer.encode(text=tokens, max_seq_len=self.max_seq_len)
        elif isinstance(self.tokenizer, JiebaTokenizer):
            vocab = self.tokenizer.vocab
            pad_token = vocab.pad_token
            ids = [vocab.to_indices(token) for token in tokens]
            seq_len = min(len(ids), self.max_seq_len)
            if len(ids) > self.max_seq_len:
                ids = trunc_sequence(ids, self.max_seq_len)
            else:
                ids = pad_sequence(ids, self.max_seq_len, vocab.to_indices(pad_token))
            record = {'text': ids, 'seq_len': seq_len}
        else:
            raise RuntimeError(unknown_tokenizer.format(type(self.tokenizer)))

        if not record:
            logger.info(
                "The text %s has been dropped as it has no words in the vocab after tokenization." %
                item.text_a)
            continue

        # convert labels into record
        if labels:
            label_ids = record["label"] = []
            if isinstance(self.tokenizer, PretrainedTokenizer):
                encoded_tokens = self.tokenizer.convert_ids_to_tokens(record['input_ids'])
            elif isinstance(self.tokenizer, JiebaTokenizer):
                encoded_tokens = [self.tokenizer.vocab.to_tokens(id_) for id_ in record['text']]
            else:
                raise RuntimeError(unknown_tokenizer.format(type(self.tokenizer)))

            cursor = 0  # index of the next source token still waiting for a match
            for token in encoded_tokens:
                if cursor < len(tokens) and token == tokens[cursor]:
                    label_ids.append(self.label_list.index(labels[cursor]))
                    cursor += 1
                elif token in (pad_token, ):
                    label_ids.append(self.ignore_label)  # label of special token
                else:
                    label_ids.append(self.label_list.index(self.no_entity_label))
        converted.append(record)
    return converted