def __init__(self,
             dataset,
             vocab_path,
             label_map_config=None,
             max_seq_len=512,
             do_lower_case=True,
             random_seed=None):
    """Build the tokenizer, cache special-token ids and derive the label map.

    Args:
        dataset: dataset object exposing ``get_labels()``.
        vocab_path: path to the tokenizer vocabulary file.
        label_map_config: accepted for interface compatibility; not read here.
        max_seq_len: maximum sequence length kept by the reader.
        do_lower_case: whether the tokenizer lower-cases input text.
        random_seed: seed forwarded to ``np.random.seed``.
    """
    self.max_seq_len = max_seq_len
    self.tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab_path, do_lower_case=do_lower_case)
    self.vocab = self.tokenizer.vocab
    self.dataset = dataset
    # Ids of the special tokens used when records are assembled later.
    self.pad_id = self.vocab["[PAD]"]
    self.cls_id = self.vocab["[CLS]"]
    self.sep_id = self.vocab["[SEP]"]
    self.in_tokens = False
    np.random.seed(random_seed)

    # Assign each dataset label a contiguous integer index.
    self.label_map = {
        label: index
        for index, label in enumerate(self.dataset.get_labels())
    }
    logger.info("Dataset label map = {}".format(self.label_map))

    self.current_example = 0
    self.current_epoch = 0
    # -1 means "not counted yet" for each split.
    self.num_examples = {'train': -1, 'dev': -1, 'test': -1}
def __init__(self,
             vocab_path,
             dataset=None,
             label_map_config=None,
             max_seq_len=512,
             do_lower_case=True,
             random_seed=None,
             use_task_id=False,
             sp_model_path=None,
             word_dict_path=None,
             in_tokens=False):
    """Delegate setup to the base reader, then adjust the tokenizer.

    When both ``sp_model_path`` and ``word_dict_path`` are given, the
    tokenizer is replaced with a ``FullTokenizer`` that uses the
    sentence-piece vocabulary.
    """
    base_kwargs = dict(
        vocab_path=vocab_path,
        dataset=dataset,
        label_map_config=label_map_config,
        max_seq_len=max_seq_len,
        do_lower_case=do_lower_case,
        random_seed=random_seed,
        use_task_id=use_task_id,
        sp_model_path=sp_model_path,
        word_dict_path=word_dict_path,
        in_tokens=in_tokens)
    super(SequenceLabelReader, self).__init__(**base_kwargs)

    # Sequence labeling overrides the base tokenizer choice when the
    # sentence-piece resources are available.
    if sp_model_path and word_dict_path:
        self.tokenizer = tokenization.FullTokenizer(
            vocab_file=vocab_path,
            do_lower_case=do_lower_case,
            use_sentence_piece_vocab=True)
def __init__(self, dataset, vocab_path):
    """Prepare the LAC module, a case-preserving tokenizer and feed metadata."""
    self.dataset = dataset
    # Lexical-analysis module used to preprocess raw text.
    self.lac = hub.Module(name="lac")
    self.tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab_path, do_lower_case=False)
    self.vocab = self.tokenizer.vocab
    # First input key of the LAC lexical-analysis signature.
    data_format = self.lac.processor.data_format(
        sign_name="lexical_analysis")
    self.feed_key = next(iter(data_format.keys()))
    # -1 means "not counted yet" for each split.
    self.num_examples = {'train': -1, 'dev': -1, 'test': -1}
def __init__(self, vocab_path, dataset=None, in_tokens=False):
    """Initialize the base reader, then attach the LAC module and tokenizer."""
    super(LACClassifyReader, self).__init__(dataset)
    self.in_tokens = in_tokens
    # Lexical-analysis module used to preprocess raw text.
    self.lac = hub.Module(name="lac")
    self.tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab_path, do_lower_case=False)
    self.vocab = self.tokenizer.vocab
    # First input key of the LAC lexical-analysis signature.
    data_format = self.lac.processor.data_format(
        sign_name="lexical_analysis")
    self.feed_key = next(iter(data_format.keys()))
def __init__(self, vocab_path, dataset=None, in_tokens=False):
    """Initialize the base reader, LAC module, tokenizer and phase flags."""
    super(LACClassifyReader, self).__init__(dataset)
    self.in_tokens = in_tokens
    # Lexical-analysis module used to preprocess raw text.
    self.lac = hub.Module(name="lac")
    self.tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab_path, do_lower_case=False)
    self.vocab = self.tokenizer.vocab
    # Tracks whether each phase's data has been preprocessed already.
    self.has_processed = dict.fromkeys(
        ("train", "dev", "val", "test", "predict"), False)
def _initialize(self):
    """Resolve module asset paths, load the vocab and configure the predictor."""
    assets_dir = os.path.join(self.directory, "assets")
    self.pretrained_model_path = os.path.join(self.directory, "infer_model")
    self.tokenizer_vocab_path = os.path.join(assets_dir, "vocab.txt")
    self.vocab_path = os.path.join(assets_dir, "word_dict.txt")
    self.vocab = load_vocab(self.vocab_path)
    self.sequence_max_len = 256
    self.tokenizer = tokenization.FullTokenizer(self.tokenizer_vocab_path)
    self.param_file = os.path.join(assets_dir, "params.txt")
    self._set_config()
def __init__(self,
             vocab_path,
             dataset=None,
             label_map_config=None,
             max_seq_len=512,
             do_lower_case=True,
             random_seed=None,
             use_task_id=False,
             sp_model_path=None,
             word_dict_path=None,
             in_tokens=False):
    """Initialize the base NLP reader: tokenizer, special-token ids, record types.

    Args:
        vocab_path: path to the tokenizer vocabulary file.
        dataset: optional dataset forwarded to the parent reader.
        label_map_config: accepted for interface compatibility; not read here.
        max_seq_len: maximum sequence length kept by the reader.
        do_lower_case: whether the plain tokenizer lower-cases input text.
        random_seed: seed forwarded to the parent reader.
        use_task_id: deprecated since PaddleHub v1.4.0; only triggers a warning.
        sp_model_path: optional sentence-piece model path.
        word_dict_path: optional word dictionary path.
        in_tokens: whether batch size is counted in tokens instead of examples.
    """
    super(BaseNLPReader, self).__init__(dataset, random_seed)
    self.max_seq_len = max_seq_len
    # Use the word-segmentation + sentence-piece tokenizer only when both
    # auxiliary resources are supplied; otherwise fall back to WordPiece.
    if sp_model_path and word_dict_path:
        self.tokenizer = tokenization.WSSPTokenizer(
            vocab_path, sp_model_path, word_dict_path, ws=True, lower=True)
    else:
        self.tokenizer = tokenization.FullTokenizer(
            vocab_file=vocab_path, do_lower_case=do_lower_case)
    self.vocab = self.tokenizer.vocab
    # Ids of the special tokens used when records are assembled later.
    self.pad_id = self.vocab["[PAD]"]
    self.cls_id = self.vocab["[CLS]"]
    self.sep_id = self.vocab["[SEP]"]
    self.mask_id = self.vocab["[MASK]"]
    self.in_tokens = in_tokens
    self.use_task_id = use_task_id

    if self.use_task_id:
        # Fixed garbled deprecation message ("de discarded" / "no necessary").
        logger.warning(
            "use_task_id has been discarded since PaddleHub v1.4.0, it's not necessary to feed task_ids now."
        )
        self.task_id = 0

    # Record layouts with and without a label id.
    self.Record_With_Label_Id = namedtuple(
        'Record',
        ['token_ids', 'text_type_ids', 'position_ids', 'label_id'])
    self.Record_Wo_Label_Id = namedtuple(
        'Record', ['token_ids', 'text_type_ids', 'position_ids'])