Example #1
    def __init__(self,
                 dataset,
                 vocab_path,
                 label_map_config=None,
                 max_seq_len=512,
                 do_lower_case=True,
                 random_seed=None):
        self.max_seq_len = max_seq_len
        self.tokenizer = tokenization.FullTokenizer(
            vocab_file=vocab_path, do_lower_case=do_lower_case)
        self.vocab = self.tokenizer.vocab
        self.dataset = dataset
        self.pad_id = self.vocab["[PAD]"]
        self.cls_id = self.vocab["[CLS]"]
        self.sep_id = self.vocab["[SEP]"]
        self.in_tokens = False

        np.random.seed(random_seed)

        # generate label map
        self.label_map = {}
        for index, label in enumerate(self.dataset.get_labels()):
            self.label_map[label] = index
        logger.info("Dataset label map = {}".format(self.label_map))

        self.current_example = 0
        self.current_epoch = 0

        self.num_examples = {'train': -1, 'dev': -1, 'test': -1}
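A minimal sketch (not part of the project above) of how the fields prepared in this __init__ are typically used downstream. It assumes a BERT-style tokenization.FullTokenizer that exposes tokenize() and convert_tokens_to_ids(); the helper name and the reader argument are illustrative only.

    def encode_text(reader, text):
        # Wordpiece-tokenize the raw text, keeping room for [CLS] and [SEP].
        tokens = reader.tokenizer.tokenize(text)[:reader.max_seq_len - 2]
        # Convert to vocabulary ids and wrap with the special-token ids
        # cached in __init__ (cls_id, sep_id).
        token_ids = [reader.cls_id] + reader.tokenizer.convert_tokens_to_ids(
            tokens) + [reader.sep_id]
        # Pad to max_seq_len with the [PAD] id.
        token_ids += [reader.pad_id] * (reader.max_seq_len - len(token_ids))
        return token_ids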
Example #2
File: nlp_reader.py Project: wuhuaha/beike
    def __init__(self,
                 vocab_path,
                 dataset=None,
                 label_map_config=None,
                 max_seq_len=512,
                 do_lower_case=True,
                 random_seed=None,
                 use_task_id=False,
                 sp_model_path=None,
                 word_dict_path=None,
                 in_tokens=False):
        super(SequenceLabelReader, self).__init__(
            vocab_path=vocab_path,
            dataset=dataset,
            label_map_config=label_map_config,
            max_seq_len=max_seq_len,
            do_lower_case=do_lower_case,
            random_seed=random_seed,
            use_task_id=use_task_id,
            sp_model_path=sp_model_path,
            word_dict_path=word_dict_path,
            in_tokens=in_tokens)
        if sp_model_path and word_dict_path:
            self.tokenizer = tokenization.FullTokenizer(
                vocab_file=vocab_path,
                do_lower_case=do_lower_case,
                use_sentence_piece_vocab=True)
Example #3
    def __init__(self, dataset, vocab_path):
        self.dataset = dataset
        self.lac = hub.Module(name="lac")
        self.tokenizer = tokenization.FullTokenizer(vocab_file=vocab_path,
                                                    do_lower_case=False)
        self.vocab = self.tokenizer.vocab
        self.feed_key = list(
            self.lac.processor.data_format(
                sign_name="lexical_analysis").keys())[0]

        self.num_examples = {'train': -1, 'dev': -1, 'test': -1}
Example #4
    def __init__(self, vocab_path, dataset=None, in_tokens=False):
        super(LACClassifyReader, self).__init__(dataset)
        self.in_tokens = in_tokens

        self.lac = hub.Module(name="lac")
        self.tokenizer = tokenization.FullTokenizer(vocab_file=vocab_path,
                                                    do_lower_case=False)
        self.vocab = self.tokenizer.vocab
        self.feed_key = list(
            self.lac.processor.data_format(
                sign_name="lexical_analysis").keys())[0]
Example #5
File: nlp_reader.py Project: wuhuaha/beike
    def __init__(self, vocab_path, dataset=None, in_tokens=False):
        super(LACClassifyReader, self).__init__(dataset)
        self.in_tokens = in_tokens

        self.lac = hub.Module(name="lac")
        self.tokenizer = tokenization.FullTokenizer(
            vocab_file=vocab_path, do_lower_case=False)
        self.vocab = self.tokenizer.vocab
        self.has_processed = {
            "train": False,
            "dev": False,
            "val": False,
            "test": False,
            "predict": False
        }
Example #6
    def _initialize(self):
        """
        initialize with the necessary elements
        """
        self.pretrained_model_path = os.path.join(self.directory, "infer_model")
        self.tokenizer_vocab_path = os.path.join(self.directory, "assets",
                                                 "vocab.txt")
        self.vocab_path = os.path.join(self.directory, "assets",
                                       "word_dict.txt")
        self.vocab = load_vocab(self.vocab_path)
        self.sequence_max_len = 256
        self.tokenizer = tokenization.FullTokenizer(self.tokenizer_vocab_path)

        self.param_file = os.path.join(self.directory, "assets", "params.txt")

        self._set_config()
Example #7
    def __init__(self,
                 vocab_path,
                 dataset=None,
                 label_map_config=None,
                 max_seq_len=512,
                 do_lower_case=True,
                 random_seed=None,
                 use_task_id=False,
                 sp_model_path=None,
                 word_dict_path=None,
                 in_tokens=False):
        super(BaseNLPReader, self).__init__(dataset, random_seed)
        self.max_seq_len = max_seq_len
        if sp_model_path and word_dict_path:
            self.tokenizer = tokenization.WSSPTokenizer(vocab_path,
                                                        sp_model_path,
                                                        word_dict_path,
                                                        ws=True,
                                                        lower=True)
        else:
            self.tokenizer = tokenization.FullTokenizer(
                vocab_file=vocab_path, do_lower_case=do_lower_case)
        self.vocab = self.tokenizer.vocab
        self.pad_id = self.vocab["[PAD]"]
        self.cls_id = self.vocab["[CLS]"]
        self.sep_id = self.vocab["[SEP]"]
        self.mask_id = self.vocab["[MASK]"]
        self.in_tokens = in_tokens
        self.use_task_id = use_task_id

        if self.use_task_id:
            logger.warning(
                "use_task_id has been de discarded since PaddleHub v1.4.0, it's no necessary to feed task_ids now."
            )
            self.task_id = 0

        self.Record_With_Label_Id = namedtuple(
            'Record',
            ['token_ids', 'text_type_ids', 'position_ids', 'label_id'])
        self.Record_Wo_Label_Id = namedtuple(
            'Record', ['token_ids', 'text_type_ids', 'position_ids'])
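A minimal sketch (an assumption, not code from PaddleHub itself) of how the Record namedtuples declared above are typically populated for a single sentence, following the usual BERT input convention; tokenize() and convert_tokens_to_ids() are assumed to exist on the tokenizer, and the helper name is illustrative only.

    def build_record(reader, text, label_id=None):
        # Wordpiece tokens, truncated to leave room for [CLS] and [SEP].
        tokens = reader.tokenizer.tokenize(text)[:reader.max_seq_len - 2]
        token_ids = [reader.cls_id] + reader.tokenizer.convert_tokens_to_ids(
            tokens) + [reader.sep_id]
        # Single-sentence input: all segment (text type) ids are 0.
        text_type_ids = [0] * len(token_ids)
        # Positions are simply 0..len-1.
        position_ids = list(range(len(token_ids)))
        if label_id is None:
            return reader.Record_Wo_Label_Id(token_ids, text_type_ids,
                                             position_ids)
        return reader.Record_With_Label_Id(token_ids, text_type_ids,
                                           position_ids, label_id)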