def process(self, paths, **kwargs):
    data_info = DataBundle()
    for name in ['train', 'test', 'dev']:
        data_info.datasets[name] = self.load(paths[name])

    config = Config()

    vocab = Vocabulary().from_dataset(*data_info.datasets.values(), field_name='sentences')
    vocab.build_vocab()
    word2id = vocab.word2idx
    char_dict = preprocess.get_char_dict(config.char_path)
    data_info.vocabs = vocab

    genres = {g: i for i, g in enumerate(["bc", "bn", "mz", "nw", "pt", "tc", "wb"])}

    for name, ds in data_info.datasets.items():
        ds.apply(lambda x: preprocess.doc2numpy(x['sentences'], word2id, char_dict,
                                                max(config.filter), config.max_sentences,
                                                is_train=name == 'train')[0],
                 new_field_name='doc_np')
        ds.apply(lambda x: preprocess.doc2numpy(x['sentences'], word2id, char_dict,
                                                max(config.filter), config.max_sentences,
                                                is_train=name == 'train')[1],
                 new_field_name='char_index')
        ds.apply(lambda x: preprocess.doc2numpy(x['sentences'], word2id, char_dict,
                                                max(config.filter), config.max_sentences,
                                                is_train=name == 'train')[2],
                 new_field_name='seq_len')
        ds.apply(lambda x: preprocess.speaker2numpy(x["speakers"], config.max_sentences,
                                                    is_train=name == 'train'),
                 new_field_name='speaker_ids_np')
        ds.apply(lambda x: genres[x["doc_key"][:2]], new_field_name='genre')

        ds.set_ignore_type('clusters')
        ds.set_padder('clusters', None)
        ds.set_input("sentences", "doc_np", "speaker_ids_np", "genre", "char_index", "seq_len")
        ds.set_target("clusters")

    # train_dev, test = self.ds.split(348 / (2802 + 343 + 348), shuffle=False)
    # train, dev = train_dev.split(343 / (2802 + 343), shuffle=False)
    return data_info
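# Illustrative sketch (not part of the original pipeline): the genre id assigned above is simply the
# index of the two-letter prefix of `doc_key` in the fixed OntoNotes genre list. A tiny self-contained
# check of that mapping, assuming doc_key strings shaped like "bc/cctv/00/cctv_0000_0":
def _demo_genre_id(doc_key):
    genres = {g: i for i, g in enumerate(["bc", "bn", "mz", "nw", "pt", "tc", "wb"])}
    return genres[doc_key[:2]]

assert _demo_genre_id("bc/cctv/00/cctv_0000_0") == 0
assert _demo_genre_id("wb/eng/00/eng_0000_0") == 6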
class VocabProcessor(Processor):
    def __init__(self, field_name):
        super(VocabProcessor, self).__init__(field_name, None)
        self.vocab = Vocabulary()

    def process(self, *datasets):
        for dataset in datasets:
            assert isinstance(dataset, DataSet), \
                "Only Dataset class is allowed, not {}.".format(type(dataset))
            for ins in dataset:
                tokens = ins[self.field_name]
                self.vocab.update(tokens)

    def get_vocab(self):
        self.vocab.build_vocab()
        return self.vocab
class VocabProcessor(Processor):
    def __init__(self, field_name, min_freq=1, max_size=None):
        super(VocabProcessor, self).__init__(field_name, None)
        self.vocab = Vocabulary(min_freq=min_freq, max_size=max_size)

    def process(self, *datasets):
        for dataset in datasets:
            assert isinstance(dataset, DataSet), \
                "Only Dataset class is allowed, not {}.".format(type(dataset))
            dataset.apply(lambda ins: self.vocab.update(ins[self.field_name]))

    def get_vocab(self):
        self.vocab.build_vocab()
        return self.vocab

    def get_vocab_size(self):
        return len(self.vocab)
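# Illustrative usage sketch for the VocabProcessor above, assuming fastNLP's DataSet API
# (the "words" field name is a placeholder chosen for this example):
from fastNLP import DataSet

_demo_ds = DataSet({"words": [["I", "am", "happy"], ["He", "is", "happy"]]})
_vocab_proc = VocabProcessor(field_name="words")
_vocab_proc.process(_demo_ds)          # update the counter from every instance
_demo_vocab = _vocab_proc.get_vocab()  # build_vocab() is called here
print(_vocab_proc.get_vocab_size())    # vocabulary size, including the pad/unk entries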
def process(self, data_bundle: DataBundle):
    r"""
    Further process the loaded data. The raw data contains the fields raw_key, raw_speaker,
    raw_words and raw_clusters.

    .. csv-table::
        :header: "raw_key", "raw_speaker", "raw_words", "raw_clusters"

        "bc/cctv/00/cctv_0000_0", "[['Speaker#1', 'Speaker#1'], []]", "[['I', 'am'], []]", "[[[2,3],[6,7]],[[10,12],[20,22]]]"
        "bc/cctv/00/cctv_0000_1", "[['Speaker#1', 'Speaker#1'], []]", "[['He', 'is'], []]", "[[[2,3],[6,7]],[[10,12],[20,22]]]"
        "[...]", "[...]", "[...]", "[...]"

    :param data_bundle:
    :return:
    """
    genres = {g: i for i, g in enumerate(["bc", "bn", "mz", "nw", "pt", "tc", "wb"])}

    vocab = Vocabulary().from_dataset(*data_bundle.datasets.values(), field_name=Const.RAW_WORDS(3))
    vocab.build_vocab()
    word2id = vocab.word2idx
    data_bundle.set_vocab(vocab, Const.INPUTS(0))

    if self.config.char_path:
        char_dict = get_char_dict(self.config.char_path)
    else:
        char_set = set()
        for i, w in enumerate(word2id):
            if i < 2:
                continue
            for c in w:
                char_set.add(c)
        char_dict = collections.defaultdict(int)
        char_dict.update({c: i for i, c in enumerate(char_set)})

    for name, ds in data_bundle.datasets.items():
        # genre
        ds.apply(lambda x: genres[x[Const.RAW_WORDS(0)][:2]], new_field_name=Const.INPUTS(0))

        # speaker_ids_np
        ds.apply(lambda x: speaker2numpy(x[Const.RAW_WORDS(1)], self.config.max_sentences,
                                         is_train=name == 'train'),
                 new_field_name=Const.INPUTS(1))

        # sentences
        ds.rename_field(Const.RAW_WORDS(3), Const.INPUTS(2))

        # doc_np
        ds.apply(lambda x: doc2numpy(x[Const.INPUTS(2)], word2id, char_dict, max(self.config.filter),
                                     self.config.max_sentences, is_train=name == 'train')[0],
                 new_field_name=Const.INPUTS(3))
        # char_index
        ds.apply(lambda x: doc2numpy(x[Const.INPUTS(2)], word2id, char_dict, max(self.config.filter),
                                     self.config.max_sentences, is_train=name == 'train')[1],
                 new_field_name=Const.CHAR_INPUT)
        # seq_len
        ds.apply(lambda x: doc2numpy(x[Const.INPUTS(2)], word2id, char_dict, max(self.config.filter),
                                     self.config.max_sentences, is_train=name == 'train')[2],
                 new_field_name=Const.INPUT_LEN)

        # clusters
        ds.rename_field(Const.RAW_WORDS(2), Const.TARGET)

        ds.set_ignore_type(Const.TARGET)
        ds.set_padder(Const.TARGET, None)
        ds.set_input(Const.INPUTS(0), Const.INPUTS(1), Const.INPUTS(2), Const.INPUTS(3),
                     Const.CHAR_INPUT, Const.INPUT_LEN)
        ds.set_target(Const.TARGET)

    return data_bundle
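# Illustrative standalone restatement of the char_dict fallback above: when no char_path is
# configured, a character index is built from every character of every word in word2id, skipping
# the first two entries (presumably the pad/unk tokens); unseen characters map to index 0 because
# a defaultdict(int) is used. The helper and sample words below are made up for the example.
import collections

def _demo_build_char_dict(word2id):
    char_set = set()
    for i, w in enumerate(word2id):
        if i < 2:  # skip the two special tokens
            continue
        char_set.update(w)
    char_dict = collections.defaultdict(int)
    char_dict.update({c: i for i, c in enumerate(char_set)})
    return char_dict

_demo_chars = _demo_build_char_dict(["<pad>", "<unk>", "he", "is"])
assert set(_demo_chars) == {"h", "e", "i", "s"}
assert _demo_chars["z"] == 0  # unknown characters fall back to index 0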
def process(self, paths, vocab_size, vocab_path, sent_max_len, doc_max_timesteps,
            domain=False, tag=False, load_vocab=True):
    """
    :param paths: dict  path for each dataset
    :param vocab_size: int  max_size for vocab
    :param vocab_path: str  vocab path
    :param sent_max_len: int  max token number of the sentence
    :param doc_max_timesteps: int  max sentence number of the document
    :param domain: bool  build vocab for publication, use 'X' for unknown
    :param tag: bool  build vocab for tag, use 'X' for unknown
    :param load_vocab: bool  build vocab (False) or load vocab (True)
    :return: DataInfo
        datasets: dict  keys correspond to the paths dict
        vocabs: dict  key: vocab (if "train" in paths), domain (if domain=True), tag (if tag=True)
        embeddings: optional
    """

    def _pad_sent(text_wd):
        pad_text_wd = []
        for sent_wd in text_wd:
            if len(sent_wd) < sent_max_len:
                pad_num = sent_max_len - len(sent_wd)
                sent_wd.extend([WORD_PAD] * pad_num)
            else:
                sent_wd = sent_wd[:sent_max_len]
            pad_text_wd.append(sent_wd)
        return pad_text_wd

    def _token_mask(text_wd):
        token_mask_list = []
        for sent_wd in text_wd:
            token_num = len(sent_wd)
            if token_num < sent_max_len:
                mask = [1] * token_num + [0] * (sent_max_len - token_num)
            else:
                mask = [1] * sent_max_len
            token_mask_list.append(mask)
        return token_mask_list

    def _pad_label(label):
        text_len = len(label)
        if text_len < doc_max_timesteps:
            pad_label = label + [0] * (doc_max_timesteps - text_len)
        else:
            pad_label = label[:doc_max_timesteps]
        return pad_label

    def _pad_doc(text_wd):
        text_len = len(text_wd)
        if text_len < doc_max_timesteps:
            padding = [WORD_PAD] * sent_max_len
            pad_text = text_wd + [padding] * (doc_max_timesteps - text_len)
        else:
            pad_text = text_wd[:doc_max_timesteps]
        return pad_text

    def _sent_mask(text_wd):
        text_len = len(text_wd)
        if text_len < doc_max_timesteps:
            sent_mask = [1] * text_len + [0] * (doc_max_timesteps - text_len)
        else:
            sent_mask = [1] * doc_max_timesteps
        return sent_mask

    datasets = {}
    train_ds = None
    for key, value in paths.items():
        ds = self.load(value)
        # pad sent
        ds.apply(lambda x: _pad_sent(x["text_wd"]), new_field_name="pad_text_wd")
        ds.apply(lambda x: _token_mask(x["text_wd"]), new_field_name="pad_token_mask")
        # pad document
        ds.apply(lambda x: _pad_doc(x["pad_text_wd"]), new_field_name="pad_text")
        ds.apply(lambda x: _sent_mask(x["pad_text_wd"]), new_field_name="seq_len")
        ds.apply(lambda x: _pad_label(x["flatten_label"]), new_field_name="pad_label")
        # rename field
        ds.rename_field("pad_text", Const.INPUT)
        ds.rename_field("seq_len", Const.INPUT_LEN)
        ds.rename_field("pad_label", Const.TARGET)
        # set input and target
        ds.set_input(Const.INPUT, Const.INPUT_LEN)
        ds.set_target(Const.TARGET, Const.INPUT_LEN)
        datasets[key] = ds
        if "train" in key:
            train_ds = datasets[key]

    vocab_dict = {}
    if not load_vocab:
        logger.info("[INFO] Build new vocab from training dataset!")
        if train_ds is None:
            raise ValueError("Lack train file to build vocabulary!")
        vocabs = Vocabulary(max_size=vocab_size, padding=WORD_PAD, unknown=WORD_UNK)
        vocabs.from_dataset(train_ds, field_name=["text_wd", "summary_wd"])
        vocab_dict["vocab"] = vocabs
    else:
        logger.info("[INFO] Load existing vocab from %s!" % vocab_path)
        word_list = []
        with open(vocab_path, 'r', encoding='utf8') as vocab_f:
            cnt = 2  # pad and unk
            for line in vocab_f:
                pieces = line.split("\t")
                word_list.append(pieces[0])
                cnt += 1
                if cnt > vocab_size:
                    break
        vocabs = Vocabulary(max_size=vocab_size, padding=WORD_PAD, unknown=WORD_UNK)
        vocabs.add_word_lst(word_list)
        vocabs.build_vocab()
        vocab_dict["vocab"] = vocabs

    if domain:
        domaindict = Vocabulary(padding=None, unknown=DOMAIN_UNK)
        domaindict.from_dataset(train_ds, field_name="publication")
        vocab_dict["domain"] = domaindict
    if tag:
        tagdict = Vocabulary(padding=None, unknown=TAG_UNK)
        tagdict.from_dataset(train_ds, field_name="tag")
        vocab_dict["tag"] = tagdict

    for ds in datasets.values():
        vocab_dict["vocab"].index_dataset(ds, field_name=Const.INPUT, new_field_name=Const.INPUT)

    return DataInfo(vocabs=vocab_dict, datasets=datasets)
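# Illustrative sketch of the padding/masking convention used by the inner helpers above
# (standalone re-implementation for clarity; "[PAD]" stands in for the WORD_PAD constant):
def _demo_pad_sent(sent_wd, sent_max_len, pad_token="[PAD]"):
    # pad a single tokenised sentence to sent_max_len, or truncate it
    if len(sent_wd) < sent_max_len:
        return sent_wd + [pad_token] * (sent_max_len - len(sent_wd))
    return sent_wd[:sent_max_len]

def _demo_token_mask(sent_wd, sent_max_len):
    # 1 for real tokens, 0 for padding positions
    token_num = min(len(sent_wd), sent_max_len)
    return [1] * token_num + [0] * (sent_max_len - token_num)

assert _demo_pad_sent(["a", "b"], 4) == ["a", "b", "[PAD]", "[PAD]"]
assert _demo_token_mask(["a", "b"], 4) == [1, 1, 0, 0]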
class VocabIndexerProcessor(Processor):
    """
    Build a Vocabulary from a DataSet and use it to index the data. The newly generated index field is
    stored under new_added_filed_name; if new_added_field_name is not provided, the original field_name
    is overwritten.
    """

    def __init__(self, field_name, new_added_filed_name=None, min_freq=1, max_size=None,
                 verbose=0, is_input=True):
        """
        :param field_name: the field from which the vocabulary is built and which will be indexed
        :param new_added_filed_name: name of the generated index field; if not given, field_name is overwritten
        :param min_freq: minimum word frequency allowed in the Vocabulary
        :param max_size: maximum number of words allowed in the Vocabulary
        :param verbose: 0 to print nothing; 1 to print information
        :param bool is_input:
        """
        super(VocabIndexerProcessor, self).__init__(field_name, new_added_filed_name)
        self.min_freq = min_freq
        self.max_size = max_size
        self.verbose = verbose
        self.is_input = is_input

    def construct_vocab(self, *datasets):
        """
        Build the vocabulary from the given DataSets.

        :param datasets: DataSet objects used to build the vocabulary
        :return:
        """
        self.vocab = Vocabulary(min_freq=self.min_freq, max_size=self.max_size)
        for dataset in datasets:
            assert isinstance(dataset, DataSet), \
                "Only Dataset class is allowed, not {}.".format(type(dataset))
            dataset.apply(lambda ins: self.vocab.update(ins[self.field_name]))
        self.vocab.build_vocab()
        if self.verbose:
            print("Vocabulary Constructed, has {} items.".format(len(self.vocab)))

    def process(self, *datasets, only_index_dataset=None):
        """
        If no Vocabulary has been built yet, build one from the DataSets in ``datasets``; otherwise reuse
        the existing Vocabulary. The resulting vocabulary is then used to index both ``datasets`` and
        ``only_index_dataset``.

        :param datasets: DataSet objects
        :param only_index_dataset: DataSet, or list of DataSet. These are only indexed; they are not used
            to build the vocabulary.
        :return:
        """
        if len(datasets) == 0 and not hasattr(self, 'vocab'):
            raise RuntimeError("You have to construct vocabulary first. Or you have to pass datasets to construct it.")
        if not hasattr(self, 'vocab'):
            self.construct_vocab(*datasets)
        else:
            if self.verbose:
                print("Using constructed vocabulary with {} items.".format(len(self.vocab)))

        to_index_datasets = []
        if len(datasets) != 0:
            for dataset in datasets:
                assert isinstance(dataset, DataSet), \
                    "Only DataSet class is allowed, not {}.".format(type(dataset))
                to_index_datasets.append(dataset)

        if only_index_dataset is not None:
            if isinstance(only_index_dataset, list):
                for dataset in only_index_dataset:
                    assert isinstance(dataset, DataSet), \
                        "Only DataSet class is allowed, not {}.".format(type(dataset))
                    to_index_datasets.append(dataset)
            elif isinstance(only_index_dataset, DataSet):
                to_index_datasets.append(only_index_dataset)
            else:
                raise TypeError('Only DataSet or list of DataSet is allowed, not {}.'.format(type(only_index_dataset)))

        for dataset in to_index_datasets:
            assert isinstance(dataset, DataSet), \
                "Only DataSet class is allowed, not {}.".format(type(dataset))
            dataset.apply(lambda ins: [self.vocab.to_index(token) for token in ins[self.field_name]],
                          new_field_name=self.new_added_field_name, is_input=self.is_input)
        # return a single DataSet so that infer behaves consistently with the other processors
        if len(to_index_datasets) == 1:
            return to_index_datasets[0]

    def set_vocab(self, vocab):
        assert isinstance(vocab, Vocabulary), \
            "Only fastNLP.core.Vocabulary is allowed, not {}.".format(type(vocab))
        self.vocab = vocab

    def delete_vocab(self):
        del self.vocab

    def get_vocab_size(self):
        return len(self.vocab)

    def set_verbose(self, verbose):
        """
        Set the verbosity of the processor.

        :param verbose: int, 0 to print nothing; 1 to print vocab information.
        :return:
        """
        self.verbose = verbose
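# Illustrative usage sketch for VocabIndexerProcessor, assuming fastNLP's DataSet API
# (the "words" field name and the toy sentences are placeholders chosen for this example):
from fastNLP import DataSet

_train = DataSet({"words": [["I", "am", "happy"], ["He", "is", "sad"]]})
_test = DataSet({"words": [["She", "is", "happy"]]})

_indexer = VocabIndexerProcessor(field_name="words", verbose=1)
# builds the vocabulary from _train and indexes it; _test is only indexed, not counted
_indexer.process(_train, only_index_dataset=_test)
# with no new_added_filed_name given, the "words" field is overwritten with index lists
print(_indexer.get_vocab_size())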
def process(self, paths: Union[str, Dict[str, str]], dataset_name: str = None,
            to_lower=False, seq_len_type: str = None, bert_tokenizer: str = None,
            cut_text: int = None, get_index=True, auto_pad_length: int = None,
            auto_pad_token: str = '<pad>', set_input: Union[list, str, bool] = True,
            set_target: Union[list, str, bool] = True,
            concat: Union[str, list, bool] = None) -> DataBundle:
    """
    :param paths: str or Dict[str, str]. If str, it is either the folder containing the datasets or the
        full path of a single file: for a folder, the dataset names and file names are looked up in
        self.paths; for a Dict, the keys are dataset names (e.g. train, dev, test) and the values are
        full file paths.
    :param str dataset_name: if paths is the full path of a single dataset file, dataset_name names that
        dataset; if not given, it defaults to train.
    :param bool to_lower: whether to lowercase the text. Defaults to False.
    :param str seq_len_type: the kind of seq_len to provide. ``seq_len``: a single number as the sentence
        length; ``mask``: a 0/1 mask matrix as the sentence length; ``bert``: segment_type_id (0 for the
        first sentence, 1 for the second) plus an attention mask (a 0/1 mask matrix). Defaults to None,
        i.e. no seq_len is provided.
    :param str bert_tokenizer: path of the folder containing the vocabulary used by the BERT tokenizer
    :param int cut_text: truncate content longer than cut_text. Defaults to None, i.e. no truncation.
    :param bool get_index: whether to convert the text to indices using the vocabulary
    :param int auto_pad_length: pad the text to this length (longer text is truncated). Defaults to no
        automatic padding.
    :param str auto_pad_token: the token used for automatic padding
    :param set_input: if True, fields whose names contain Const.INPUT are automatically set as input; if
        False, no field is set as input. If a str or List[str] is given, the corresponding fields are set
        as input and all other fields are not. Defaults to True.
    :param set_target: controls which fields are set as target, with the same semantics as set_input.
        Defaults to True.
    :param concat: whether to concatenate the two sentences. If False, they are not concatenated. If True,
        a <sep> token is inserted between them. If a list of length 4 is given, its elements are the
        markers inserted before the first sentence, after the first sentence, before the second sentence
        and after the second sentence, respectively. If the string ``bert`` is given, BERT-style
        concatenation is used, equivalent to ['[CLS]', '[SEP]', '', '[SEP]'].
    :return:
    """
    if isinstance(set_input, str):
        set_input = [set_input]
    if isinstance(set_target, str):
        set_target = [set_target]
    if isinstance(set_input, bool):
        auto_set_input = set_input
    else:
        auto_set_input = False
    if isinstance(set_target, bool):
        auto_set_target = set_target
    else:
        auto_set_target = False

    if isinstance(paths, str):
        if os.path.isdir(paths):
            path = {n: os.path.join(paths, self.paths[n]) for n in self.paths.keys()}
        else:
            path = {dataset_name if dataset_name is not None else 'train': paths}
    else:
        path = paths

    data_info = DataBundle()
    for data_name in path.keys():
        data_info.datasets[data_name] = self._load(path[data_name])

    for data_name, data_set in data_info.datasets.items():
        if auto_set_input:
            data_set.set_input(Const.INPUTS(0), Const.INPUTS(1))
        if auto_set_target:
            if Const.TARGET in data_set.get_field_names():
                data_set.set_target(Const.TARGET)

    if to_lower:
        for data_name, data_set in data_info.datasets.items():
            data_set.apply(lambda x: [w.lower() for w in x[Const.INPUTS(0)]],
                           new_field_name=Const.INPUTS(0), is_input=auto_set_input)
            data_set.apply(lambda x: [w.lower() for w in x[Const.INPUTS(1)]],
                           new_field_name=Const.INPUTS(1), is_input=auto_set_input)

    if bert_tokenizer is not None:
        if bert_tokenizer.lower() in PRETRAINED_BERT_MODEL_DIR:
            PRETRAIN_URL = _get_base_url('bert')
            model_name = PRETRAINED_BERT_MODEL_DIR[bert_tokenizer]
            model_url = PRETRAIN_URL + model_name
            model_dir = cached_path(model_url)
            # check whether the directory exists
        elif os.path.isdir(bert_tokenizer):
            model_dir = bert_tokenizer
        else:
            raise ValueError(f"Cannot recognize BERT tokenizer from {bert_tokenizer}.")

        words_vocab = Vocabulary(padding='[PAD]', unknown='[UNK]')
        with open(os.path.join(model_dir, 'vocab.txt'), 'r') as f:
            lines = f.readlines()
        lines = [line.strip() for line in lines]
        words_vocab.add_word_lst(lines)
        words_vocab.build_vocab()

        tokenizer = BertTokenizer.from_pretrained(model_dir)

        for data_name, data_set in data_info.datasets.items():
            for fields in data_set.get_field_names():
                if Const.INPUT in fields:
                    data_set.apply(lambda x: tokenizer.tokenize(' '.join(x[fields])),
                                   new_field_name=fields, is_input=auto_set_input)

    if isinstance(concat, bool):
        concat = 'default' if concat else None
    if concat is not None:
        if isinstance(concat, str):
            CONCAT_MAP = {'bert': ['[CLS]', '[SEP]', '', '[SEP]'],
                          'default': ['', '<sep>', '', '']}
            if concat.lower() in CONCAT_MAP:
                concat = CONCAT_MAP[concat]
            else:
                concat = 4 * [concat]
        assert len(concat) == 4, \
            f'Please provide a list with 4 symbols: the symbol at the beginning of the first sentence, ' \
            f'the end of the first sentence, the beginning of the second sentence, and the end of the ' \
            f'second sentence. Your input is {concat}'

        for data_name, data_set in data_info.datasets.items():
            data_set.apply(lambda x: [concat[0]] + x[Const.INPUTS(0)] + [concat[1]] +
                                     [concat[2]] + x[Const.INPUTS(1)] + [concat[3]],
                           new_field_name=Const.INPUT)
            data_set.apply(lambda x: [w for w in x[Const.INPUT] if len(w) > 0],
                           new_field_name=Const.INPUT, is_input=auto_set_input)

    if seq_len_type is not None:
        if seq_len_type == 'seq_len':
            for data_name, data_set in data_info.datasets.items():
                for fields in data_set.get_field_names():
                    if Const.INPUT in fields:
                        data_set.apply(lambda x: len(x[fields]),
                                       new_field_name=fields.replace(Const.INPUT, Const.INPUT_LEN),
                                       is_input=auto_set_input)
        elif seq_len_type == 'mask':
            for data_name, data_set in data_info.datasets.items():
                for fields in data_set.get_field_names():
                    if Const.INPUT in fields:
                        data_set.apply(lambda x: [1] * len(x[fields]),
                                       new_field_name=fields.replace(Const.INPUT, Const.INPUT_LEN),
                                       is_input=auto_set_input)
        elif seq_len_type == 'bert':
            for data_name, data_set in data_info.datasets.items():
                if Const.INPUT not in data_set.get_field_names():
                    raise KeyError(f'Field ``{Const.INPUT}`` not in {data_name} data set: '
                                   f'got {data_set.get_field_names()}')
                data_set.apply(lambda x: [0] * (len(x[Const.INPUTS(0)]) + 2) + [1] * (len(x[Const.INPUTS(1)]) + 1),
                               new_field_name=Const.INPUT_LENS(0), is_input=auto_set_input)
                data_set.apply(lambda x: [1] * len(x[Const.INPUT_LENS(0)]),
                               new_field_name=Const.INPUT_LENS(1), is_input=auto_set_input)

    if auto_pad_length is not None:
        cut_text = min(auto_pad_length, cut_text if cut_text is not None else auto_pad_length)

    if cut_text is not None:
        for data_name, data_set in data_info.datasets.items():
            for fields in data_set.get_field_names():
                if (Const.INPUT in fields) or ((Const.INPUT_LEN in fields) and (seq_len_type != 'seq_len')):
                    data_set.apply(lambda x: x[fields][:cut_text], new_field_name=fields,
                                   is_input=auto_set_input)

    data_set_list = [d for n, d in data_info.datasets.items()]
    assert len(data_set_list) > 0, f'There are NO data sets in data info!'

    if bert_tokenizer is None:
        words_vocab = Vocabulary(padding=auto_pad_token)
        words_vocab = words_vocab.from_dataset(*[d for n, d in data_info.datasets.items() if 'train' in n],
                                               field_name=[n for n in data_set_list[0].get_field_names()
                                                           if (Const.INPUT in n)],
                                               no_create_entry_dataset=[d for n, d in data_info.datasets.items()
                                                                        if 'train' not in n])
    target_vocab = Vocabulary(padding=None, unknown=None)
    target_vocab = target_vocab.from_dataset(*[d for n, d in data_info.datasets.items() if 'train' in n],
                                             field_name=Const.TARGET)
    data_info.vocabs = {Const.INPUT: words_vocab, Const.TARGET: target_vocab}

    if get_index:
        for data_name, data_set in data_info.datasets.items():
            for fields in data_set.get_field_names():
                if Const.INPUT in fields:
                    data_set.apply(lambda x: [words_vocab.to_index(w) for w in x[fields]],
                                   new_field_name=fields, is_input=auto_set_input)

            if Const.TARGET in data_set.get_field_names():
                data_set.apply(lambda x: target_vocab.to_index(x[Const.TARGET]),
                               new_field_name=Const.TARGET,
                               is_input=auto_set_input, is_target=auto_set_target)

    if auto_pad_length is not None:
        if seq_len_type == 'seq_len':
            raise RuntimeError(f'the sequence will be padded with the length {auto_pad_length}, '
                               f'so the seq_len_type cannot be `{seq_len_type}`!')
        for data_name, data_set in data_info.datasets.items():
            for fields in data_set.get_field_names():
                if Const.INPUT in fields:
                    data_set.apply(lambda x: x[fields] + [words_vocab.to_index(words_vocab.padding)] *
                                   (auto_pad_length - len(x[fields])),
                                   new_field_name=fields, is_input=auto_set_input)
                elif (Const.INPUT_LEN in fields) and (seq_len_type != 'seq_len'):
                    data_set.apply(lambda x: x[fields] + [0] * (auto_pad_length - len(x[fields])),
                                   new_field_name=fields, is_input=auto_set_input)

    for data_name, data_set in data_info.datasets.items():
        if isinstance(set_input, list):
            data_set.set_input(*[inputs for inputs in set_input
                                 if inputs in data_set.get_field_names()])
        if isinstance(set_target, list):
            data_set.set_target(*[target for target in set_target
                                  if target in data_set.get_field_names()])

    return data_info
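# Illustrative sketch of the ``bert`` seq_len_type above: for a pair (sent1, sent2) concatenated as
# [CLS] sent1 [SEP] sent2 [SEP], the first generated field holds the segment ids (0 for
# [CLS]+sent1+[SEP], 1 for sent2+[SEP]) and the second is an all-ones attention mask of the same
# length. The helper below is a standalone restatement, not part of the loader.
def _demo_bert_seq_len(sent1, sent2):
    segment_ids = [0] * (len(sent1) + 2) + [1] * (len(sent2) + 1)
    attention_mask = [1] * len(segment_ids)
    return segment_ids, attention_mask

_seg, _mask = _demo_bert_seq_len(["a", "b"], ["c"])
assert _seg == [0, 0, 0, 0, 1, 1]
assert _mask == [1, 1, 1, 1, 1, 1]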