Example #1
 class Config(Tensorizer.Config):
     #: The name of the slot label column to parse from the data source.
     slot_column: str = "slots"
     #: The name of the text column to parse from the data source.
     #: We need this to be able to generate tensors which correspond to input text.
     text_column: str = "text"
     #: The tokenizer to use to split input text into tokens. This should be
     #: configured in a way which yields tokens consistent with the tokens input to
     #: or output by a model, so that the labels generated by this tensorizer
     #: will match the indices of the model's tokens.
     tokenizer: Tokenizer.Config = Tokenizer.Config()
     #: Whether to allow for unknown labels at test/prediction time
     allow_unknown: bool = False
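These Config classes are turned into working tensorizers with the from_config pattern shown in Example #7 below. A minimal usage sketch for this slot-label config; the enclosing class name SlotLabelTensorizer and the overridden values are assumptions for illustration, not taken from the examples above.

 # Usage sketch (assumption: a SlotLabelTensorizer-like class exposes this Config
 # and is constructed via from_config, as in Example #7).
 slot_tensorizer = SlotLabelTensorizer.from_config(
     SlotLabelTensorizer.Config(
         slot_column="slots",
         text_column="text",
         tokenizer=Tokenizer.Config(split_regex=r"\s+"),  # split on whitespace
         allow_unknown=True,  # tolerate slot labels unseen during training
     )
 )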
Example #2
 class Config(Tensorizer.Config):
     column: str = "text_seq"
     max_seq_len: Optional[int] = None
     #: Sentence boundary markers (BOS/EOS)
     add_bos_token: bool = False
     add_eos_token: bool = False
     use_eos_token_for_bos: bool = False
     #: List boundary markers (BOL/EOL)
     add_bol_token: bool = False
     add_eol_token: bool = False
     use_eol_token_for_bol: bool = False
     #: The tokenizer to use to split input text into tokens.
     tokenizer: Tokenizer.Config = Tokenizer.Config()
Example #3
 class Config(BERTTensorizer.Config):
     vocab_file: str = "/mnt/vol/nlp_technologies/xlm/vocab_xnli_15"
     tokenizer: Tokenizer.Config = Tokenizer.Config()
     is_fairseq: bool = False
     pretraining: bool = False
     max_seq_len: Optional[int] = 256
     max_vocab: int = 95000
     min_count: int = 0
     language_columns: List[str] = ["language"]
     lang2id: Dict[str, int] = DEFAULT_LANG2ID_DICT
     reset_positions: bool = False
     has_language_in_data: bool = False
     use_language_embeddings: bool = True
Example #4
 class Config(Tensorizer.Config):
     #: The name of the text column to parse from the data source.
     column: str = "text"
     #: The tokenizer to use to split input text into tokens.
     tokenizer: Tokenizer.Config = Tokenizer.Config()
     #: The max token length for input text.
     max_seq_len: Optional[int] = None
     #: The max byte length for a token.
     max_byte_len: int = 15
     #: Offset to add to all non-padding bytes
     offset_for_non_padding: int = 0
     add_bos_token: bool = False
     add_eos_token: bool = False
     use_eos_token_for_bos: bool = False
Example #5
 class Config(BERTTensorizerBase.Config):
     vocab_file: str = "/mnt/vol/nlp_technologies/xlm/vocab_xnli_15"
     tokenizer: Tokenizer.Config = Tokenizer.Config()
     max_vocab: int = 95000
     min_count: int = 0
     # language identifiers for extracting the language from a row of data
     # during numberize
     language_column: str = "language"
     # language-to-id mapping used to obtain language embeddings
     lang2id: Dict[str, int] = LANG2ID_15
     # Controls whether language is being read from the data file (which
     # is what happens for finetuning) or being added during processing
     # (which is what happens during pretraining)
     has_language_in_data: bool = False
     # controls whether we train with language embeddings or not
     use_language_embeddings: bool = True
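The comments above distinguish finetuning (the language code is read from a column in the data file) from pretraining (the language is added during processing). A rough sketch of a finetuning-style override, assuming an XLMTensorizer-like class wraps this Config; the class name and values are illustrative:

 # Sketch only: read the language code per row and keep language embeddings on.
 xlm_tensorizer = XLMTensorizer.from_config(
     XLMTensorizer.Config(
         language_column="language",
         has_language_in_data=True,   # finetuning: language comes from the data file
         use_language_embeddings=True,
     )
 )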
Example #6
 class Config(Tensorizer.Config):
     #: The name of the text column to parse from the data source.
     column: str = "text"
     #: The tokenizer to use to split input text into tokens.
     tokenizer: Tokenizer.Config = Tokenizer.Config()
     add_bos_token: bool = False
     add_eos_token: bool = False
     use_eos_token_for_bos: bool = False
     max_seq_len: Optional[int] = None
     #: If False, will not create token vocab during initialization. The vocab will
     #: need to be set during model initialization (e.g. see WordEmbedding)
     build_vocab: bool = True
     vocab_file: str = ""
     #: The number of lines in the above provided vocab_file to add to the
     #: overall vocab
     vocab_file_size_limit: int = 0
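To seed the token vocabulary from an existing file in addition to the training data, the relevant fields are vocab_file and vocab_file_size_limit. A sketch under the assumption of a TokenTensorizer-like class; the path, length cap, and limit are placeholders:

 # Sketch: add at most the first 50,000 lines of an existing vocab file
 # to the vocab built during initialization.
 token_tensorizer = TokenTensorizer.from_config(
     TokenTensorizer.Config(
         column="text",
         max_seq_len=128,            # placeholder cap on sequence length
         vocab_file="vocab.txt",     # placeholder path
         vocab_file_size_limit=50000,
     )
 )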
Example #7
 def setUp(self):
     self.data_source = SquadDataSource.from_config(
         SquadDataSource.Config(
             train_filename=tests_module.test_file("squad_tiny.json"),
             eval_filename=None,
             test_filename=None,
         )
     )
     self.tensorizer_with_wordpiece = SquadTensorizer.from_config(
         SquadTensorizer.Config(
             tokenizer=WordPieceTokenizer.Config(
                 wordpiece_vocab_path="pytext/data/test/data/wordpiece_1k.txt"
             ),
             max_seq_len=250,
         )
     )
     self.tensorizer_with_alphanumeric = SquadTensorizer.from_config(
         SquadTensorizer.Config(
             tokenizer=Tokenizer.Config(split_regex=r"\W+"),
             max_seq_len=250,
         )
     )
Example #8
 class Config(Tensorizer.Config):
     text_column: str = "text"
     dict_column: str = "dict"
     #: tokenizer to split text and create dict tensors of the same size.
     tokenizer: Tokenizer.Config = Tokenizer.Config()
Example #9
 class Config(Tensorizer.Config):
     # BERT style models support multiple text inputs
     columns: List[str] = ["text"]
     tokenizer: Tokenizer.Config = Tokenizer.Config()
     vocab_file: str = ""
     max_seq_len: int = 256
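Because columns is a list, a BERT-style tensorizer can take more than one text input, for example in a sentence-pair task. A sketch assuming a BERTTensorizer-like class owns this Config; the column names are placeholders and the wordpiece vocab path is reused from Example #7:

 # Sketch: two text columns fed through one BERT-style tensorizer.
 bert_tensorizer = BERTTensorizer.from_config(
     BERTTensorizer.Config(
         columns=["question", "context"],   # placeholder column names
         tokenizer=WordPieceTokenizer.Config(
             wordpiece_vocab_path="pytext/data/test/data/wordpiece_1k.txt"
         ),
         max_seq_len=256,
     )
 )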