def test_word_tokenizer():
    tokenizer = WordTokenizer()

    dummy = "justo. Praesent luctus."
    assert tokenizer(dummy) == ["justo.", "Praesent", "luctus."]

    dummy = ""
    assert tokenizer(dummy) == []
def __init__(self,  # nosec
             tokenizer: Optional[Tokenizer] = None,
             lower: bool = False,
             pad_token: Optional[str] = '<pad>',
             unk_token: Optional[str] = '<unk>',
             sos_token: Optional[str] = None,
             eos_token: Optional[str] = None,
             embeddings: Optional[str] = None,
             embeddings_format: str = 'glove',
             embeddings_binary: bool = False,
             unk_init_all: bool = False,
             drop_unknown: bool = False) -> None:
    """Initialize the TextField.

    Parameters
    ----------
    tokenizer : Tokenizer, optional
        Tokenizer to use, by default WordTokenizer()
    lower : bool, optional
        If given, lowercase the input, by default False
    pad_token : str, optional
        Reserved padding token. Note that this object does not perform
        padding. Padding is done on the fly, when sampling.
        (defaults to '<pad>')
    unk_token : str, optional
        The token to use for out of vocabulary tokens
        (defaults to '<unk>')
    sos_token : str, optional
        Start of sentence token to add to the start of each sequence
        (defaults to None)
    eos_token : str, optional
        End of sentence token to add to the end of each sequence
        (defaults to None)
    embeddings : str, optional
        Path to pretrained embeddings, by default None
    embeddings_format : str, optional
        The format of the input embeddings, should be one of:
        'glove', 'word2vec', 'fasttext' or 'gensim'. The latter can
        be used to download embeddings hosted on gensim on the fly.
        See https://github.com/RaRe-Technologies/gensim-data
        for the list of available embedding aliases.
    embeddings_binary : bool, optional
        Whether the input embeddings are provided in binary format,
        by default False
    unk_init_all : bool, optional
        If True, every token not provided in the input embeddings
        is given a random embedding drawn from a normal distribution.
        Otherwise, all of them map to the '<unk>' token.
    drop_unknown : bool, optional
        Whether to drop tokens that don't have embeddings associated.
        Defaults to False. Important: this flag only works when
        using embeddings.

    """
    self.tokenizer = tokenizer or WordTokenizer()
    self.lower = lower
    self.pad = pad_token
    self.unk = unk_token
    self.sos = sos_token
    self.eos = eos_token

    self.embeddings = embeddings
    self.embeddings_format = embeddings_format
    self.embeddings_binary = embeddings_binary
    self.embedding_matrix: Optional[torch.Tensor] = None
    self.unk_init_all = unk_init_all
    self.drop_unknown = drop_unknown

    self.unk_numericals: Set[int] = set()

    self.vocab: Dict = odict()
    specials = [pad_token, unk_token, sos_token, eos_token]
    self.specials = [special for special in specials if special is not None]

    index = -1
    for token in self.specials:
        self.vocab[token] = index = index + 1

    self.register_attrs('vocab')
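# Hedged usage sketch (not part of the original file): with the version of
# __init__ defined just above, the special tokens are inserted into the
# vocabulary in declaration order, so their indices are 0..n-1. The sos/eos
# values below are chosen for illustration only.
field = TextField(sos_token='<sos>', eos_token='<eos>')
assert field.vocab == {'<pad>': 0, '<unk>': 1, '<sos>': 2, '<eos>': 3}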
def test_ngram_tokenizer_equivalence():
    # `example` is a sample text defined at module level in the test file;
    # unigram n-grams should match plain word tokenization.
    t1 = NGramsTokenizer(1)
    t2 = WordTokenizer()

    assert t1(example) == t2(example)
def __init__(self,  # nosec
             tokenizer: Optional[Tokenizer] = None,
             lower: bool = False,
             pad_token: Optional[str] = '<pad>',
             unk_token: str = '<unk>',
             sos_token: Optional[str] = None,
             eos_token: Optional[str] = None,
             embeddings_info: Optional[EmbeddingsInformation] = None,
             embeddings: Optional[str] = None,
             embeddings_format: str = 'glove',
             embeddings_binary: bool = False,
             unk_init_all: bool = False,
             drop_unknown: bool = False,
             max_seq_len: Optional[int] = None,
             truncate_end: bool = False,
             setup_all_embeddings: bool = False) -> None:
    """Initialize the TextField.

    Parameters
    ----------
    tokenizer : Tokenizer, optional
        Tokenizer to use, by default WordTokenizer()
    lower : bool, optional
        If given, lowercase the input, by default False
    pad_token : str, optional
        Reserved padding token. Note that this object does not perform
        padding. Padding is done on the fly, when sampling.
        (defaults to '<pad>')
    unk_token : str, optional
        The token to use for out of vocabulary tokens
        (defaults to '<unk>')
    sos_token : str, optional
        Start of sentence token to add to the start of each sequence
        (defaults to None)
    eos_token : str, optional
        End of sentence token to add to the end of each sequence
        (defaults to None)
    embeddings_info : EmbeddingsInformation, optional
        The embeddings information. By default None.
    embeddings : str, optional
        WILL BE DEPRECATED SOON. USE THE 'from_embeddings' FACTORY INSTEAD.
        Path to pretrained embeddings, or the embedding name in case
        the format is 'gensim'.
    embeddings_format : str, optional
        WILL BE DEPRECATED SOON. USE THE 'from_embeddings' FACTORY INSTEAD.
        The format of the input embeddings, should be one of:
        'glove', 'word2vec', 'fasttext' or 'gensim'. The latter can
        be used to download embeddings hosted on gensim on the fly.
        See https://github.com/RaRe-Technologies/gensim-data
        for the list of available embedding aliases.
    embeddings_binary : bool, optional
        WILL BE DEPRECATED SOON. USE THE 'from_embeddings' FACTORY INSTEAD.
        Whether the input embeddings are provided in binary format,
        by default False
    unk_init_all : bool, optional
        If True, every token not provided in the input embeddings
        is given a random embedding drawn from a normal distribution.
        Otherwise, all of them map to the '<unk>' token.
    drop_unknown : bool, optional
        WILL BE DEPRECATED SOON. USE THE 'from_embeddings' FACTORY INSTEAD.
        Whether to drop tokens that don't have embeddings associated.
        Defaults to False. Important: this flag only works when
        using embeddings.
    max_seq_len : int, optional
        The maximum length possibly output by the process func.
        If the length of the input tokens is larger than this number,
        the output is truncated as a post-processing step.
    truncate_end : bool, optional
        Determines the window of observed text in process if the input
        is larger than max_seq_len. If this value is True, the window
        starts from the end of the utterance. Defaults to False.

        Example: max_seq_len=3, input_text="1 2 3 4 5"
        truncate_end=False: output=[1, 2, 3]
        truncate_end=True: output=[3, 4, 5]
    setup_all_embeddings : bool, optional
        WILL BE DEPRECATED SOON. USE THE 'from_embeddings' FACTORY INSTEAD.
        Controls if all words from the optionally provided embeddings
        will be added to the vocabulary and to the embedding matrix.
        Defaults to False.

    """
    if embeddings:
        if embeddings_info:
            raise ValueError(
                "Cannot submit embeddings information and use the embeddings "
                + "parameters simultaneously. Use the 'from_embeddings' factory instead."
            )

        warnings.warn(
            "The embeddings-exclusive parameters "
            + "('embeddings', 'embeddings_format', 'embeddings_binary', "
            + "'setup_all_embeddings', 'drop_unknown', 'unk_init_all') "
            + "will be deprecated in a future release. "
            + "Please migrate to use the 'from_embeddings' factory.")

        embeddings_info = EmbeddingsInformation(
            embeddings=embeddings,
            embeddings_format=embeddings_format,
            embeddings_binary=embeddings_binary,
            build_vocab_from_embeddings=setup_all_embeddings,
            unk_init_all=unk_init_all,
            drop_unknown=drop_unknown)

    self.tokenizer = tokenizer or WordTokenizer()
    self.lower = lower
    self.pad = pad_token
    self.unk = unk_token
    self.sos = sos_token
    self.eos = eos_token

    self.embeddings_info = embeddings_info

    self.embedding_matrix: Optional[torch.Tensor] = None
    self.max_seq_len = max_seq_len
    self.truncate_end = truncate_end

    self.unk_numericals: Set[int] = set()

    self.vocab: Dict = odict()
    specials = [pad_token, unk_token, sos_token, eos_token]
    self.specials = [special for special in specials if special is not None]

    self.register_attrs('vocab')
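# Hedged usage sketch (not part of the original file): constructing the field
# through the deprecated keyword path above versus the 'from_embeddings' factory
# that the warning recommends. The factory's exact signature is an assumption
# here; the gensim alias is only an example.
legacy_field = TextField(embeddings='glove-wiki-gigaword-100',
                         embeddings_format='gensim')  # emits the deprecation warning
# Preferred (signature assumed, mirroring the keyword names above):
# field = TextField.from_embeddings(embeddings='glove-wiki-gigaword-100',
#                                   embeddings_format='gensim')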
def __init__(self,  # nosec
             tokenizer: Optional[Tokenizer] = None,
             lower: bool = False,
             pad_token: Optional[str] = '<pad>',
             unk_token: str = '<unk>',
             sos_token: Optional[str] = None,
             eos_token: Optional[str] = None,
             embeddings: Optional[str] = None,
             embeddings_format: str = 'glove',
             embeddings_binary: bool = False,
             model: Optional[KeyedVectors] = None,
             unk_init_all: bool = False,
             drop_unknown: bool = False,
             max_seq_len: Optional[int] = None,
             truncate_end: bool = False,
             setup_all_embeddings: bool = False) -> None:
    """Initialize the TextField.

    Parameters
    ----------
    tokenizer : Tokenizer, optional
        Tokenizer to use, by default WordTokenizer()
    lower : bool, optional
        If given, lowercase the input, by default False
    pad_token : str, optional
        Reserved padding token. Note that this object does not perform
        padding. Padding is done on the fly, when sampling.
        (defaults to '<pad>')
    unk_token : str, optional
        The token to use for out of vocabulary tokens
        (defaults to '<unk>')
    sos_token : str, optional
        Start of sentence token to add to the start of each sequence
        (defaults to None)
    eos_token : str, optional
        End of sentence token to add to the end of each sequence
        (defaults to None)
    embeddings : str, optional
        Path to pretrained embeddings, or the embedding name in case
        the format is 'gensim'. By default None.
    embeddings_format : str, optional
        The format of the input embeddings, should be one of:
        'glove', 'word2vec', 'fasttext' or 'gensim'.
    embeddings_binary : bool, optional
        Whether the input embeddings are provided in binary format,
        by default False
    model : KeyedVectors, optional
        The embeddings model used for retrieving text embeddings,
        by default None
    unk_init_all : bool, optional
        If True, every token not provided in the input embeddings
        is given a random embedding drawn from a normal distribution.
        Otherwise, all of them map to the '<unk>' token.
    drop_unknown : bool, optional
        Whether to drop tokens that don't have embeddings associated.
        Defaults to False. Important: this flag only works when
        using embeddings.
    max_seq_len : int, optional
        The maximum length possibly output by the process func.
        If the length of the input tokens is larger than this number,
        the output is truncated as a post-processing step.
    truncate_end : bool, optional
        Determines the window of observed text in process if the input
        is larger than max_seq_len. If this value is True, the window
        starts from the end of the utterance. Defaults to False.

        Example: max_seq_len=3, input_text="1 2 3 4 5"
        truncate_end=False: output=[1, 2, 3]
        truncate_end=True: output=[3, 4, 5]
    setup_all_embeddings : bool, optional
        Controls if all words from the optionally provided embeddings
        will be added to the vocabulary and to the embedding matrix.
        Defaults to False.

    """
    if embeddings:
        if model:
            raise ValueError(
                "Cannot submit a model and use the embeddings "
                + "parameters simultaneously. Use the 'from_embeddings' factory instead."
            )

        warnings.warn(
            "The embeddings-exclusive parameters "
            + "('embeddings', 'embeddings_format', 'embeddings_binary', "
            + "'setup_all_embeddings', 'drop_unknown', 'unk_init_all') will be "
            + "deprecated in a future release. "
            + "Please migrate to use the 'from_embeddings' factory.")

        model = get_embeddings(embeddings, embeddings_format, embeddings_binary)

    if setup_all_embeddings and not model:
        raise ValueError(
            "'setup_all_embeddings' cannot be enabled without passing embeddings."
        )

    self.tokenizer = tokenizer or WordTokenizer()
    self.lower = lower
    self.pad = pad_token
    self.unk = unk_token
    self.sos = sos_token
    self.eos = eos_token

    self.model = model
    self.embedding_matrix: Optional[torch.Tensor] = None
    self.unk_init_all = unk_init_all
    self.drop_unknown = drop_unknown
    self.setup_all_embeddings = setup_all_embeddings
    self.max_seq_len = max_seq_len
    self.truncate_end = truncate_end

    self.unk_numericals: Set[int] = set()

    self.vocab: Dict = odict()
    specials = [pad_token, unk_token, sos_token, eos_token]
    self.specials = [special for special in specials if special is not None]

    self.register_attrs('vocab')
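# Hedged sketch (not part of the original file): the max_seq_len/truncate_end
# window selection described in the docstring above, shown on plain Python
# lists rather than through the field's process step.
tokens = ['1', '2', '3', '4', '5']
max_seq_len = 3
head = tokens[:max_seq_len]    # truncate_end=False -> keep the start of the utterance
tail = tokens[-max_seq_len:]   # truncate_end=True  -> keep the end of the utterance
assert head == ['1', '2', '3'] and tail == ['3', '4', '5']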