def test_ngram_tokenizer_equivalence_2():
    t = NGramsTokenizer([1, 2, 3])

    ret = []
    for i in [1, 2, 3]:
        ret.extend(NGramsTokenizer(i)(example))

    assert t(example) == ret
def test_ngram_tokenizer_stopwords():
    tokenizer = NGramsTokenizer(2, exclude_stopwords=True)
    dummy = "justo. Praesent the luctus."
    assert tokenizer(dummy) == ['justo .', '. Praesent', 'Praesent luctus', 'luctus .']

    tokenizer = NGramsTokenizer(1, exclude_stopwords=True)
    dummy = "justo. Praesent the luctus."
    assert tokenizer(dummy) == ['justo', '.', 'Praesent', 'luctus', '.']

    tokenizer = NGramsTokenizer(2, exclude_stopwords=True, stop_words=["Praesent", "the"])
    dummy = "justo. Praesent the luctus."
    assert tokenizer(dummy) == ['justo .', '. luctus', 'luctus .']
def test_ngram_tokenizer():
    tokenizer = NGramsTokenizer(2)
    dummy = "justo. Praesent luctus."
    assert tokenizer(dummy) == ['justo .', '. Praesent', 'Praesent luctus', 'luctus .']

    dummy = ""
    assert tokenizer(dummy) == []
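# Illustrative sketch only (not the library's actual implementation): a
# minimal n-gram tokenizer that satisfies the assertions in the tests above.
# The class name SimpleNGramsTokenizer, the regex-based word splitting, and
# the placeholder stop-word list are all assumptions made for this example.
import re
from itertools import chain
from typing import List, Optional, Union


class SimpleNGramsTokenizer:
    """Split punctuation off words, then emit space-joined n-grams."""

    def __init__(self, ngrams: Union[int, List[int]] = 1,
                 exclude_stopwords: bool = False,
                 stop_words: Optional[List[str]] = None) -> None:
        self.ngrams = ngrams
        self.exclude_stopwords = exclude_stopwords
        # Placeholder list; a real tokenizer would use a full stop-word set.
        self.stop_words = stop_words if stop_words is not None else ["the"]

    def _words(self, text: str) -> List[str]:
        # "justo." -> ["justo", "."]: words and punctuation become separate tokens.
        tokens = re.findall(r"\w+|[^\w\s]", text)
        if self.exclude_stopwords:
            tokens = [t for t in tokens if t not in self.stop_words]
        return tokens

    def __call__(self, text: str) -> List[str]:
        # Accept a single n-gram size or a list of sizes, concatenating the
        # n-grams in order, which is what the equivalence tests rely on.
        words = self._words(text)
        sizes = self.ngrams if isinstance(self.ngrams, list) else [self.ngrams]
        return list(chain.from_iterable(
            [' '.join(words[i:i + n]) for i in range(len(words) - n + 1)]
            for n in sizes))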
def __init__(self,  # nosec
             tokenizer: Optional[Tokenizer] = None,
             lower: bool = False,
             unk_token: str = '<unk>',
             min_freq: int = 5,
             normalize: bool = False,
             scale_factor: float = None) -> None:
    """Initialize the BoW object.

    Parameters
    ----------
    tokenizer : Tokenizer, optional
        Tokenizer to use, by default NGramsTokenizer()
    lower : bool, optional
        If given, lowercase the input, by default False
    unk_token : str, optional
        The token to use for out of vocabulary tokens (defaults to '<unk>')
    min_freq : int, optional
        Minimum frequency to include a token in the vocabulary (defaults to 5)
    normalize : bool, optional
        Whether to normalize the bag of words using the L1 norm (defaults to False)
    scale_factor : float, optional
        Factor to scale the resulting normalized feature value. Only
        available when normalize is True (defaults to 1.0)

    """
    self.tokenizer = tokenizer or NGramsTokenizer()
    self.lower = lower
    self.unk = unk_token
    self.min_freq = min_freq
    self.normalize = normalize
    self.scale_factor = scale_factor

    self.vocab: Dict[str, int] = odict()
    self.vocab[unk_token] = 0
    self.full_vocab: Dict[str, int] = {}

    if scale_factor and not normalize:
        raise ValueError("Cannot specify scale_factor without normalizing")

    self.register_attrs('vocab', 'full_vocab')
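# Construction sketch: only behavior visible in the __init__ above is
# exercised. The class name BoW follows the docstring above; how the
# vocabulary is later filled from data is not shown here and is not assumed.
bow = BoW(lower=True, min_freq=2)
assert dict(bow.vocab) == {'<unk>': 0}   # vocab starts with the unk token at index 0

try:
    BoW(scale_factor=10.0)               # scale_factor requires normalize=True
except ValueError:
    pass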
def test_ngram_tokenizer_equivalence():
    t1 = NGramsTokenizer(1)
    t2 = WordTokenizer()

    assert t1(example) == t2(example)
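# Follow-up to the SimpleNGramsTokenizer sketch above: with n=1 the n-grams
# are exactly the individual word tokens, which is the property the
# WordTokenizer equivalence test relies on. The sample text is hypothetical.
sample = "justo. Praesent luctus."
assert SimpleNGramsTokenizer(1)(sample) == ['justo', '.', 'Praesent', 'luctus', '.']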