Example #1
def test_ngram_tokenizer_equivalence_2():
    t = NGramsTokenizer([1, 2, 3])

    ret = []
    for i in [1, 2, 3]:
        ret.extend(NGramsTokenizer(i)(example))

    assert t(example) == ret
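The equivalence asserted here is that passing a list of n-gram sizes yields each size's output concatenated in order. A minimal self-contained sketch of that behaviour, using illustrative helpers rather than the library's own code:

def ngrams(tokens, n):
    # Slide a window of length n over the token list, joining each window with spaces.
    return [" ".join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

def ngrams_multi(tokens, sizes):
    # A list of sizes simply concatenates the per-size outputs, in order.
    out = []
    for n in sizes:
        out.extend(ngrams(tokens, n))
    return out

tokens = ["justo", ".", "Praesent", "luctus", "."]
assert ngrams_multi(tokens, [1, 2, 3]) == (
    ngrams(tokens, 1) + ngrams(tokens, 2) + ngrams(tokens, 3)
)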
Example #2
def test_ngram_tokenizer_stopwords():
    tokenizer = NGramsTokenizer(2, exclude_stopwords=True)

    dummy = "justo. Praesent the luctus."
    assert tokenizer(dummy) == ['justo .', '. Praesent', 'Praesent luctus', 'luctus .']

    tokenizer = NGramsTokenizer(1, exclude_stopwords=True)

    dummy = "justo. Praesent the luctus."
    assert tokenizer(dummy) == ['justo', '.', 'Praesent', 'luctus', '.']

    tokenizer = NGramsTokenizer(2, exclude_stopwords=True, stop_words=["Praesent", "the"])

    dummy = "justo. Praesent the luctus."
    assert tokenizer(dummy) == ['justo .', '. luctus', 'luctus .']
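The expected outputs above are consistent with tokenizing into words and punctuation, dropping stop words, and only then forming space-joined n-grams. A self-contained sketch of that order of operations, inferred from the assertions rather than taken from the library's source:

import re

def word_tokens(text):
    # Split into words and individual punctuation marks, as in the outputs above.
    return re.findall(r"\w+|[^\w\s]", text)

def ngrams_without_stopwords(text, n, stop_words):
    # Filter stop words first, then build space-joined n-grams over what remains.
    tokens = [t for t in word_tokens(text) if t not in stop_words]
    return [" ".join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

dummy = "justo. Praesent the luctus."
assert ngrams_without_stopwords(dummy, 2, {"Praesent", "the"}) == [
    'justo .', '. luctus', 'luctus .'
]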
Example #3
def test_ngram_tokenizer():
    tokenizer = NGramsTokenizer(2)

    dummy = "justo. Praesent luctus."
    assert tokenizer(dummy) == ["justo. Praesent", "Praesent luctus."]
    dummy = ""
    assert tokenizer(dummy) == []
Example #4
def test_ngram_tokenizer():
    tokenizer = NGramsTokenizer(2)

    dummy = "justo. Praesent luctus."
    assert tokenizer(dummy) == ['justo .', '. Praesent', 'Praesent luctus', 'luctus .']
    dummy = ""
    assert tokenizer(dummy) == []
Example #5
    def __init__(self,  # nosec
                 tokenizer: Optional[Tokenizer] = None,
                 lower: bool = False,
                 unk_token: str = '<unk>',
                 min_freq: int = 5,
                 normalize: bool = False,
                 scale_factor: Optional[float] = None) -> None:
        """Initialize the BoW object.

        Parameters
        ----------
        tokenizer : Tokenizer, optional
            Tokenizer to use (defaults to NGramsTokenizer())
        lower : bool, optional
            Whether to lowercase the input (defaults to False)
        unk_token : str, optional
            The token to use for out-of-vocabulary tokens
            (defaults to '<unk>')
        min_freq : int, optional
            Minimum frequency for a token to be included in the
            vocabulary (defaults to 5)
        normalize : bool, optional
            Whether to normalize the bag of words using the L1 norm
            (defaults to False)
        scale_factor : float, optional
            Factor by which to scale the normalized feature values.
            Only available when normalize is True (defaults to 1.0)

        """
        self.tokenizer = tokenizer or NGramsTokenizer()
        self.lower = lower
        self.unk = unk_token

        self.min_freq = min_freq
        self.normalize = normalize
        self.scale_factor = scale_factor

        self.vocab: Dict[str, int] = odict()
        self.vocab[unk_token] = 0
        self.full_vocab: Dict[str, int] = {}

        if scale_factor and not normalize:
            raise ValueError("Cannot specify scale_factor without normalizing")

        self.register_attrs('vocab', 'full_vocab')
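A short instantiation sketch based only on the constructor above; the failing case follows from the scale_factor guard at the end of __init__ (how BoW is imported depends on the surrounding package and is assumed here):

# Scaling is only meaningful together with L1 normalization.
bow = BoW(lower=True, min_freq=2, normalize=True, scale_factor=10.0)

# Passing scale_factor without normalize=True raises at construction time.
try:
    BoW(scale_factor=10.0)
except ValueError as err:
    print(err)  # Cannot specify scale_factor without normalizing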
Example #6
def test_ngram_tokenizer_equivalence():
    t1 = NGramsTokenizer(1)
    t2 = WordTokenizer()

    assert t1(example) == t2(example)
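This last equivalence reduces to the fact that 1-grams over word tokens are the word tokens themselves; a minimal illustration of why NGramsTokenizer(1) and plain word tokenization should agree:

tokens = ["justo", ".", "Praesent", "luctus", "."]

# A window of length 1 returns each token unchanged, so 1-gram output
# coincides with word tokenization of the same text.
unigrams = [" ".join(tokens[i:i + 1]) for i in range(len(tokens))]
assert unigrams == tokens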