def test_char_2d_valid_length(vocab):
    mxlen, mxwlen = np.random.randint(3, 15, size=2)
    my_len = np.random.randint(1, mxlen)
    input_ = ['a'] * my_len
    vect = Char2DVectorizer(mxlen=mxlen, mxwlen=mxwlen)
    _, lens = vect.run(input_, vocab)
    assert lens == my_len

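# The `vocab` fixture these tests consume is presumably defined in the test
# module's conftest.py, which is not shown here. A hypothetical minimal
# reconstruction (the special tokens and id layout are assumptions, not the
# real fixture) that keeps 0 free for padding:
import string

import numpy as np
import pytest


@pytest.fixture
def vocab():
    # Hypothetical: reserve low ids for special tokens, then map each
    # lowercase character to a distinct nonzero id
    v = {'<PAD>': 0, '<UNK>': 1, '<EOW>': 2}
    for i, ch in enumerate(string.ascii_lowercase, len(v)):
        v[ch] = i
    return v
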
def test_char_2d_cuts_off_mxwlen(vocab):
    mxlen = 2
    mxwlen = 4
    input_ = ['aaaabbbb', 'cccc']
    # Each word is truncated to its first mxwlen characters
    gold = np.array([[vocab['a']] * mxwlen, [vocab['c']] * mxwlen], dtype=int)
    vect = Char2DVectorizer(mxlen=mxlen, mxwlen=mxwlen)
    res, _ = vect.run(input_, vocab)
    np.testing.assert_equal(res, gold)

def test_char_2d_run_values(vocab):
    mxlen, mxwlen = np.random.randint(3, 15, size=2)
    input_ = [chr(i + 97) * mxwlen for i in range(mxlen)]
    vect = Char2DVectorizer(mxlen=mxlen, mxwlen=mxwlen)
    res, _ = vect.run(input_, vocab)
    for i, word in enumerate(input_):
        for j, char in enumerate(word):
            assert res[i, j] == vocab[char]

def __init__(self, nctx, chars_per_word):
    # y: lowercased word-level ids; x: per-word character ids
    y_vectorizer = Token1DVectorizer(transform_fn=baseline.lowercase)
    x_vectorizer = Char2DVectorizer(mxwlen=chars_per_word)
    super(TensorCharDatasetReader, self).__init__(nctx, {'x': x_vectorizer, 'y': y_vectorizer})
    self.chars_per_word = chars_per_word

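# Design note (inferred from the vectorizers above, not stated in the source):
# the same nctx-token window is vectorized two ways, x as character ids with
# shape (nctx, chars_per_word) and y as lowercased word ids with shape
# (nctx,), e.g. for a char-input, word-target language model setup.
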
def test_char_2d_cuts_off_mxlen(vocab):
    mxlen = 2
    mxwlen = 4
    input_ = ['a', 'b', 'c']
    vect = Char2DVectorizer(mxlen=mxlen, mxwlen=mxwlen)
    res, _ = vect.run(input_, vocab)
    assert res.shape[0] == mxlen
    for i, char in enumerate(input_[:mxlen]):
        assert res[i, 0] == vocab[char]
    # Tokens past mxlen are dropped entirely: their ids never appear in res
    values = set(res.flatten().tolist())
    for char in input_[mxlen:]:
        assert vocab[char] not in values

def test_char_2d_shapes(vocab):
    mxlen, mxwlen = np.random.randint(1, 100, size=2)
    gold_shape = (mxlen, mxwlen)
    vect = Char2DVectorizer(mxlen=mxlen, mxwlen=mxwlen)
    res, _ = vect.run([''], vocab)
    assert res.shape == gold_shape
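
# A quick usage sketch consistent with the tests above. The import path is
# assumed to be mead-baseline's `baseline.vectorizers`, and the toy vocab
# mirrors the hypothetical fixture sketched earlier:
from baseline.vectorizers import Char2DVectorizer

toy_vocab = {'<PAD>': 0, '<UNK>': 1, '<EOW>': 2, 'a': 3, 'b': 4}
vect = Char2DVectorizer(mxlen=3, mxwlen=5)
res, length = vect.run(['ab', 'a'], toy_vocab)
print(res.shape)  # (3, 5): always (mxlen, mxwlen), zero-padded
print(length)     # 2: number of tokens actually consumed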