Example #1
def test_prepare_tokenizer_profanity():
    """It should accept the profanity censorship."""
    songs = ["ok ok ok **** go go"]
    tokenizer = util.prepare_tokenizer(songs)
    assert len(tokenizer.word_index) == 3
    assert tokenizer.word_index == {"ok": 1, "go": 2, "****": 3}

    sentences = tokenizer.texts_to_sequences(songs)
    assert sentences[0] == [1, 1, 1, 3, 2, 2]
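
# A minimal sketch of what util.prepare_tokenizer might look like, inferred only
# from the assertions in these tests (hypothetical -- the real helper is not shown
# here, and the TF 2.x-style import path is an assumption). Two things are implied:
# the default Keras filters must be disabled, otherwise "\n" and "****" would be
# stripped, and num_words must be offset by one, since Keras keeps only indices
# strictly below num_words and reserves index 0.
from tensorflow.keras.preprocessing.text import Tokenizer


def prepare_tokenizer(songs, num_words=None, char_level=False):
    tokenizer = Tokenizer(
        num_words=num_words + 1 if num_words else None,  # index 0 is reserved
        filters="",  # keep "\n" and censored words like "****"
        lower=True,
        char_level=char_level,
    )
    tokenizer.fit_on_texts(songs)
    return tokenizer
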
def test_prepare_tokenizer(songs):
    """It should tokenize newlines and include all words."""
    tokenizer = util.prepare_tokenizer(songs)
    assert len(tokenizer.word_index) == 4
    assert tokenizer.word_index == {"\n": 1, "woof": 2, "meow": 3, "chorus": 4}

    sentences = tokenizer.texts_to_sequences(songs)

    # The songs fixture has been carefully crafted, didn't you notice? :-)
    # 0 is reserved, 1 is newline, 2 is woof, 3 is meow, 4 is chorus
    assert sentences[0] == [3, 1, 3]
    assert sentences[1] == [2, 1, 1, 4, 1, 2, 2]
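
# A songs fixture consistent with the assertions above (a hypothetical
# reconstruction; the real fixture lives elsewhere in the test suite). Newlines
# are surrounded by spaces so that "\n" is tokenized as a word of its own.
import pytest


@pytest.fixture
def songs():
    return [
        "meow \n meow",
        "woof \n \n chorus \n woof woof",
    ]
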
def test_prepare_tokenizer_limit_words(songs):
    """It should tokenize newlines."""
    tokenizer = util.prepare_tokenizer(songs, num_words=2)

    # Interestingly, Keras keeps track of all words in word_index; the
    # num_words limit only kicks in when turning texts into sequences
    assert len(tokenizer.word_index) == 4

    sentences = tokenizer.texts_to_sequences(songs)

    # 0 is reserved, 1 is newline, 2 is woof; words above the limit are dropped from the sequences
    assert sentences[0] == [1]
    assert sentences[1] == [2, 1, 1, 1, 2, 2]
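
# For comparison (not part of the original suite, Tokenizer imported above): a
# plain Keras Tokenizer keeps only words whose index is strictly below
# num_words, so num_words=2 would keep nothing but "\n" (index 1) here.
# prepare_tokenizer presumably offsets num_words to account for the reserved
# index 0, as in the sketch above.
def test_raw_keras_num_words_behavior():
    """Plain Keras keeps only indices strictly below num_words."""
    raw = Tokenizer(num_words=2, filters="", lower=True)
    raw.fit_on_texts(["meow \n meow", "woof \n \n chorus \n woof woof"])
    assert raw.texts_to_sequences(["woof \n meow"]) == [[1]]  # only "\n" survives
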
def test_create_embedding_matrix(songs, embedding_mapping):
    """It should create a dictionary of embedding mappings."""
    num_words = 2
    tokenizer = util.prepare_tokenizer(songs, num_words=num_words)

    embedding_matrix = embedding.create_embedding_matrix(
        tokenizer,
        embedding_mapping,
        max_num_words=num_words,
        embedding_dim=3)

    # Only woof has an entry in embedding_mapping
    np.testing.assert_array_equal(embedding_matrix, [
        [0, 0, 0], # OOV
        [0, 0, 0], # \n
        [0.1, 0.2, 0.3] # woof
        #[0, 0, 0], # meow, absent, because we only choose 2 words
        #[0, 0, 0], # chorus, absent, same reason
    ])
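
# A minimal sketch of embedding.create_embedding_matrix consistent with the
# assertion above (hypothetical -- the real implementation may differ). It
# assumes the embedding_mapping fixture is a dict such as
# {"woof": [0.1, 0.2, 0.3]}: row 0 stays zero for the reserved index, and only
# words that fit under max_num_words and have a known vector get a non-zero row.
import numpy as np


def create_embedding_matrix(tokenizer, embedding_mapping, max_num_words, embedding_dim):
    matrix = np.zeros((max_num_words + 1, embedding_dim))
    for word, index in tokenizer.word_index.items():
        if index > max_num_words:
            continue  # word falls outside the num_words limit
        vector = embedding_mapping.get(word)
        if vector is not None:
            matrix[index] = vector
    return matrix
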
Example #5
def test_prepare_tokenizer_char_level(songs):
    """It should tokenize at character level."""
    tokenizer = util.prepare_tokenizer(songs, char_level=True)
    # 12 characters = ['\n', ' ', 'c', 'e', 'f', 'h', 'm', 'o', 'r', 's', 'u', 'w']
    assert len(tokenizer.word_index) == 12
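
# For comparison (not part of the original suite): with char_level=True Keras
# counts every character and applies no filters or splitting, which is why the
# space and the newline show up in word_index. Using the songs reconstructed
# above, a plain Tokenizer yields the same twelve entries.
from tensorflow.keras.preprocessing.text import Tokenizer


def test_raw_keras_char_level():
    """Plain Keras char-level tokenization also yields 12 characters."""
    raw = Tokenizer(char_level=True, lower=True)
    raw.fit_on_texts(["meow \n meow", "woof \n \n chorus \n woof woof"])
    assert sorted(raw.word_index) == [
        "\n", " ", "c", "e", "f", "h", "m", "o", "r", "s", "u", "w"]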