def test_prepare_data_use_full_sentences():
    """Check ``prepare_data`` when every input must be a full-length sequence."""
    x, y, seq_length, num_words, tokenizer = train.prepare_data(
        [longer_song], transform_words=True, use_full_sentences=True
    )

    # Sequence length and vocabulary size match the transform-words test.
    assert seq_length == 23
    assert num_words == 26

    word_index = tokenizer.word_index
    hello_id = word_index["hello"]
    world_id = word_index["world"]
    cant_id = word_index["can't"]
    me_id = word_index["me"]

    # With full sentences the first input holds no zero entries.
    assert np.all(x[0])

    # "can't" is the 24th word, so with a sequence length of 23 it is the
    # very first target.
    assert x[0][0] == hello_id
    assert x[0][1] == world_id
    assert y[0] == cant_id

    # The second input starts with "world".
    assert x[1][0] == world_id

    # "me" closes the song, so it is the target of the final sequence.
    assert y[-1] == me_id
def test_prepare_data_transform_words():
    """Check ``prepare_data`` with word transformation switched on."""
    # NOTE(review): another test with this exact name appears later in the
    # visible source. If both live in the same module, the later definition
    # replaces this one and this body never runs -- confirm, then rename or
    # remove one of them.
    x, y, seq_length, num_words, tokenizer = train.prepare_data(
        [longer_song], transform_words=True
    )

    # Mean and median line length are both five words, so the expected
    # sequence length is 23 (4 sentences + 3 newline tokens).
    assert seq_length == 23

    # 25 distinct words plus the newline token.
    assert num_words == 26

    vocabulary = tokenizer.word_index

    # The newline token is part of the vocabulary.
    assert "\n" in vocabulary

    # Word transformation replaces "cannot" with "can't".
    assert "cannot" not in vocabulary
    assert "Cannot" not in vocabulary
    assert "can't" in vocabulary

    # The first input ends with "hello" and its target is "world".
    hello_id = vocabulary["hello"]
    world_id = vocabulary["world"]
    assert x[0][-1] == hello_id
    assert y[0] == world_id
def test_prepare_data_transform_words():
    """Check ``prepare_data`` with word transformation switched on."""
    # NOTE(review): an earlier test in the visible source uses this same
    # name; if both share a module, only this later definition is collected.
    x, y, seq_length, num_words, tokenizer = train.prepare_data(
        [longer_song], transform_words=True
    )

    # Mean and median line length are both five words, so the expected
    # sequence length is 23 (4 sentences + 3 newline tokens).
    # The default max repeats of 2 removes the last sentence.
    assert seq_length == 23

    # 25 distinct words plus the newline token.
    assert num_words == 26

    vocabulary = tokenizer.word_index

    # The newline token is part of the vocabulary.
    assert "\n" in vocabulary

    # Word transformation replaces "cannot" with "can't".
    assert "cannot" not in vocabulary
    assert "Cannot" not in vocabulary
    assert "can't" in vocabulary

    # The first input starts with "hello" and its target is "world".
    hello_id = vocabulary["hello"]
    world_id = vocabulary["world"]
    assert x[0][0] == hello_id
    assert y[0] == world_id

    # The final input ends with "i can't run \n i got me \n i got": the last
    # "me" is the target, and the repeated sentence has been dropped.
    last_input_text = tokenizer.sequences_to_texts(x[-1:])[0]
    assert last_input_text.endswith("i can't run \n i got me \n i got")
def test_prepare_data_num_lines():
    """Check that ``num_lines_to_include`` sets the derived sequence length."""
    # Only the sequence length is inspected; the other results are unpacked
    # solely to keep the five-value return shape checked.
    _x, _y, seq_length, _num_words, _tokenizer = train.prepare_data(
        [longer_song], num_lines_to_include=5
    )

    # Mean and median line length are both five words, so five lines give a
    # sequence length of 29 (5 sentences + 4 newline tokens).
    assert seq_length == 29
def test_prepare_data_char_level():
    """Check ``prepare_data`` when tokenizing at character level."""
    x, y, _seq_length, num_words, tokenizer = train.prepare_data(
        [longer_song], char_level=True
    )

    # A character vocabulary is slightly smaller than the word vocabulary
    # checked by the word-level tests.
    assert num_words == 23

    char_index = tokenizer.word_index
    h_id = char_index["h"]
    e_id = char_index["e"]

    # The first input starts with "h" and its target is "e".
    assert x[0][0] == h_id
    assert y[0] == e_id

    # "me" is the last word, so its final character "e" is the last target.
    assert y[-1] == e_id
def test_prepare_data_use_strings():
    """Check that ``use_strings=True`` returns the inputs as sentence strings."""
    songs = [longer_song]
    x, y, seq_length, num_words, tokenizer = train.prepare_data(
        songs, transform_words=True, use_full_sentences=True, use_strings=True
    )

    # Sequence length and vocabulary size match the full-sentences test.
    assert seq_length == 23
    assert num_words == 26

    # "can't" is the 24th word, so with a sequence length of 23 it is the
    # first target. Targets are compared as token ids.
    cant = tokenizer.word_index["can't"]
    assert y[0] == cant

    # Inputs must be strings. isinstance() is the idiomatic type check and,
    # unlike an exact type() comparison, also accepts str subclasses.
    assert isinstance(x[0], str)
    assert (
        x[0]
        == "hello world is a dream \n i know when i been like your love \n and i can't go home \n \n i"
    )