from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelBinarizer

def tokenizing_and_vocabulary(train_posts, test_posts, train_tags, test_tags):
    # 20 Newsgroups: 20 classes
    num_labels = 20
    vocab_size = 15000
    batch_size = 100
    # define Tokenizer with vocab size
    tokenizer = Tokenizer(num_words=vocab_size)
    tokenizer.fit_on_texts(train_posts)
    x_train = tokenizer.texts_to_matrix(train_posts, mode='tfidf')
    x_test = tokenizer.texts_to_matrix(test_posts, mode='tfidf')
    # one-hot encode the string labels
    encoder = LabelBinarizer()
    encoder.fit(train_tags)
    y_train = encoder.transform(train_tags)
    y_test = encoder.transform(test_tags)
    return x_train, x_test, y_train, y_test
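For context, here is a minimal sketch of how this function might be driven; the original post does not show the data-loading step, so the use of scikit-learn's fetch_20newsgroups here is my assumption:

from sklearn.datasets import fetch_20newsgroups

# assumption: load 20 Newsgroups via scikit-learn (not shown in the original)
train = fetch_20newsgroups(subset='train')
test = fetch_20newsgroups(subset='test')
train_tags = [train.target_names[i] for i in train.target]
test_tags = [test.target_names[i] for i in test.target]

x_train, x_test, y_train, y_test = tokenizing_and_vocabulary(
    train.data, test.data, train_tags, test_tags)
print(x_train.shape)  # one tf-idf row of width vocab_size per training post, roughly (11314, 15000)
print(y_train.shape)  # one one-hot row of width 20 per training post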
from keras.preprocessing.text import Tokenizer

samples = ['I study at CityU', 'I study at CityU at Seattle']
tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(samples)
sequences = tokenizer.texts_to_sequences(samples)
one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary')
word_index = tokenizer.word_index
print('Found %s unique tokens: ' % len(word_index))
print('Sequences: ', sequences, '\n')
print('word_index: ', tokenizer.word_index)
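For reference, running this snippet should print something close to the following. Keras lowercases tokens by default and orders the index by frequency, breaking ties by order of first appearance, so 'at', which occurs three times, gets index 1 (exact formatting may vary by version):

Found 5 unique tokens:
Sequences:  [[2, 3, 1, 4], [2, 3, 1, 4, 1, 5]]
word_index:  {'at': 1, 'i': 2, 'study': 3, 'cityu': 4, 'seattle': 5}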
import keras.preprocessing.text as T
from keras.preprocessing.text import Tokenizer

text1 = 'some thing to eat'
text2 = 'some thing to drink'
texts = [text1, text2]

print(T.text_to_word_sequence(text1))  # ['some', 'thing', 'to', 'eat']
print(T.one_hot(text1, 10))  # e.g. [7, 9, 3, 4] (indices come from hashing, so they may vary)
print(T.one_hot(text2, 10))  # e.g. [7, 9, 3, 1]

tokenizer = Tokenizer(num_words=10)
tokenizer.fit_on_texts(texts)
print(tokenizer.word_counts)  # OrderedDict([('some', 2), ('thing', 2), ('to', 2), ('eat', 1), ('drink', 1)])
print(tokenizer.word_index)   # {'some': 1, 'thing': 2, 'to': 3, 'eat': 4, 'drink': 5}
print(tokenizer.word_docs)    # {'some': 2, 'thing': 2, 'to': 2, 'drink': 1, 'eat': 1}
print(tokenizer.index_docs)   # {1: 2, 2: 2, 3: 2, 4: 1, 5: 1}
print(tokenizer.texts_to_sequences(texts))  # [[1, 2, 3, 4], [1, 2, 3, 5]]
print(tokenizer.texts_to_matrix(texts))
# [[0. 1. 1. 1. 1. 0. 0. 0. 0. 0.]
#  [0. 1. 1. 1. 0. 1. 0. 0. 0. 0.]]

import keras.preprocessing.sequence as S
print(S.pad_sequences([[1, 2, 3]], 10, padding='post'))  # [[1 2 3 0 0 0 0 0 0 0]]

Author: vivian_ll. Source: CSDN, https://blog.csdn.net/vivian_ll/article/details/80795139 (original article by the blogger; please include a link to the original when reposting).
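pad_sequences appears only in passing above, so here is a slightly fuller sketch of its padding and truncation behaviour; the maxlen value and the example sequences are my own illustration, not from the original post:

from keras.preprocessing.sequence import pad_sequences

seqs = [[1, 2, 3], [4, 5, 6, 7, 8]]
# default: pad with zeros and truncate at the front ('pre')
print(pad_sequences(seqs, maxlen=4))
# [[0 1 2 3]
#  [5 6 7 8]]
# padding='post' pads at the end; truncating='post' drops the tail instead of the head
print(pad_sequences(seqs, maxlen=4, padding='post', truncating='post'))
# [[1 2 3 0]
#  [4 5 6 7]]

Post-padding is the usual choice when the sequences feed an Embedding layer with mask_zero or a model that expects the real tokens first.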