def tokenizer(text, min_len=6):
    """Clean *text* and split it into spaCy tokens, padding to a minimum length.

    Parameters
    ----------
    text : str
        Raw input text; passed through ``txt.text_cleaner`` first.
    min_len : int, optional
        Minimum number of tokens to return (default 6, matching the
        original hard-coded behavior). Shorter sequences are right-padded
        with the literal string ``'<PAD>'``.

    Returns
    -------
    list[str]
        Token texts produced by ``nlp.tokenizer``, padded with '<PAD>'
        up to at least ``min_len`` entries.
    """
    text = txt.text_cleaner(text)
    tokens = [t.text for t in nlp.tokenizer(text)]
    # Pad short sequences so downstream fixed-window models always see
    # at least `min_len` tokens.
    if len(tokens) < min_len:
        tokens.extend(['<PAD>'] * (min_len - len(tokens)))
    return tokens
# Vocabulary cap for the IMDB dataset; this depends on machine
# computation capacity.
MAX_NB_WORDS = 75000

from keras.datasets import imdb

print("Load IMDB dataset....")
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=MAX_NB_WORDS)

# Build id -> word lookup so the integer-encoded reviews can be turned
# back into text for the text-cleaning pipeline.
word_index = imdb.get_word_index()
index_word = {v: k for k, v in word_index.items()}

# NOTE(review): imdb.load_data uses index_from=3 by default, so ids 0-2
# are reserved (<pad>/<start>/<oov>) and every real word id is shifted
# by 3. Decoding with `index_word.get(w)` (as before) mapped every token
# to the wrong word; subtract the offset and drop reserved ids instead.
def _decode(ids):
    """Rebuild a review string from keras' offset word ids."""
    return ' '.join(index_word.get(w - 3, '') for w in ids)

# Clean the decoded reviews, then convert once to a flat 1-D array
# (the original converted to np.array twice before ravel()).
X_train = np.array([txt.text_cleaner(_decode(x)) for x in X_train]).ravel()
X_test = np.array([txt.text_cleaner(_decode(x)) for x in X_test]).ravel()
print(X_train.shape)

# IMDB sentiment is binary (positive / negative).
num_classes = 2