Example #1
import spacy

# Assumes an English spaCy pipeline; `txt` is the project's
# text-cleaning module, imported elsewhere in the codebase.
nlp = spacy.load('en_core_web_sm')

def tokenizer(text):
    # Clean the raw text, then split it with spaCy's tokenizer.
    text = txt.text_cleaner(text)
    tokens = [t.text for t in nlp.tokenizer(text)]
    # Pad short sequences to a fixed minimum of 6 tokens.
    if len(tokens) < 6:
        tokens += ['<PAD>'] * (6 - len(tokens))
    return tokens
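A quick check of the padding behaviour (hypothetical input; the exact tokens depend on what txt.text_cleaner returns):

print(tokenizer('Good movie'))
# e.g. ['Good', 'movie', '<PAD>', '<PAD>', '<PAD>', '<PAD>'] (padded to six tokens)
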
Example #2
# Load the IMDB sentiment dataset and decode the integer reviews back to text.
import numpy as np
from keras.datasets import imdb

# Vocabulary size cap; lower it if the machine's memory/compute is limited.
MAX_NB_WORDS = 75000

print("Load IMDB dataset....")
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=MAX_NB_WORDS)
word_index = imdb.get_word_index()
index_word = {v: k for k, v in word_index.items()}
X_train = [
    txt.text_cleaner(' '.join(index_word.get(w) for w in x)) for x in X_train
]
X_test = [
    txt.text_cleaner(' '.join(index_word.get(w) for w in x)) for x in X_test
]
X_train = np.array(X_train)
X_test = np.array(X_test)
print(X_train.shape)
num_classes = 2  # IMDB is a binary (positive/negative) sentiment task
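A quick sanity check of the decoding step (a sketch, assuming the code above has run):

# Peek at one decoded, cleaned review and its label (0 = negative, 1 = positive).
print(X_train[0][:100], '->', y_train[0])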

# Alternative dataset (commented out): 20 Newsgroups via scikit-learn.
# from sklearn.datasets import fetch_20newsgroups
#
# newsgroups_train = fetch_20newsgroups(subset='train')