Example #1
    def predict_one_sentence(self, sentence):
        self.__setup_model()

        # .item() recovers the dict that np.save stored as a 0-d object array
        self.en_word_index = np.load(self.BASIC_PERSISTENCE_DIR +
                                     '/en_word_index.npy').item()
        self.de_word_index = np.load(self.BASIC_PERSISTENCE_DIR +
                                     '/de_word_index.npy').item()

        en_tokenizer = Tokenizer(self.START_TOKEN,
                                 self.END_TOKEN,
                                 self.UNK_TOKEN,
                                 num_words=self.params['MAX_WORDS_EN'])
        en_tokenizer.word_index = self.en_word_index
        en_tokenizer.num_words = self.params['MAX_WORDS_EN'] + 3

        de_tokenizer = Tokenizer(self.START_TOKEN,
                                 self.END_TOKEN,
                                 self.UNK_TOKEN,
                                 num_words=self.params['MAX_WORDS_DE'])
        de_tokenizer.word_index = self.de_word_index
        de_tokenizer.num_words = self.params['MAX_WORDS_DE'] + 3

        print(sentence)
        sentence = en_tokenizer.texts_to_sequences([sentence],
                                                   search_related_word=True)
        print(sentence)
        sentence = pad_sequences(sentence,
                                 maxlen=self.params['MAX_SEQ_LEN'],
                                 padding='post',
                                 truncating='post')
        sentence = sentence.reshape(sentence.shape[0], sentence.shape[1])
        print(sentence)

        prediction = self.M.predict(sentence)

        predicted_sentence = ""
        reverse_word_index = dict(
            (i, word) for word, i in self.de_word_index.items())
        for sentence in prediction:
            for token in sentence:
                max_idx = np.argmax(token)
                if max_idx == 0:
                    print("id of max token = 0")
                    # np.delete(token, 0) shifts the remaining indices down by one, hence the +1
                    print(
                        "second best prediction is ",
                        reverse_word_index[np.argmax(np.delete(token,
                                                               max_idx)) + 1])
                else:
                    next_word = reverse_word_index[max_idx]
                    if next_word == self.END_TOKEN:
                        break
                    elif next_word == self.START_TOKEN:
                        continue
                    predicted_sentence += next_word + " "

        return predicted_sentence

    def calculate_hiddenstate_after_encoder(self, sentence):
        self.__setup_model()

        tokenizer = Tokenizer()
        self.word_index = np.load(self.BASIC_PERSISTENCE_DIR + '/word_index.npy')
        self.word_index = self.word_index.item()
        tokenizer.word_index = self.word_index
        self.num_words = self.params['MAX_WORDS'] + 3
        tokenizer.num_words = self.num_words

        try:
            self.word_index[self.START_TOKEN]
            self.word_index[self.END_TOKEN]
            self.word_index[self.UNK_TOKEN]
        except Exception as e:
            print(e, "why")
            exit()

        sentence = tokenizer.texts_to_sequences([sentence])
        sentence = [self.word_index[self.START_TOKEN]] + sentence[0] + [self.word_index[self.END_TOKEN]]
        sentence = pad_sequences([sentence], maxlen=self.params['max_seq_length'], padding='post')
        sentence = sentence.reshape(sentence.shape[0], sentence.shape[1])

        encoder_name = 'encoder'

        encoder = Model(inputs=self.M.input, outputs=self.M.get_layer(encoder_name).output)

        prediction = encoder.predict(sentence, batch_size=1)
        print(prediction.shape)
        return prediction

    def predict_batch(self, sentences):
        self.__setup_model()

        tokenizer = Tokenizer()
        self.word_index = np.load(self.BASIC_PERSISTENCE_DIR + '/word_index.npy')
        self.word_index = self.word_index.item()
        tokenizer.word_index = self.word_index
        self.num_words = self.params['MAX_WORDS'] + 3
        tokenizer.num_words = self.num_words

        try:
            self.word_index[self.START_TOKEN]
            self.word_index[self.END_TOKEN]
            self.word_index[self.UNK_TOKEN]
        except Exception as e:
            print(e, "why")
            exit()

        sentences = tokenizer.texts_to_sequences(sentences)
        mod_sentences = []
        for sentence in sentences:
            mod_sentences.append([self.word_index[self.START_TOKEN]] + sentence + [self.word_index[self.END_TOKEN]])
        sentences = pad_sequences(mod_sentences, maxlen=self.params['max_seq_length'], padding='post')
        sentences = sentences.reshape(sentences.shape[0], sentences.shape[1])

        batch_size = sentences.shape[0]
        if batch_size > 10:
            batch_size = 10

        reverse_word_index = dict((i, word) for word, i in self.word_index.items())
        predicted_sentences = []
        from_idx = 0
        to_idx = batch_size
        while True:
            print("from_idx, to_idx, hm_sentences", from_idx, to_idx, sentences.shape[0])
            current_batch = sentences[from_idx:to_idx]
            prediction = self.M.predict(current_batch, batch_size=batch_size)

            for sentence in prediction:
                predicted_sent = ""
                for token in sentence:
                    max_idx = np.argmax(token)
                    if max_idx == 0:
                        print("id of max token = 0")
                        print("second best prediction is ", reverse_word_index[np.argmax(np.delete(token, max_idx))])
                    else:
                        next_word = reverse_word_index[max_idx]
                        if next_word == self.END_TOKEN:
                            break
                        elif next_word == self.START_TOKEN:
                            continue
                        predicted_sent += next_word + " "
                predicted_sentences.append(predicted_sent)
            from_idx += batch_size
            to_idx += batch_size
            if to_idx > sentences.shape[0]:
                # TODO: also predict the tail batch when the sentence count is not a multiple of batch_size
                break
        return predicted_sentences
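The `+ 3` offsets above come from this project's convention of reserving the three lowest indices for its start, end, and unknown tokens (the index shifting is visible in `_split_count_data` under Example #13 below). A minimal sketch of that convention with the stock Keras `Tokenizer`; the token strings, the corpus, and the MAX_WORDS value are illustrative, not taken from the project:

from keras.preprocessing.text import Tokenizer

START_TOKEN, END_TOKEN, UNK_TOKEN = '_start_', '_end_', '_unk_'   # assumed names
MAX_WORDS = 20000                                                 # assumed limit

tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(["a tiny example corpus", "another example sentence"])

# Shift every learned index up by 3 so that 1-3 stay free for the special tokens.
for word in tokenizer.word_index:
    tokenizer.word_index[word] += 3
tokenizer.word_index[START_TOKEN] = 1
tokenizer.word_index[END_TOKEN] = 2
tokenizer.word_index[UNK_TOKEN] = 3

# num_words must grow by the same 3, otherwise texts_to_sequences would now
# drop the three most frequent real words.
tokenizer.num_words = MAX_WORDS + 3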
Example #4
    def tokenizer(cls, texts):
        tokenizer = Tokenizer(filters=DELIMITERS, lower=True)
        tokenizer.fit_on_texts(texts)

        corpus_vocabulary_size = len(tokenizer.word_index)
        # If our vocabulary size exceeds the maximum allowed vocab size we
        # need to limit it to a smaller number. Otherwise we just use it as
        # `num_words` for our tokenize
        tokenizer.num_words = min(cls.MAX_VOCAB_SIZE, corpus_vocabulary_size)

        return tokenizer
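Reassigning `tokenizer.num_words` after fitting works because the attribute is only consulted by the conversion methods: `texts_to_sequences` keeps a word only if its index is strictly below `num_words` and drops (or, with an `oov_token`, remaps) everything rarer. A small self-contained illustration with an invented corpus:

from keras.preprocessing.text import Tokenizer

texts = ["the cat sat on the mat", "the dog sat"]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

tokenizer.num_words = 3            # keep only indices 1 and 2 ('the' and 'sat')
print(tokenizer.word_index)        # the full index is left untouched
print(tokenizer.texts_to_sequences(texts))   # rarer words are silently dropped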
Example #5
    def calculate_hiddenstate_after_encoder(self, sentence):
        self.__setup_model()

        self.en_word_index = np.load(self.BASIC_PERSISTENCE_DIR +
                                     '/en_word_index.npy').item()
        self.de_word_index = np.load(self.BASIC_PERSISTENCE_DIR +
                                     '/de_word_index.npy').item()

        en_tokenizer = Tokenizer(self.START_TOKEN,
                                 self.END_TOKEN,
                                 self.UNK_TOKEN,
                                 num_words=self.params['MAX_WORDS_EN'])
        en_tokenizer.word_index = self.en_word_index
        en_tokenizer.num_words = self.params['MAX_WORDS_EN'] + 3

        de_tokenizer = Tokenizer(self.START_TOKEN,
                                 self.END_TOKEN,
                                 self.UNK_TOKEN,
                                 num_words=self.params['MAX_WORDS_DE'])
        de_tokenizer.word_index = self.de_word_index
        de_tokenizer.num_words = self.params['MAX_WORDS_DE'] + 3

        print(sentence)
        sentence = en_tokenizer.texts_to_sequences([sentence])
        print(sentence)
        sentence = pad_sequences(sentence,
                                 maxlen=self.params['MAX_SEQ_LEN'],
                                 padding='post',
                                 truncating='post')
        sentence = sentence.reshape(sentence.shape[0], sentence.shape[1])
        print(sentence)

        encoder_name = 'encoder'

        encoder = Model(inputs=self.M.input,
                        outputs=self.M.get_layer(encoder_name).output)

        prediction = encoder.predict(sentence, batch_size=1)
        print(prediction.shape)
        return prediction

    def predict_one_sentence(self, sentence):
        self.__setup_model()
        tokenizer = Tokenizer()
        self.word_index = np.load(self.BASIC_PERSISTENCE_DIR + '/word_index.npy')
        self.word_index = self.word_index.item()
        tokenizer.word_index = self.word_index
        self.num_words = self.params['MAX_WORDS'] + 3
        tokenizer.num_words = self.num_words

        try:
            self.word_index[self.START_TOKEN]
            self.word_index[self.END_TOKEN]
            self.word_index[self.UNK_TOKEN]
        except Exception as e:
            print(e, "why")
            exit()
        print(sentence)
        sentence = tokenizer.texts_to_sequences([sentence])
        print(sentence)
        sentence = [self.word_index[self.START_TOKEN]] + sentence[0] + [self.word_index[self.END_TOKEN]]
        print(sentence)
        sentence = pad_sequences([sentence], maxlen=self.params['max_seq_length'], padding='post')
        print(sentence)
        sentence = sentence.reshape(sentence.shape[0], sentence.shape[1])
        print(sentence)
        prediction = self.M.predict(sentence)

        predicted_sentence = ""
        reverse_word_index = dict((i, word) for word, i in self.word_index.items())
        for sentence in prediction:
            for token in sentence:
                max_idx = np.argmax(token)
                if max_idx == 0:
                    print("id of max token = 0")
                    print("second best prediction is ", reverse_word_index[np.argmax(np.delete(token, max_idx))])
                else:
                    next_word = reverse_word_index[max_idx]
                    if next_word == self.END_TOKEN:
                        break
                    elif next_word == self.START_TOKEN:
                        continue
                    predicted_sentence += next_word + " "

        return predicted_sentence
Example #7
def Get_Tokens(Train_Data):
    DataTokenizer = None

    TokensOk = False
    if os.path.exists('Tokens.pkl'):
        with open('Tokens.pkl', 'rb') as Handle:
            DataTokenizer = pickle.load(Handle)
            TokensOk = True

    if not TokensOk:  # The tokens are computed once up front from the train dataset
        print("Tokenizer...")
        input("Confirm generating new tokens 1?")
        input("Confirm generating new tokens 2?")
        input("Confirm generating new tokens 3?")
        DataTokenizer = Tokenizer(char_level=False, num_words=50000)
        print("Fitting tokenizer...")
        DataTokenizer.fit_on_texts(Train_Data.title.values)
        DataTokenizer.num_words = 50000
        with open('Tokens.pkl', 'wb') as handle:
            pickle.dump(DataTokenizer,
                        handle,
                        protocol=pickle.HIGHEST_PROTOCOL)

    return DataTokenizer
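Since fitting on the full training set is the slow part, caching the fitted tokenizer as above lets later runs go straight to conversion (the later `DataTokenizer.num_words = 50000` only restates the value already passed to the constructor). A short usage sketch; it reuses the `Tokens.pkl` name from the snippet, and the `titles` list is just an illustrative input:

import pickle

with open('Tokens.pkl', 'rb') as handle:
    data_tokenizer = pickle.load(handle)

titles = ["an example product title"]                  # illustrative input
sequences = data_tokenizer.texts_to_sequences(titles)  # uses the cached vocabulary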
Example #8
		dataY.append(entry[0])

	return dataX, dataY

dataset = load_data()

encoder = LabelEncoder()
encoder.fit(dataset['type'].values)


train = dataset.sample(frac=0.8, random_state=200)
test = dataset.drop(train.index)
  
tokenizer = Tokenizer(oov_token='<UNK>')
tokenizer.fit_on_texts(train['posts'].values)
tokenizer.num_words = 400
  
train['Y'] = encoder.transform(train['type'].values)
test['Y'] = encoder.transform(test['type'].values)

train_x, train_y = zip(*train[['posts', 'Y']].values)

test_x, test_y = zip(*test[['posts', 'Y']].values)
test_x = np.array(test_x)
test_y = np.array(test_y)
print(test_x)
print(test_y)

print("=======================>")
train_x = np.array(train_x)
train_y = np.array(train_y)
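Because this tokenizer was built with `oov_token='<UNK>'`, capping `num_words` does not silently discard rare words at conversion time; they are remapped to the `<UNK>` index instead (recent Keras versions assign the OOV token index 1 when fitting). A tiny self-contained illustration of that behaviour:

from keras.preprocessing.text import Tokenizer

tok = Tokenizer(oov_token='<UNK>')
tok.fit_on_texts(["the cat sat on the mat", "the dog sat"])

tok.num_words = 4   # with recent Keras this keeps '<UNK>', 'the' and 'sat' (indices 1-3)
print(tok.word_index)
print(tok.texts_to_sequences(["the zebra sat"]))   # unseen and rare words -> '<UNK>' id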
Example #9
    def predict_batch(self, sentences):
        self.__setup_model()

        self.en_word_index = np.load(self.BASIC_PERSISTENCE_DIR +
                                     '/en_word_index.npy').item()
        self.de_word_index = np.load(self.BASIC_PERSISTENCE_DIR +
                                     '/de_word_index.npy').item()

        en_tokenizer = Tokenizer(self.START_TOKEN,
                                 self.END_TOKEN,
                                 self.UNK_TOKEN,
                                 num_words=self.params['MAX_WORDS_EN'])
        en_tokenizer.word_index = self.en_word_index
        en_tokenizer.num_words = self.params['MAX_WORDS_EN'] + 3

        de_tokenizer = Tokenizer(self.START_TOKEN,
                                 self.END_TOKEN,
                                 self.UNK_TOKEN,
                                 num_words=self.params['MAX_WORDS_DE'])
        de_tokenizer.word_index = self.de_word_index
        de_tokenizer.num_words = self.params['MAX_WORDS_DE'] + 3

        print(sentences)
        sentences = en_tokenizer.texts_to_sequences(sentences)
        print(sentences)
        sentences = pad_sequences(sentences,
                                  maxlen=self.params['MAX_SEQ_LEN'],
                                  padding='post',
                                  truncating='post')
        sentences = sentences.reshape(sentences.shape[0], sentences.shape[1])

        batch_size = sentences.shape[0]
        if batch_size > 10:
            batch_size = 10

        reverse_word_index = dict(
            (i, word) for word, i in self.de_word_index.items())
        predicted_sentences = []
        from_idx = 0
        to_idx = batch_size
        while True:
            print("from_idx, to_idx, hm_sentences", from_idx, to_idx,
                  sentences.shape[0])
            current_batch = sentences[from_idx:to_idx]
            prediction = self.M.predict(current_batch, batch_size=batch_size)

            for sentence in prediction:
                predicted_sent = ""
                for token in sentence:
                    max_idx = np.argmax(token)
                    if max_idx == 0:
                        print("id of max token = 0")
                        # np.delete(token, 0) shifts the remaining indices down by one, hence the +1
                        print(
                            "second best prediction is ",
                            reverse_word_index[np.argmax(
                                np.delete(token, max_idx)) + 1])
                    else:
                        next_word = reverse_word_index[max_idx]
                        if next_word == self.END_TOKEN:
                            break
                        elif next_word == self.START_TOKEN:
                            continue
                        predicted_sent += next_word + " "
                predicted_sentences.append(predicted_sent)
            from_idx += batch_size
            to_idx += batch_size
            if to_idx > sentences.shape[0]:
                # TODO: also predict the tail batch when the sentence count is not a multiple of batch_size
                break
        return predicted_sentences
Example #10
if __name__ == '__main__':
    import tensorflow as tf
    import re
    import pickle
    from data_utils.constants import ALL_TEXTS
    from keras.preprocessing.text import Tokenizer
    with open(ALL_TEXTS, 'r') as file:
        word_tokenizer = Tokenizer(
            filters='\t\n', lower=True, oov_token='<UNK>'
        )
        lines = file.readlines()
        word_tokenizer.fit_on_texts(lines)
        word_tokenizer.num_words = len(
            [x for x in word_tokenizer.word_counts.values() if x >= 5]) + 1
        word_tokenizer.word_index = dict(
            (k, v) for k, v in word_tokenizer.word_index.items() if v < word_tokenizer.num_words)
        word_tokenizer.word_index[word_tokenizer.oov_token] = word_tokenizer.num_words
        print(word_tokenizer.word_index)
        print('Word tokenizer num words:', word_tokenizer.num_words)
        print(word_tokenizer.texts_to_sequences(['asdf efas Huflit']))
        char_tokenizer = Tokenizer(
            filters='\t\n', lower=True, char_level=True, oov_token='<UNK>'
        )
        char_tokenizer.fit_on_texts(re.sub(r'\s', '', line) for line in lines)
        with open('./output/word_tokenizer.pkl', 'wb') as file:
            pickle.dump(word_tokenizer, file, pickle.HIGHEST_PROTOCOL)
        with open('./output/char_tokenizer.pkl', 'wb') as file:
            pickle.dump(char_tokenizer, file, pickle.HIGHEST_PROTOCOL)
Example #11
#%%
"""one hot"""

rev2['negative']=rev2['sentiment']
rev2['negative'].replace([1,0], [0,1], inplace = True)
rev2=rev2.rename(index=str, columns={"sentiment": "positive"})
"""stop word"""
nltk.download('stopwords')
stop = stopwords.words("english") 
rev2['review'] = rev2['review'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

review_train, review_test, label_train, label_test = train_test_split(rev2['review'],rev2.loc[:,['positive','negative']], test_size=0.2,random_state=13,stratify=rev['sentiment'])

tokenizer = Tokenizer()
tokenizer.fit_on_texts(review_train)
tokenizer.num_words = 2000
X_train = tokenizer.texts_to_sequences(review_train)
X_test = tokenizer.texts_to_sequences(review_test)

vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index


maxlen = 15

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

X_train = tokenizer.sequences_to_texts(X_train)
X_test = tokenizer.sequences_to_texts(X_test)

tfidf_vectorizer = TfidfVectorizer()
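Here the Keras tokenizer is only used to cap the vocabulary at 2000 words and truncate each review to 15 tokens; the round trip through `sequences_to_texts` rebuilds plain strings so that scikit-learn's `TfidfVectorizer` can take over. A self-contained sketch of how that last step typically looks; the `docs` list stands in for the rebuilt review strings:

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["the cat sat", "the dog sat on the mat"]    # stand-ins for the rebuilt reviews
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(docs)        # fit on training text only
X_test_tfidf = tfidf_vectorizer.transform(["the cat ran"])  # reuse the fitted vocabulary
print(X_train_tfidf.shape)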
Example #12
train = pd.read_csv('./data/toxic_comments/train.csv')
sentences = train["comment_text"].fillna("DUMMY_VALUE").values
possible_labels = [
    "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"
]
targets = train[possible_labels].values

print("Max sentence length:", max(len(s) for s in sentences))
print("Min sentence length:", min(len(s) for s in sentences))
s = sorted(len(s) for s in sentences)
print("Median sequence length:", s[len(s) // 2])

tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, lower=True, oov_token='UNKNOWN')
tokenizer.fit_on_texts(sentences)
tokenizer.num_words = MAX_VOCAB_SIZE
sequences = tokenizer.texts_to_sequences(sentences)
# print ("sequences : ", sequences); exit()

word2idx = tokenizer.word_index
print("Number of unique words:%s" % len(word2idx))

# sequences should be padded
data = pad_sequences(
    sequences, maxlen=MAX_SEQUENCE_LENGTH
)  # 1. Pre-padding is default 2. Tokenizer indexes start from 1 and 0 is reserved for padding
# IMPORTANT: Default for padding and truncating is 'pre', because the last words are more important when we are going to take into account the prediction after the last word
print("Shaoe of data tensor: ", data.shape)

print("Filling pre-trained embeddings...")
num_words = min(
Example #13
def get_tfidf(tokens, top_n_words):
    t = Tokenizer()
    t.fit_on_texts(tokens)
    t.num_words = top_n_words + 1
    tfidf = t.texts_to_matrix(tokens, 'tfidf')
    return tfidf
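`texts_to_matrix(..., 'tfidf')` returns a dense matrix with one row per document and `num_words` columns, which is why `top_n_words + 1` is assigned first: word indices start at 1, so column 0 is never used. A small usage sketch of the function above with an invented token list:

tokens = ["the cat sat on the mat", "the dog sat"]   # invented documents
tfidf = get_tfidf(tokens, top_n_words=3)
print(tfidf.shape)   # (2, 4): one row per document, columns 0..3, column 0 stays empty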
    def _split_count_data(self):
        self.input_texts = []
        self.target_texts = []
        lines = open(self.data_path, encoding='UTF-8').read().split('\n')
        for line in lines[: min(self.params['num_samples'], len(lines) - 1)]:
            input_text, target_text = line.split('\t')
            self.input_texts.append(input_text)
            self.target_texts.append(target_text)
        self.num_samples = len(self.input_texts)
        tokenizer = Tokenizer(num_words=self.params['MAX_WORDS'])
        tokenizer.fit_on_texts(self.input_texts + self.target_texts)
        self.word_index = tokenizer.word_index
        for word in tokenizer.word_index:
            tokenizer.word_index[word] = tokenizer.word_index[word] + 3
        tokenizer.word_index[self.START_TOKEN] = 1
        tokenizer.word_index[self.END_TOKEN] = 2
        tokenizer.word_index[self.UNK_TOKEN] = 3
        tokenizer.num_words = tokenizer.num_words + 3
        self.word_index = tokenizer.word_index

        try:
            self.word_index[self.START_TOKEN]
            self.word_index[self.END_TOKEN]
            self.word_index[self.UNK_TOKEN]
        except Exception as e:
            print(e, "why")
            exit()

        self.input_texts = tokenizer.texts_to_sequences(self.input_texts)
        self.target_texts = tokenizer.texts_to_sequences(self.target_texts)
        for idx in range(len(self.target_texts)):
            self.input_texts[idx] = [self.word_index[self.START_TOKEN]] + self.input_texts[idx] + [
                self.word_index[self.END_TOKEN]]
            self.target_texts[idx] = [self.word_index[self.START_TOKEN]] + self.target_texts[idx] + [
                self.word_index[self.END_TOKEN]]
            if self.target_texts[idx][0] != 1:
                print(idx)
                print(self.target_texts[idx])
                exit(-1)

        self.input_texts = pad_sequences(self.input_texts, maxlen=self.params['max_seq_length'], padding='post')
        self.target_texts = pad_sequences(self.target_texts, maxlen=self.params['max_seq_length'], padding='post')

        embeddings_index = {}
        filename = self.PRETRAINED_GLOVE_FILE
        with open(filename, 'r', encoding='utf8') as f:
            for line in f.readlines():
                values = line.split()
                word = values[0]
                coefs = np.asarray(values[1:], dtype='float32')
                embeddings_index[word] = coefs

        print('Found %s word vectors.' % len(embeddings_index))

        self.num_words = self.params['MAX_WORDS'] + 3
        self.embedding_matrix = np.zeros((self.num_words, self.params['EMBEDDING_DIM']))
        for word, i in self.word_index.items():
            if i >= self.params['MAX_WORDS'] + 3 and word not in [self.START_TOKEN, self.END_TOKEN, self.UNK_TOKEN]:
                continue
            embedding_vector = None
            if word == self.START_TOKEN:
                embedding_vector = self.START_TOKEN_VECTOR
            elif word == self.END_TOKEN:
                embedding_vector = self.END_TOKEN_VECTOR
            else:
                embedding_vector = embeddings_index.get(word)
            if embedding_vector is None:
                embedding_vector = self.UNK_TOKEN_VECTOR
            self.embedding_matrix[i] = embedding_vector
        np.save(self.BASIC_PERSISTENCE_DIR + '/word_index.npy', self.word_index)
        np.save(self.BASIC_PERSISTENCE_DIR + '/embedding_matrix.npy', self.embedding_matrix)

    def calculate_hiddenstate_after_encoder(self, sentence):
        self.word_index = np.load(self.BASIC_PERSISTENCE_DIR +
                                  '/word_index.npy')
        self.word_index = self.word_index.item()
        self.num_words = self.params['MAX_WORDS'] + 3
        tokenizer = Tokenizer(num_words=self.params['MAX_WORDS'] + 3)
        tokenizer.word_index = self.word_index
        tokenizer.num_words = self.num_words

        try:
            self.word_index[self.START_TOKEN]
            self.word_index[self.END_TOKEN]
            self.word_index[self.UNK_TOKEN]
        except Exception as e:
            print(e, "why")
            exit()

        sentence = tokenizer.texts_to_sequences([sentence])
        sentence = pad_sequences(sentence,
                                 maxlen=self.params['max_seq_length'],
                                 padding='post')

        self.embedding_matrix = np.load(self.BASIC_PERSISTENCE_DIR +
                                        '/embedding_matrix.npy')

        # Define an input sequence and process it.
        encoder_inputs = Input(shape=(None, ))
        encoder_embedding = Embedding(self.num_words,
                                      self.params['EMBEDDING_DIM'],
                                      weights=[self.embedding_matrix],
                                      mask_zero=True,
                                      trainable=False)
        encoder_embedded = encoder_embedding(encoder_inputs)
        encoder = LSTM(self.params['latent_dim'], return_state=True)
        encoder_outputs, state_h, state_c = encoder(encoder_embedded)
        # We discard `encoder_outputs` and only keep the states.
        encoder_states = [state_h, state_c]

        # Set up the decoder, using `encoder_states` as initial state.
        decoder_inputs = Input(shape=(None, ))
        decoder_embedding = Embedding(self.num_words,
                                      self.params['EMBEDDING_DIM'],
                                      weights=[self.embedding_matrix],
                                      mask_zero=True,
                                      trainable=False)
        decoder_embedded = decoder_embedding(decoder_inputs)
        # We set up our decoder to return full output sequences,
        # and to return internal states as well. We don't use the
        # return states in the training model, but we will use them in inference.
        decoder_lstm = LSTM(self.params['latent_dim'],
                            return_sequences=True,
                            return_state=True)
        decoder_outputs, _, _ = decoder_lstm(decoder_embedded,
                                             initial_state=encoder_states)
        decoder_dense = Dense(self.num_words, activation='softmax')
        decoder_outputs = decoder_dense(decoder_outputs)

        # Define the model that will turn
        # `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
        model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

        # Run training
        model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
        model.summary()

        model.load_weights(self.LATEST_MODEL_CHKPT)

        # Define sampling our_implementation.models
        self.encoder_model = Model(encoder_inputs, encoder_states)

        return self.encoder_model.predict(sentence)
Example #16
    pd_final_temporary.append(pd_ask_comm)
# Get the final pandas data frame with all the data needed
pd_final = pd.concat(pd_final_temporary)
pd_depr_sub, pd_depr_comm, pd_ask_sub, pd_ask_comm = None, None, None, None
print("Step 3: Completed.\n")

# **********************
# *** SPLIT THE DATA ***
# **********************
text = pd_final['text'].values
label = pd_final['label'].values

print("Step 3.x: Cleaning the data of stopwords to limit the data.")

tokenizer = Tokenizer(num_words=maxVocabulary)
tokenizer.num_words = 5000
tokenizer.fit_on_texts(text)

print("Step 4: Splitting the data for training and testing")
sentences_train, sentences_test, y_train, y_test = train_test_split(
    text,
    label,
    test_size=testSize,
    shuffle=True,
    random_state=rand_state_splitter)
print("Step 4: Completed.\n")

print("Step: 5: Text to sequence process for the training data.")
x_train = tokenizer.texts_to_sequences(sentences_train)
print("Step: 5: Text to sequence process for the testing data.")
x_test = tokenizer.texts_to_sequences(sentences_test)
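After the `texts_to_sequences` calls the sequences still have ragged lengths, so the usual next step is padding before they are fed to a model. A sketch of that step, continuing with the snippet's `x_train` and `x_test`; the `maxlen` value is an assumption, not taken from the snippet:

from keras.preprocessing.sequence import pad_sequences

maxlen = 100                                                # assumed value
x_train = pad_sequences(x_train, padding='post', maxlen=maxlen)
x_test = pad_sequences(x_test, padding='post', maxlen=maxlen)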