def predict_one_sentence(self, sentence):
    self.__setup_model()
    # The word indices were saved with np.save, so .item() is needed to recover the dicts.
    self.en_word_index = np.load(self.BASIC_PERSISTENCE_DIR + '/en_word_index.npy').item()
    self.de_word_index = np.load(self.BASIC_PERSISTENCE_DIR + '/de_word_index.npy').item()

    en_tokenizer = Tokenizer(self.START_TOKEN, self.END_TOKEN, self.UNK_TOKEN,
                             num_words=self.params['MAX_WORDS_EN'])
    en_tokenizer.word_index = self.en_word_index
    en_tokenizer.num_words = self.params['MAX_WORDS_EN'] + 3

    de_tokenizer = Tokenizer(self.START_TOKEN, self.END_TOKEN, self.UNK_TOKEN,
                             num_words=self.params['MAX_WORDS_DE'])
    de_tokenizer.word_index = self.de_word_index
    de_tokenizer.num_words = self.params['MAX_WORDS_DE'] + 3

    print(sentence)
    sentence = en_tokenizer.texts_to_sequences([sentence], search_related_word=True)
    print(sentence)
    sentence = pad_sequences(sentence, maxlen=self.params['MAX_SEQ_LEN'],
                             padding='post', truncating='post')
    sentence = sentence.reshape(sentence.shape[0], sentence.shape[1])
    print(sentence)

    prediction = self.M.predict(sentence)

    # Map predicted token ids back to target-language words.
    predicted_sentence = ""
    reverse_word_index = dict((i, word) for word, i in self.de_word_index.items())
    for sentence in prediction:
        for token in sentence:
            max_idx = np.argmax(token)
            if max_idx == 0:
                print("id of max token = 0")
                print("second best prediction is",
                      reverse_word_index[np.argmax(np.delete(token, max_idx))])
            else:
                next_word = reverse_word_index[max_idx]
                if next_word == self.END_TOKEN:
                    break
                elif next_word == self.START_TOKEN:
                    continue
                predicted_sentence += next_word + " "
    return predicted_sentence
def calculate_hiddenstate_after_encoder(self, sentence):
    self.__setup_model()
    tokenizer = Tokenizer()
    # word_index was saved with np.save, so .item() recovers the dict.
    self.word_index = np.load(self.BASIC_PERSISTENCE_DIR + '/word_index.npy')
    self.word_index = self.word_index.item()
    tokenizer.word_index = self.word_index
    self.num_words = self.params['MAX_WORDS'] + 3
    tokenizer.num_words = self.num_words

    # Make sure the special tokens are part of the vocabulary.
    try:
        self.word_index[self.START_TOKEN]
        self.word_index[self.END_TOKEN]
        self.word_index[self.UNK_TOKEN]
    except Exception as e:
        print("Special token missing from word_index:", e)
        exit()

    sentence = tokenizer.texts_to_sequences([sentence])
    sentence = [self.word_index[self.START_TOKEN]] + sentence[0] + [self.word_index[self.END_TOKEN]]
    sentence = pad_sequences([sentence], maxlen=self.params['max_seq_length'], padding='post')
    sentence = sentence.reshape(sentence.shape[0], sentence.shape[1])

    # Run only the encoder part of the trained model.
    encoder_name = 'encoder'
    encoder = Model(inputs=self.M.input, outputs=self.M.get_layer(encoder_name).output)
    prediction = encoder.predict(sentence, batch_size=1)
    print(prediction.shape)
    return prediction
def predict_batch(self, sentences):
    self.__setup_model()
    tokenizer = Tokenizer()
    # word_index was saved with np.save, so .item() recovers the dict.
    self.word_index = np.load(self.BASIC_PERSISTENCE_DIR + '/word_index.npy')
    self.word_index = self.word_index.item()
    tokenizer.word_index = self.word_index
    self.num_words = self.params['MAX_WORDS'] + 3
    tokenizer.num_words = self.num_words

    # Make sure the special tokens are part of the vocabulary.
    try:
        self.word_index[self.START_TOKEN]
        self.word_index[self.END_TOKEN]
        self.word_index[self.UNK_TOKEN]
    except Exception as e:
        print("Special token missing from word_index:", e)
        exit()

    # Convert the sentences to sequences and wrap them in start/end tokens.
    sentences = tokenizer.texts_to_sequences(sentences)
    mod_sentences = []
    for sentence in sentences:
        mod_sentences.append([self.word_index[self.START_TOKEN]] + sentence + [self.word_index[self.END_TOKEN]])
    sentences = pad_sequences(mod_sentences, maxlen=self.params['max_seq_length'], padding='post')
    sentences = sentences.reshape(sentences.shape[0], sentences.shape[1])

    batch_size = sentences.shape[0]
    if batch_size > 10:
        batch_size = 10

    reverse_word_index = dict((i, word) for word, i in self.word_index.items())
    predicted_sentences = []
    from_idx = 0
    to_idx = batch_size
    while True:
        print("from_idx, to_idx, hm_sentences", from_idx, to_idx, sentences.shape[0])
        current_batch = sentences[from_idx:to_idx]
        prediction = self.M.predict(current_batch, batch_size=batch_size)
        for sentence in prediction:
            predicted_sent = ""
            for token in sentence:
                max_idx = np.argmax(token)
                if max_idx == 0:
                    print("id of max token = 0")
                    print("second best prediction is",
                          reverse_word_index[np.argmax(np.delete(token, max_idx))])
                else:
                    next_word = reverse_word_index[max_idx]
                    if next_word == self.END_TOKEN:
                        break
                    elif next_word == self.START_TOKEN:
                        continue
                    predicted_sent += next_word + " "
            predicted_sentences.append(predicted_sent)
        from_idx += batch_size
        to_idx += batch_size
        if to_idx > sentences.shape[0]:
            # TODO: also predict the final partial batch when the number of
            # sentences is not a multiple of batch_size.
            break
    return predicted_sentences
@classmethod
def tokenizer(cls, texts):
    tokenizer = Tokenizer(filters=DELIMITERS, lower=True)
    tokenizer.fit_on_texts(texts)
    corpus_vocabulary_size = len(tokenizer.word_index)
    # If the corpus vocabulary exceeds the maximum allowed size, cap it;
    # otherwise just use the corpus vocabulary size as `num_words` for the tokenizer.
    tokenizer.num_words = min(cls.MAX_VOCAB_SIZE, corpus_vocabulary_size)
    return tokenizer
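# A minimal, self-contained sketch (not part of the original code) of the pattern
# used throughout these snippets: fit a Keras Tokenizer first, then cap `num_words`
# afterwards so that texts_to_sequences only keeps the most frequent words.
# The sample texts below are purely illustrative.
from keras.preprocessing.text import Tokenizer

sample_texts = ["the cat sat on the mat", "the dog sat on the log", "the cat saw the dog"]

tok = Tokenizer()
tok.fit_on_texts(sample_texts)
print(len(tok.word_index))          # full corpus vocabulary size

tok.num_words = 4                   # only indices 1..3 (the 3 most frequent words) are kept
print(tok.texts_to_sequences(["the cat saw the dog"]))   # rarer words are silently dropped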
def calculate_hiddenstate_after_encoder(self, sentence):
    self.__setup_model()
    # The word indices were saved with np.save, so .item() recovers the dicts.
    self.en_word_index = np.load(self.BASIC_PERSISTENCE_DIR + '/en_word_index.npy').item()
    self.de_word_index = np.load(self.BASIC_PERSISTENCE_DIR + '/de_word_index.npy').item()

    en_tokenizer = Tokenizer(self.START_TOKEN, self.END_TOKEN, self.UNK_TOKEN,
                             num_words=self.params['MAX_WORDS_EN'])
    en_tokenizer.word_index = self.en_word_index
    en_tokenizer.num_words = self.params['MAX_WORDS_EN'] + 3

    de_tokenizer = Tokenizer(self.START_TOKEN, self.END_TOKEN, self.UNK_TOKEN,
                             num_words=self.params['MAX_WORDS_DE'])
    de_tokenizer.word_index = self.de_word_index
    de_tokenizer.num_words = self.params['MAX_WORDS_DE'] + 3

    print(sentence)
    sentence = en_tokenizer.texts_to_sequences([sentence])
    print(sentence)
    sentence = pad_sequences(sentence, maxlen=self.params['MAX_SEQ_LEN'],
                             padding='post', truncating='post')
    sentence = sentence.reshape(sentence.shape[0], sentence.shape[1])
    print(sentence)

    # Run only the encoder part of the trained model.
    encoder_name = 'encoder'
    encoder = Model(inputs=self.M.input, outputs=self.M.get_layer(encoder_name).output)
    prediction = encoder.predict(sentence, batch_size=1)
    print(prediction.shape)
    return prediction
def predict_one_sentence(self, sentence):
    self.__setup_model()
    tokenizer = Tokenizer()
    # word_index was saved with np.save, so .item() recovers the dict.
    self.word_index = np.load(self.BASIC_PERSISTENCE_DIR + '/word_index.npy')
    self.word_index = self.word_index.item()
    tokenizer.word_index = self.word_index
    self.num_words = self.params['MAX_WORDS'] + 3
    tokenizer.num_words = self.num_words

    # Make sure the special tokens are part of the vocabulary.
    try:
        self.word_index[self.START_TOKEN]
        self.word_index[self.END_TOKEN]
        self.word_index[self.UNK_TOKEN]
    except Exception as e:
        print("Special token missing from word_index:", e)
        exit()

    print(sentence)
    sentence = tokenizer.texts_to_sequences([sentence])
    print(sentence)
    sentence = [self.word_index[self.START_TOKEN]] + sentence[0] + [self.word_index[self.END_TOKEN]]
    print(sentence)
    sentence = pad_sequences([sentence], maxlen=self.params['max_seq_length'], padding='post')
    print(sentence)
    sentence = sentence.reshape(sentence.shape[0], sentence.shape[1])
    print(sentence)

    prediction = self.M.predict(sentence)

    # Map predicted token ids back to words.
    predicted_sentence = ""
    reverse_word_index = dict((i, word) for word, i in self.word_index.items())
    for sentence in prediction:
        for token in sentence:
            max_idx = np.argmax(token)
            if max_idx == 0:
                print("id of max token = 0")
                print("second best prediction is",
                      reverse_word_index[np.argmax(np.delete(token, max_idx))])
            else:
                next_word = reverse_word_index[max_idx]
                if next_word == self.END_TOKEN:
                    break
                elif next_word == self.START_TOKEN:
                    continue
                predicted_sentence += next_word + " "
    return predicted_sentence
def Get_Tokens(Train_Data):
    DataTokenizer = None
    TokensOk = False
    if os.path.exists('Tokens.pkl'):
        with open('Tokens.pkl', 'rb') as Handle:
            DataTokenizer = pickle.load(Handle)
        TokensOk = True
    if not TokensOk:
        # The tokens are computed once, at the start, from the training dataset.
        print("Tokenizer...")
        input("Confirm generating new tokens (1)?")
        input("Confirm generating new tokens (2)?")
        input("Confirm generating new tokens (3)?")
        DataTokenizer = Tokenizer(char_level=False, num_words=50000)
        print("Fitting tokenizer...")
        DataTokenizer.fit_on_texts(Train_Data.title.values)
        DataTokenizer.num_words = 50000
        with open('Tokens.pkl', 'wb') as handle:
            pickle.dump(DataTokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
    return DataTokenizer
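# Usage sketch for Get_Tokens (not part of the original code): the DataFrame below
# is a hypothetical stand-in for the real training data, which only needs a `title`
# column. Note: on the first run the function prompts for confirmation and writes Tokens.pkl.
import pandas as pd

train_df = pd.DataFrame({'title': ["first example title", "second example title"]})
tokenizer = Get_Tokens(train_df)
print(tokenizer.texts_to_sequences(["first example title"]))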
        dataY.append(entry[0])
    return dataX, dataY


dataset = load_data()
encoder = LabelEncoder()
encoder.fit(dataset['type'].values)

# 80/20 train/test split.
train = dataset.sample(frac=0.8, random_state=200)
test = dataset.drop(train.index)

tokenizer = Tokenizer(oov_token='<UNK>')
tokenizer.fit_on_texts(train['posts'].values)
tokenizer.num_words = 400

train['Y'] = encoder.transform(train['type'].values)
test['Y'] = encoder.transform(test['type'].values)

train_x, train_y = zip(*train[['posts', 'Y']].values)
test_x, test_y = zip(*test[['posts', 'Y']].values)
test_x = np.array(test_x)
test_y = np.array(test_y)
print(test_x)
print(test_y)
print("=======================>")
train_x = np.array(train_x)
train_y = np.array(train_y)
def predict_batch(self, sentences):
    self.__setup_model()
    # The word indices were saved with np.save, so .item() recovers the dicts.
    self.en_word_index = np.load(self.BASIC_PERSISTENCE_DIR + '/en_word_index.npy').item()
    self.de_word_index = np.load(self.BASIC_PERSISTENCE_DIR + '/de_word_index.npy').item()

    en_tokenizer = Tokenizer(self.START_TOKEN, self.END_TOKEN, self.UNK_TOKEN,
                             num_words=self.params['MAX_WORDS_EN'])
    en_tokenizer.word_index = self.en_word_index
    en_tokenizer.num_words = self.params['MAX_WORDS_EN'] + 3

    de_tokenizer = Tokenizer(self.START_TOKEN, self.END_TOKEN, self.UNK_TOKEN,
                             num_words=self.params['MAX_WORDS_DE'])
    de_tokenizer.word_index = self.de_word_index
    de_tokenizer.num_words = self.params['MAX_WORDS_DE'] + 3

    print(sentences)
    sentences = en_tokenizer.texts_to_sequences(sentences)
    print(sentences)
    sentences = pad_sequences(sentences, maxlen=self.params['MAX_SEQ_LEN'],
                              padding='post', truncating='post')
    sentences = sentences.reshape(sentences.shape[0], sentences.shape[1])

    batch_size = sentences.shape[0]
    if batch_size > 10:
        batch_size = 10

    # Map predicted token ids back to target-language words.
    reverse_word_index = dict((i, word) for word, i in self.de_word_index.items())
    predicted_sentences = []
    from_idx = 0
    to_idx = batch_size
    while True:
        print("from_idx, to_idx, hm_sentences", from_idx, to_idx, sentences.shape[0])
        current_batch = sentences[from_idx:to_idx]
        prediction = self.M.predict(current_batch, batch_size=batch_size)
        for sentence in prediction:
            predicted_sent = ""
            for token in sentence:
                max_idx = np.argmax(token)
                if max_idx == 0:
                    print("id of max token = 0")
                    print("second best prediction is",
                          reverse_word_index[np.argmax(np.delete(token, max_idx))])
                else:
                    next_word = reverse_word_index[max_idx]
                    if next_word == self.END_TOKEN:
                        break
                    elif next_word == self.START_TOKEN:
                        continue
                    predicted_sent += next_word + " "
            predicted_sentences.append(predicted_sent)
        from_idx += batch_size
        to_idx += batch_size
        if to_idx > sentences.shape[0]:
            # TODO: also predict the final partial batch when the number of
            # sentences is not a multiple of batch_size.
            break
    return predicted_sentences
if __name__ == '__main__':
    import tensorflow as tf
    import re
    import pickle
    from data_utils.constants import ALL_TEXTS
    from keras.preprocessing.text import Tokenizer

    with open(ALL_TEXTS, 'r') as file:
        word_tokenizer = Tokenizer(filters='\t\n', lower=True, oov_token='<UNK>')
        lines = file.readlines()
        word_tokenizer.fit_on_texts(lines)
        # Keep only words that occur at least 5 times; +1 because index 0 is reserved for padding.
        word_tokenizer.num_words = len(
            [x for x in word_tokenizer.word_counts.values() if x >= 5]) + 1
        word_tokenizer.word_index = dict(
            (k, v) for k, v in word_tokenizer.word_index.items()
            if v < word_tokenizer.num_words)
        word_tokenizer.word_index[word_tokenizer.oov_token] = word_tokenizer.num_words
        print(word_tokenizer.word_index)
        print('Word tokenizer num words:', word_tokenizer.num_words)
        print(word_tokenizer.texts_to_sequences(['asdf efas Huflit']))

        char_tokenizer = Tokenizer(filters='\t\n', lower=True, char_level=True, oov_token='<UNK>')
        char_tokenizer.fit_on_texts(re.sub(r'\s', '', line) for line in lines)

    with open('./output/word_tokenizer.pkl', 'wb') as file:
        pickle.dump(word_tokenizer, file, pickle.HIGHEST_PROTOCOL)
    with open('./output/char_tokenizer.pkl', 'wb') as file:
        pickle.dump(char_tokenizer, file, pickle.HIGHEST_PROTOCOL)
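# Follow-up sketch (not part of the original script): reloading the tokenizers
# pickled above, using the same output paths; the sample strings are illustrative.
import pickle

with open('./output/word_tokenizer.pkl', 'rb') as f:
    word_tokenizer = pickle.load(f)
with open('./output/char_tokenizer.pkl', 'rb') as f:
    char_tokenizer = pickle.load(f)

print(word_tokenizer.texts_to_sequences(['an example sentence']))
print(char_tokenizer.texts_to_sequences(['abc']))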
#%%
# One-hot encode the sentiment labels into separate positive/negative columns.
rev2['negative'] = rev2['sentiment']
rev2['negative'].replace([1, 0], [0, 1], inplace=True)
rev2 = rev2.rename(index=str, columns={"sentiment": "positive"})

# Remove English stop words from the reviews.
nltk.download('stopwords')
stop = stopwords.words("english")
rev2['review'] = rev2['review'].apply(
    lambda x: ' '.join([word for word in x.split() if word not in stop]))

review_train, review_test, label_train, label_test = train_test_split(
    rev2['review'], rev2.loc[:, ['positive', 'negative']],
    test_size=0.2, random_state=13, stratify=rev['sentiment'])

tokenizer = Tokenizer()
tokenizer.fit_on_texts(review_train)
tokenizer.num_words = 2000
X_train = tokenizer.texts_to_sequences(review_train)
X_test = tokenizer.texts_to_sequences(review_test)

vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of the reserved 0 index
maxlen = 15
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

# Convert the padded sequences back to text so they can be fed to TfidfVectorizer.
X_train = tokenizer.sequences_to_texts(X_train)
X_test = tokenizer.sequences_to_texts(X_test)
tfidf_vectorizer = TfidfVectorizer()
train = pd.read_csv('./data/toxic_comments/train.csv')
sentences = train["comment_text"].fillna("DUMMY_VALUE").values
possible_labels = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
targets = train[possible_labels].values

print("Max sentence length:", max(len(s) for s in sentences))
print("Min sentence length:", min(len(s) for s in sentences))
s = sorted(len(s) for s in sentences)
print("Median sequence length:", s[len(s) // 2])

tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, lower=True, oov_token='UNKOWN')
tokenizer.fit_on_texts(sentences)
tokenizer.num_words = MAX_VOCAB_SIZE
sequences = tokenizer.texts_to_sequences(sentences)
# print("sequences : ", sequences); exit()
word2idx = tokenizer.word_index
print("Number of unique words: %s" % len(word2idx))

# Sequences should be padded. The tokenizer indices start from 1, with 0 reserved
# for padding. The default for both padding and truncating is 'pre', which is what
# we want here: the last words matter most for predicting what comes after them.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
print("Shape of data tensor:", data.shape)

print("Filling pre-trained embeddings...")
num_words = min(
def get_tfidf(tokens, top_n_words):
    t = Tokenizer()
    t.fit_on_texts(tokens)
    # +1 because index 0 is reserved, so the top_n_words most frequent words are kept.
    t.num_words = top_n_words + 1
    tfidf = t.texts_to_matrix(tokens, mode='tfidf')
    return tfidf
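# Usage sketch for get_tfidf (not part of the original code); the documents below
# are purely illustrative.
docs = ["red green blue", "red red yellow", "green blue blue"]
matrix = get_tfidf(docs, top_n_words=3)
print(matrix.shape)   # (number of documents, top_n_words + 1); column 0 is unused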
def _split_count_data(self):
    self.input_texts = []
    self.target_texts = []
    with open(self.data_path, encoding='UTF-8') as f:
        lines = f.read().split('\n')
    for line in lines[: min(self.params['num_samples'], len(lines) - 1)]:
        input_text, target_text = line.split('\t')
        self.input_texts.append(input_text)
        self.target_texts.append(target_text)
    self.num_samples = len(self.input_texts)

    # Fit the tokenizer, then shift all indices by 3 to make room for the
    # special tokens at indices 1-3.
    tokenizer = Tokenizer(num_words=self.params['MAX_WORDS'])
    tokenizer.fit_on_texts(self.input_texts + self.target_texts)
    self.word_index = tokenizer.word_index
    for word in tokenizer.word_index:
        tokenizer.word_index[word] = tokenizer.word_index[word] + 3
    tokenizer.word_index[self.START_TOKEN] = 1
    tokenizer.word_index[self.END_TOKEN] = 2
    tokenizer.word_index[self.UNK_TOKEN] = 3
    tokenizer.num_words = tokenizer.num_words + 3
    self.word_index = tokenizer.word_index
    try:
        self.word_index[self.START_TOKEN]
        self.word_index[self.END_TOKEN]
        self.word_index[self.UNK_TOKEN]
    except Exception as e:
        print("Special token missing from word_index:", e)
        exit()

    # Convert texts to sequences and wrap them in start/end tokens.
    self.input_texts = tokenizer.texts_to_sequences(self.input_texts)
    self.target_texts = tokenizer.texts_to_sequences(self.target_texts)
    for idx in range(len(self.target_texts)):
        self.input_texts[idx] = [self.word_index[self.START_TOKEN]] + self.input_texts[idx] + \
                                [self.word_index[self.END_TOKEN]]
        self.target_texts[idx] = [self.word_index[self.START_TOKEN]] + self.target_texts[idx] + \
                                 [self.word_index[self.END_TOKEN]]
        if self.target_texts[idx][0] != 1:
            print(idx)
            print(self.target_texts[idx])
            exit(-1)
    self.input_texts = pad_sequences(self.input_texts, maxlen=self.params['max_seq_length'], padding='post')
    self.target_texts = pad_sequences(self.target_texts, maxlen=self.params['max_seq_length'], padding='post')

    # Load the pre-trained GloVe vectors.
    embeddings_index = {}
    filename = self.PRETRAINED_GLOVE_FILE
    with open(filename, 'r', encoding='utf8') as f:
        for line in f.readlines():
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    print('Found %s word vectors.' % len(embeddings_index))

    # Build the embedding matrix; words without a pre-trained vector get the UNK vector.
    self.num_words = self.params['MAX_WORDS'] + 3
    self.embedding_matrix = np.zeros((self.num_words, self.params['EMBEDDING_DIM']))
    for word, i in self.word_index.items():
        if i >= self.params['MAX_WORDS'] + 3 and word not in [self.START_TOKEN, self.END_TOKEN, self.UNK_TOKEN]:
            continue
        if word == self.START_TOKEN:
            embedding_vector = self.START_TOKEN_VECTOR
        elif word == self.END_TOKEN:
            embedding_vector = self.END_TOKEN_VECTOR
        else:
            embedding_vector = embeddings_index.get(word)
        if embedding_vector is None:
            embedding_vector = self.UNK_TOKEN_VECTOR
        self.embedding_matrix[i] = embedding_vector

    np.save(self.BASIC_PERSISTENCE_DIR + '/word_index.npy', self.word_index)
    np.save(self.BASIC_PERSISTENCE_DIR + '/embedding_matrix.npy', self.embedding_matrix)
def calculate_hiddenstate_after_encoder(self, sentence):
    # word_index was saved with np.save, so .item() recovers the dict.
    self.word_index = np.load(self.BASIC_PERSISTENCE_DIR + '/word_index.npy')
    self.word_index = self.word_index.item()
    self.num_words = self.params['MAX_WORDS'] + 3

    tokenizer = Tokenizer(num_words=self.params['MAX_WORDS'] + 3)
    tokenizer.word_index = self.word_index
    tokenizer.num_words = self.num_words

    # Make sure the special tokens are part of the vocabulary.
    try:
        self.word_index[self.START_TOKEN]
        self.word_index[self.END_TOKEN]
        self.word_index[self.UNK_TOKEN]
    except Exception as e:
        print("Special token missing from word_index:", e)
        exit()

    sentence = tokenizer.texts_to_sequences([sentence])
    sentence = pad_sequences(sentence, maxlen=self.params['max_seq_length'], padding='post')

    self.embedding_matrix = np.load(self.BASIC_PERSISTENCE_DIR + '/embedding_matrix.npy')

    # Define an input sequence and process it.
    encoder_inputs = Input(shape=(None,))
    encoder_embedding = Embedding(self.num_words, self.params['EMBEDDING_DIM'],
                                  weights=[self.embedding_matrix], mask_zero=True, trainable=False)
    encoder_embedded = encoder_embedding(encoder_inputs)
    encoder = LSTM(self.params['latent_dim'], return_state=True)
    encoder_outputs, state_h, state_c = encoder(encoder_embedded)
    # We discard `encoder_outputs` and only keep the states.
    encoder_states = [state_h, state_c]

    # Set up the decoder, using `encoder_states` as the initial state.
    decoder_inputs = Input(shape=(None,))
    decoder_embedding = Embedding(self.num_words, self.params['EMBEDDING_DIM'],
                                  weights=[self.embedding_matrix], mask_zero=True, trainable=False)
    decoder_embedded = decoder_embedding(decoder_inputs)
    # We set up our decoder to return full output sequences and internal states.
    # The returned states are not used in the training model, but they are used in inference.
    decoder_lstm = LSTM(self.params['latent_dim'], return_sequences=True, return_state=True)
    decoder_outputs, _, _ = decoder_lstm(decoder_embedded, initial_state=encoder_states)
    decoder_dense = Dense(self.num_words, activation='softmax')
    decoder_outputs = decoder_dense(decoder_outputs)

    # Define the model that turns `encoder_input_data` & `decoder_input_data`
    # into `decoder_target_data`, then load the trained weights.
    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
    model.summary()
    model.load_weights(self.LATEST_MODEL_CHKPT)

    # Define the sampling (inference) encoder model and run it on the sentence.
    self.encoder_model = Model(encoder_inputs, encoder_states)
    return self.encoder_model.predict(sentence)
pd_final_temporary.append(pd_ask_comm)

# Get the final pandas data frame with all the data needed.
pd_final = pd.concat(pd_final_temporary)
pd_depr_sub, pd_depr_comm, pd_ask_sub, pd_ask_comm = None, None, None, None
print("Step 3: Completed.\n")

# **********************
# *** SPLIT THE DATA ***
# **********************
text = pd_final['text'].values
label = pd_final['label'].values

print("Step 3.x: Cleaning the data of stop words to limit the data.")
tokenizer = Tokenizer(num_words=maxVocabulary)
tokenizer.num_words = 5000
tokenizer.fit_on_texts(text)

print("Step 4: Splitting the data for training and testing.")
sentences_train, sentences_test, y_train, y_test = train_test_split(
    text, label, test_size=testSize, shuffle=True, random_state=rand_state_splitter)
print("Step 4: Completed.\n")

print("Step 5: Text to sequence process for the training data.")
x_train = tokenizer.texts_to_sequences(sentences_train)
print("Step 5: Text to sequence process for the testing data.")
x_test = tokenizer.texts_to_sequences(sentences_test)