# Load pretrained GloVe word vectors for preprocessing
filename = './data/glove.6B.300d.txt'
print('Indexing GloVe 6B 300D word vectors.')
embeddings_index = {}
with open(filename, encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))

print('Vectorizing input text')
# Vectorize the input text (both negative and positive)
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(x_text)
sequences = tokenizer.texts_to_sequences(x_text)
word_index = tokenizer.word_index
print(len(word_index))
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = to_categorical(np.asarray(y))

# Split the data into a training set and a validation set
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=VALIDATION_SPLIT)

print('Preparing embedding matrix')
# Prepare embedding matrix
num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
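# A minimal sketch of the step that typically follows: filling the embedding
# matrix row by row from embeddings_index. This continuation is an assumption,
# not part of the original snippet; EMBEDDING_DIM is assumed to be 300 to match
# glove.6B.300d.txt.
EMBEDDING_DIM = 300
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= num_words:
        continue  # skip words beyond the num_words cap
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words absent from the GloVe index stay all-zeros
        embedding_matrix[i] = embedding_vector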
line = line.replace(")", "") line = line.replace("/", "") line = line.replace("\\", "") line = line.replace("&", "") line = line.replace("#", "") line = re.sub('\d', '', line) line = line.split(' ') line = [w for w in line if not w in stop_words] line = str(line) line = str(line.strip())[1:-1].replace(' ', ' ') strings.append(line) #encode text as numbers tok_Len = 100000 # max number of words for tokenizer tokenizer = Tokenizer(num_words=tok_Len) tokenizer.fit_on_texts(strings) sequences = tokenizer.texts_to_sequences(strings) term_Index = tokenizer.word_index print('Number of Terms:', len(term_Index)) sen_Len = 162 # max length of each sentences, including padding tok_Features = pad_sequences(sequences, padding='post', maxlen=sen_Len - 111) print('Shape of tokenized features tensor:', tok_Features.shape) indices = np.arange(tok_Features.shape[0]) np.random.shuffle(indices) time_series = df['created_at_retweets'] time_series.reset_index(drop=True, inplace=True) print(type(time_series)) time_series = time_series[indices] tok_Features = tok_Features[indices]
import seaborn as sns

# Loading the cleaned data csv file
path = "../input/cleaned-data-for-nlp-news-classification/cleaned_data.csv"
data = pd.read_csv(path)

# Tokenizing
vocab_size = 10000
embedding_dim = 32
max_length = 150
trunc_type = 'post'
oov_tok = '<OOV>'
padding_post = 'post'

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok, split=' ')
tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(data['text'].values)
X = pad_sequences(X, padding=padding_post, maxlen=max_length, truncating=trunc_type)

# Building Model
keras.backend.clear_session()
model = tf.keras.Sequential([
    keras.layers.Embedding(vocab_size, embedding_dim, input_length=X.shape[1]),
    keras.layers.Bidirectional(
"""<a href="https://colab.research.google.com/github/lmoroney/dlaicourse/blob/master/TensorFlow%20In%20Practice/Course%203%20-%20NLP/Course%203%20-%20Week%201%20-%20Lesson%201.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##### Copyright 2019 The TensorFlow Authors.
"""

#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from tensorflow.keras.preprocessing.text import Tokenizer

sentences = ['i love my dog', 'I, love my cat', 'You love my dog!']

tokenizer = Tokenizer(num_words=100)  # only count the top 100 most frequent words
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
print(word_index)
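# A short follow-on sketch (not in the original lesson snippet): converting the
# same sentences to integer sequences with the fitted tokenizer. The default
# filters strip punctuation and lowercase the text, so 'dog' and 'dog!' map to
# the same index; the exact indices depend on word frequency order.
sequences = tokenizer.texts_to_sequences(sentences)
print(sequences)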
    training_labels.append(l.numpy())
for s, l in test:
    testing_sentences.append(s.numpy().decode('utf8'))
    testing_labels.append(l.numpy())

vocab_size = 10000
oov = 'OOV'
truncate = 'post'
maxlen = 300
embedding_dim = 10
output = []

tokenizer = Tokenizer(oov_token=oov, num_words=vocab_size)
tokenizer.fit_on_texts(training_sentences)
train_seq = tokenizer.texts_to_sequences(training_sentences)
train_pad = pad_sequences(train_seq, maxlen=maxlen, truncating=truncate)
test_seq = tokenizer.texts_to_sequences(testing_sentences)
test_pad = pad_sequences(test_seq, maxlen=maxlen, truncating=truncate)

def check_sentences(token_s):
    # map indices back to words; fall back to '?' for ids missing from index_word
    return ' '.join([tokenizer.index_word.get(i, '?') for i in token_s])

print(np.array(train_seq[0]).reshape(-1,))
print(check_sentences(train_seq[0]))

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=maxlen),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(6, activation='relu'),
train_dir = r"D:\data\csv_file\amazon_len_renew\amazon_1000_renew.csv" test_dir = r"D:\data\csv_file\amazon_len_renew\amazon_test.csv" glove_100_dir = "D:/data/glove.6B/glove.6B.100d.txt" original_train_df = pd.read_csv(train_dir) original_test_df = pd.read_csv(test_dir) original_test_df, original_val_df = train_test_split(original_test_df, test_size=0.4, random_state=0) x = original_train_df['review'] y = original_train_df['label'] t = Tokenizer() t.fit_on_texts(x) vocab_size = len(t.word_index) + 1 sequences = t.texts_to_sequences(x) def max_text(): for i in range(1, len(sequences)): max_length = len(sequences[0]) if len(sequences[i]) > max_length: max_length = len(sequences[i]) return max_length text_num = max_text() maxlen = text_num
def get_training_data(intents_file_path):
    # Load the cached, tokenized training data if it exists.
    try:
        with open('token_data.pickle', 'rb') as f:
            training_set, training_labels, word_count, max_sequence_len = pickle.load(f)
        return training_set, training_labels, word_count, max_sequence_len
    except (FileNotFoundError, EOFError):  # cache miss: rebuild from the intents file
        with open(intents_file_path) as f:
            data = json.load(f)

        # Parse the data
        sentences = []
        labels = []
        sentences_y = []
        for intent in data["intents"]:
            for pattern in intent["patterns"]:
                sentences.append(pattern)
                sentences_y.append(intent["tag"])  # so we have a tag associated with the pattern
            if intent["tag"] not in labels:
                labels.append(intent["tag"])

        # Create tokenizer
        tokenizer = Tokenizer(oov_token="<OOV>")
        tokenizer.fit_on_texts(sentences)
        word_count = len(tokenizer.word_index) + 1
        # print(tokenizer.word_index)

        # Tokenize and pad
        sequences = tokenizer.texts_to_sequences(sentences)
        max_sequence_len = max([len(x) for x in sequences])
        padded_sequences = pad_sequences(sequences, maxlen=max_sequence_len, truncating='post')

        # Label tokenizer
        label_tokenizer = Tokenizer()
        label_tokenizer.fit_on_texts(labels)
        tok_labels = np.array(label_tokenizer.texts_to_sequences(sentences_y))

        # One-hot encode the labels
        training_labels = np.zeros((padded_sequences.shape[0], len(labels)))
        for i in range(padded_sequences.shape[0]):
            training_labels[i][tok_labels[i] - 1] = 1

        with open('token_data.pickle', 'wb') as f:
            pickle.dump((padded_sequences, training_labels, word_count, max_sequence_len), f)
        with open('raw_data.pickle', 'wb') as f:
            pickle.dump((sentences, labels, sentences_y), f)
        return padded_sequences, training_labels, word_count, max_sequence_len
    texts_false = f.readlines()
texts_false[0] = texts_false[0].replace('\ufeff', '')  # strip the BOM from the first line

texts = texts_true + texts_false
count_true = len(texts_true)
count_false = len(texts_false)
total_lines = count_true + count_false
print(count_true, count_false, total_lines)

maxWordsCount = 100000
tokenizer = Tokenizer(num_words=maxWordsCount,
                      filters='!–"—#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\r«»',
                      lower=True, split=' ', char_level=False)
tokenizer.fit_on_texts(texts)

max_text_len = 30
data = tokenizer.texts_to_sequences(texts)
data_pad = pad_sequences(data, maxlen=max_text_len)
print(data_pad.shape)

X = data_pad
Y = np.array([[1, 0]] * count_true + [[0, 1]] * count_false)
print(X.shape, Y.shape)

# shuffle samples and labels together
indices = np.random.choice(X.shape[0], size=X.shape[0], replace=False)
X = X[indices]
Y = Y[indices]

with open('dataset/test.txt', 'r', encoding='utf-8') as f:
for tagged_sentence in tagged_sentences:  # load the 14,041 sentence samples one at a time
    sentence, tag_info = zip(*tagged_sentence)  # words go into sentence, NER tags into tag_info
    sentences.append(list(sentence))  # keep only the word information from each sample
    ner_tags.append(list(tag_info))  # keep only the NER tag information from each sample

print('Maximum sample length: %d' % max(len(l) for l in sentences))
print('Average sample length: %f' % (sum(map(len, sentences)) / len(sentences)))
plt.hist([len(s) for s in sentences], bins=50)
plt.xlabel('length of samples')
plt.ylabel('number of samples')
plt.show()

max_words = 10000
src_tokenizer = Tokenizer()
src_tokenizer.fit_on_texts(sentences)
tar_tokenizer = Tokenizer()
tar_tokenizer.fit_on_texts(ner_tags)

vocab_size = max_words
tag_size = len(tar_tokenizer.word_index) + 1
print('Vocabulary size: {}'.format(vocab_size))
print('NER tag set size: {}'.format(tag_size))

X_train = src_tokenizer.texts_to_sequences(sentences)
y_train = tar_tokenizer.texts_to_sequences(ner_tags)

index_to_word = src_tokenizer.index_word
index_to_ner = tar_tokenizer.index_word
              inplace=True)
    df[colname] = df[colname].str.lower()  # convert to lower case
    return df

df_train = prepro('training')
df_train['Class'].value_counts().plot(kind="bar", rot=0)
df_train['TEXT'] = df_train['TEXT'].apply(lambda x: ' '.join([
    lemmatizer.lemmatize(word) for word in set(x.split())
    if word not in estopwords
]))

MAX_VOCABS = 5000
tokenizer = Tokenizer(num_words=MAX_VOCABS)
tokenizer.fit_on_texts(df_train['TEXT'])
x_train = tokenizer.texts_to_sequences(df_train['TEXT'])
MAX_LEN = max([len(i) for i in x_train])
vocab_size = MAX_VOCABS + 1
x_train = pad_sequences(x_train, padding='post', maxlen=MAX_LEN, value=vocab_size)

# convert integers to dummy variables (i.e. one-hot encoded)
y_train = pd.get_dummies(df_train['Class']).values
dummy_columns = pd.get_dummies(df_train['Class']).columns
dummy_columns = dummy_columns.tolist()

x_train, x_val, y_train, y_val = train_test_split(x_train, y_train,
class BugModelClient:

    oov_token = '<OOV>'
    vocab_size = None
    embedding_dim = 50
    training_portion = 0.8
    max_length = 100
    num_epochs = 8
    dropout = 0.2
    class_weight = {0: 1, 1: 2}
    data_path = 'datasets/training_dataset_pairs.csv'
    tokenizer_path = 'models/tokenizer.pickle'
    custom_glove_path = 'datasets/custom_glove_50d.txt'
    data = None
    training_size = None
    word_index = None
    tokenizer = None
    embedding_matrix = None
    bug_model = BugModel()

    def init_data(self, data_count):
        self.data = pd.read_csv(self.data_path, sep=',')
        self.data = self.data[:data_count]
        print(len(self.data.index))
        self.data['clean_description_1'] = self.clean_descriptions(self.data['description_1'])
        self.data['clean_description_2'] = self.clean_descriptions(self.data['description_2'])
        self.training_size = int(len(self.data.index) * self.training_portion)
        X1_train, X1_test, X2_train, X2_test, y_train, y_test = train_test_split(
            self.data['clean_description_1'], self.data['clean_description_2'],
            self.data['duplicates'], test_size=0.2)
        self.tokenizer = Tokenizer(oov_token=self.oov_token)
        self.tokenizer.fit_on_texts(X1_train)
        self.tokenizer.fit_on_texts(X2_train)
        self.word_index = self.tokenizer.word_index
        print(len(self.word_index))
        self.vocab_size = len(self.word_index) + 1
        self.X1_train = np.array(text_to_padded(X1_train, self.tokenizer, self.max_length))
        self.X1_test = np.array(text_to_padded(X1_test, self.tokenizer, self.max_length))
        self.X2_train = np.array(text_to_padded(X2_train, self.tokenizer, self.max_length))
        self.X2_test = np.array(text_to_padded(X2_test, self.tokenizer, self.max_length))
        self.y_train = y_train
        self.y_test = y_test

    def prepare_embedding(self):
        embeddings_index = dict()
        with open(self.custom_glove_path, encoding='utf8') as f:
            for line in f:
                values = line.split()
                word = values[0]
                coefs = np.asarray(values[1:], dtype='float32')
                embeddings_index[word] = coefs
        print('Loaded %s word vectors.' % len(embeddings_index))
        embeddings_matrix = np.zeros((self.vocab_size, self.embedding_dim))
        for word, i in self.tokenizer.word_index.items():
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                embeddings_matrix[i] = embedding_vector
        self.embedding_matrix = embeddings_matrix

    def save_tokenizer(self):
        with open(self.tokenizer_path, 'wb') as handle:
            pickle.dump(self.tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def load_tokenizer(self):
        with open(self.tokenizer_path, 'rb') as handle:
            self.tokenizer = pickle.load(handle)
        self.word_index = self.tokenizer.word_index
        self.vocab_size = len(self.word_index) + 1
        print('Loaded tokenizer with %s words.' % self.vocab_size)

    def clean_descriptions(self, descriptions):
        return descriptions.apply(lambda x: clean_text(x))

    def train_model(self):
        self.bug_model.construct_model(self.vocab_size, self.embedding_dim,
                                       self.max_length, self.dropout, self.embedding_matrix)
        self.bug_model.fit_model([self.X1_train, self.X2_train], self.y_train,
                                 [self.X1_test, self.X2_test], self.y_test,
                                 self.num_epochs, self.class_weight)

    def plot_graphs(self):
        self.bug_model.plot_graphs()

    def save_model(self):
        self.bug_model.save_model()
        self.save_tokenizer()

    def load_model(self):
        self.bug_model.load_model()
        self.load_tokenizer()

    def predict(self, descriptions1, descriptions2):
        descriptions1 = np.array(text_to_padded(self.clean_descriptions(descriptions1),
                                                self.tokenizer, self.max_length))
        descriptions2 = np.array(text_to_padded(self.clean_descriptions(descriptions2),
                                                self.tokenizer, self.max_length))
        return self.bug_model.predict([descriptions1, descriptions2])

    def validate_predict_top_k(self, descriptions, labels, master_labels,
                               all_descriptions, all_labels, all_master_labels, k):
        descriptions = np.array(text_to_padded(self.clean_descriptions(descriptions),
                                               self.tokenizer, self.max_length))
        all_descriptions = np.array(text_to_padded(self.clean_descriptions(all_descriptions),
                                                   self.tokenizer, self.max_length))
        print(labels)
        all_predictions = []
        for index, description in enumerate(descriptions):
            print(index)
            # score the current description against every known description
            description_repeated = np.full((len(all_descriptions), self.max_length), description)
            predictions = self.bug_model.predict([description_repeated, all_descriptions])
            predictions = np.array([prediction[0] for prediction in predictions])
            predictions_top_indices = (-predictions).argsort()
            prediction_summary = []
            top_k_master_labels = []
            for pred_index in predictions_top_indices:
                if len(top_k_master_labels) >= k:
                    break
                if all_master_labels[pred_index] not in top_k_master_labels:
                    top_k_master_labels.append(all_master_labels[pred_index])
                prediction_summary.append({'case_id': all_labels[pred_index],
                                           'master_id': all_master_labels[pred_index],
                                           'probability': predictions[pred_index]})
            did_predict = (master_labels[index] in top_k_master_labels
                           if master_labels[index] != labels[index]
                           else master_labels[index] not in top_k_master_labels)
            for n, pred_index in enumerate(predictions_top_indices):
                if all_master_labels[pred_index] == master_labels[index]:
                    print('Correct target for {} with id {} in position {} with probability of {}'
                          .format(labels[index], all_labels[pred_index], n, predictions[pred_index]))
            all_predictions.append({
                'case_id': labels[index],
                'master_id': master_labels[index],
                'predictions': prediction_summary,
                'correct': did_predict
            })
        return {'predictions': all_predictions,
                'recall': len([p for p in all_predictions if p['correct']]) / len(all_predictions)}
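# A hedged usage sketch for the client above, kept as comments because the
# BugModel internals and dataset paths belong to the surrounding project:
#
#     client = BugModelClient()
#     client.init_data(data_count=10000)
#     client.prepare_embedding()
#     client.train_model()
#     client.save_model()
#     preds = client.predict(pd.Series(['crash on startup']),
#                            pd.Series(['app crashes when opened']))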
# X_train = train_data['tokenized'].values
# import h5py
loaded_model = load_model('snhs_rnn.h5')
# with h5py.File('snhs_rnn_dr02.h5', mode='r') as f:
#     # instantiate model
#     model_config = f.attrs.get('model_config')
#     print(model_config)

data_excel = pd.read_excel('201126_이노션샘플데이터.xlsx')
print(data_excel.head(5))
# data_excel.columns = ['Text']
# print(data_excel.head(5))
print(len(data_excel['reviews']))

x_save_load = np.load('X_save2.npy', allow_pickle=True)
tokenizer = Tokenizer(vocab_size, oov_token="OOV")
tokenizer.fit_on_texts(x_save_load)

f = open('result.txt', 'w', encoding='utf8')
f2 = open('result3.txt', 'w', encoding='utf8')
f.write('sentiment (test)\n')
f2.write('sentiment (test)\n')

def sentiment_predict(new_sentence):
    new_sentence = preprocword(new_sentence)
    # print(new_sentence)
    # print([new_sentence])
    encoded = tokenizer.texts_to_sequences([new_sentence])  # integer encoding
    # print(encoded)
    pad_new = pad_sequences(encoded, maxlen=max_len, truncating='post')  # padding
train_y = xdf['label']  # creating the labels array
sentences = xdf['tweet']

from tensorflow.keras.preprocessing.text import Tokenizer  # to tokenize the text
from tensorflow.keras.preprocessing.sequence import pad_sequences  # to pad uneven-length sequences

# Hyperparameters
num_words = 10000
pad_type = 'post'
oov_token = "<OOV>"  # out-of-vocabulary token
embedding_dim = 16
max_length = 250

tokenizer = Tokenizer(num_words=num_words, oov_token=oov_token)  # create a tokenizer object
tokenizer.fit_on_texts(sentences)  # fit_on_texts() builds the vocabulary from the text feature
word_index = tokenizer.word_index  # the word-to-index dictionary
sequences = tokenizer.texts_to_sequences(sentences)  # texts_to_sequences() converts the tokens into integer sequences
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')  # pad the sequences; padding type is "post"

pdf = pd.DataFrame(padded_sequences)

from sklearn.model_selection import train_test_split
train_x, test_x, train_y, test_y = train_test_split(pdf, train_y, test_size=0.1,
                                                    random_state=10, shuffle=True)  # split into train and test sets
# # print('sysevr embedding')
# print(sysevr_emb_dict['const'])

# Tokenize corpus
ast_tokenizer = Tokenizer()
# cg_tokenizer = Tokenizer()
# bcg_tokenizer = Tokenizer()
# fcg_tokenizer = Tokenizer()
# sysevr_tokenizer = Tokenizer()

print("tokenizing asts")
# Fit tokenizers
ast_tokenizer.fit_on_texts(ast_data)
# print("tokenizing cgs")
# bcg_tokenizer.fit_on_texts(back_slices_data)
# fcg_tokenizer.fit_on_texts(forward_slices_data)
#
# print("tokenizing sysevr")
# sysevr_tokenizer.fit_on_texts(sysevr_data)

#################################################

print("creating ast sequence")
ast_sequences = my_preprocessing.ast_sequence(ast_data)
# print("creating cg sequence")
# bcg_sequences = bcg_tokenizer.texts_to_sequences(back_slices_data)
# fcg_sequences = fcg_tokenizer.texts_to_sequences(forward_slices_data)
class PreProcessor:

    def __init__(self, sentences, ner_tags, val_sentences, val_ner_tags,
                 oov_token: str = "<OOV>"):
        self._sentences = sentences
        self._ner_tags = ner_tags
        self._val_sentences = val_sentences
        self._val_ner_tags = val_ner_tags
        self._input_sequences = None
        self._label_sequences = None
        self._val_input_sequences = None
        self._val_label_sequences = None
        self._tokenizer = Tokenizer(oov_token=oov_token)
        self._label_tokenizer = Tokenizer()
        self._max_sequence_length = None

    def pre_process_data(self):
        self._pre_process_train_input_sequences()
        self._pre_process_train_label_sequences()
        self._pre_process_validation_input_sequences()
        self._pre_process_validation_label_sequences()

    def _pre_process_train_input_sequences(self):
        self._input_sequences = self._pre_process_input_sequence(self._sentences)

    def _pre_process_train_label_sequences(self):
        self._label_sequences = self._pre_process_label_sequences(self._ner_tags)

    def _pre_process_validation_input_sequences(self):
        self._val_input_sequences = self._pre_process_input_sequence(
            self._val_sentences, validation=True)

    def _pre_process_validation_label_sequences(self):
        self._val_label_sequences = self._pre_process_label_sequences(
            self._val_ner_tags, validation=True)

    def _pre_process_input_sequence(self, sentences: list, validation=False) -> np.ndarray:
        if not validation:
            self._tokenizer.fit_on_texts(sentences)
            self._compute_max_sequence_len(sentences)
        input_sequences = self._tokenizer.texts_to_sequences(sentences)
        padded_input_sequences = pad_sequences(
            input_sequences, padding='post', maxlen=self._max_sequence_length)
        return np.array(padded_input_sequences)

    def _pre_process_label_sequences(self, ner_tags: list, validation: bool = False) -> np.ndarray:
        if not validation:
            self._label_tokenizer.fit_on_texts(ner_tags)
        label_sequences = self._label_tokenizer.texts_to_sequences(ner_tags)
        padded_label_sequences = pad_sequences(
            label_sequences, padding='post', maxlen=self._max_sequence_length)
        return np.array(padded_label_sequences)

    def _compute_max_sequence_len(self, input_sequences: list):
        seq_lengths = [len(seq) for seq in input_sequences]
        self._max_sequence_length = max(seq_lengths)

    @property
    def input_sequences(self):
        return self._input_sequences

    @property
    def label_sequences(self):
        return self._label_sequences

    @property
    def val_input_sequences(self):
        return self._val_input_sequences

    @property
    def val_label_sequences(self):
        return self._val_label_sequences

    @property
    def num_unique_word_tokens(self):
        return len(self._tokenizer.word_index) + 1  # +1 for padding token

    @property
    def num_unique_label_tokens(self):
        return len(self._label_tokenizer.word_index)

    @property
    def max_sequence_length(self):
        return self._max_sequence_length

    @property
    def label_index_to_words_dict(self):
        return self._label_tokenizer.index_word
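# A hedged usage sketch (not part of the original class): the sentence and tag
# lists here are illustrative stand-ins for whatever NER corpus the project loads.
train_sents = [['john', 'lives', 'in', 'london'], ['mary', 'works', 'at', 'acme']]
train_tags = [['B-PER', 'O', 'O', 'B-LOC'], ['B-PER', 'O', 'O', 'B-ORG']]
pre = PreProcessor(train_sents, train_tags, train_sents, train_tags)
pre.pre_process_data()
print(pre.input_sequences.shape, pre.label_sequences.shape)
print(pre.num_unique_word_tokens, pre.max_sequence_length)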
def preprocess_news_data(filename):
    print('Preprocessing news...')
    all_texts = []
    category_map = {}
    titles = []
    abstracts = []
    categories = []
    with open(filename, 'r') as f:
        for l in f:
            id, category, subcategory, title, abstract, url, entity = l.strip('\n').split('\t')
            title = title.lower()
            abstract = abstract.lower()
            # all_texts.append(word_tokenize(title))
            # all_texts.append(word_tokenize(abstract))
            all_texts.append(title + ". " + abstract)
            # map every subcategory to a number
            if subcategory not in category_map:
                category_map[subcategory] = len(category_map)
            titles.append(title)
            abstracts.append(abstract)
            categories.append(subcategory)

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(all_texts)
    word_index = tokenizer.word_index  # a dict: word_index[word] = index
    print('Found %s unique tokens.' % len(word_index))
    print('Found %s unique categories.' % len(category_map))
    # print(word_index)

    # title
    news_title = np.zeros((len(titles), MAX_TITLE_LENGTH), dtype='int32')
    for i, title in enumerate(titles):
        wordTokens = text_to_word_sequence(title)
        k = 0
        for _, word in enumerate(wordTokens):
            if k < MAX_TITLE_LENGTH:
                news_title[i, k] = word_index[word]
                k = k + 1

    # abstract
    news_abstract = np.zeros((len(abstracts), MAX_ABSTRACT_LENGTH), dtype='int32')
    for i, abstract in enumerate(abstracts):
        wordTokens = text_to_word_sequence(abstract)
        k = 0
        for _, word in enumerate(wordTokens):
            if k < MAX_ABSTRACT_LENGTH:
                news_abstract[i, k] = word_index[word]
                k = k + 1

    # category & subcategory
    news_category = []
    for category in categories:
        news_category.append(category_map[category])
    news_category = to_categorical(np.asarray(news_category))

    return word_index, category_map, news_category, news_abstract, news_title
for row in reader:
    labels.append(row[0])
    sentence = row[1]
    for word in stopwords:
        token = " " + word + " "
        sentence = sentence.replace(token, " ")
        sentence = sentence.replace("  ", " ")  # collapse the double spaces left behind
    sentences.append(sentence)

print(len(sentences))
print(sentences[0])

tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
print(len(word_index))
# Expected output
# 29714

sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, padding='post')
print(padded[0])
print(padded.shape)
class GatedRecurrentUnit(object):

    def __init__(self, max_tokens=5000, embedding_size=8, num_words=10000,
                 model=None, tokenizer=None):
        self.x_train = []
        self.y_train = []
        self.x_train_tokens = []
        self.list_label = []
        self.model = model
        self.num_words = num_words
        self.max_tokens = max_tokens
        self.embedding_size = embedding_size
        self.tokenizer = tokenizer
        self.summary = True
        self.verbose = 1
        self.epoch = 5
        self.validation_split = 0.1

    def one_hot_encoder(self, y):
        self.list_label = list(set(y))
        label = np.zeros([len(y), len(self.list_label)])
        for i in range(len(y)):
            label[i][self.list_label.index(y[i])] = 1
        return label

    def model_gru(self):
        self.model = Sequential()
        self.model.add(Embedding(input_dim=self.num_words,
                                 output_dim=self.embedding_size,
                                 input_length=self.max_tokens,
                                 name='Embedding_Layer'))
        self.model.add(GRU(units=4))
        self.model.add(Dense(len(self.list_label), activation='softmax',
                             name='Output_layer'))
        self.model.compile(loss='binary_crossentropy',
                           optimizer=Adam(lr=0.001),
                           metrics=['accuracy'])
        if self.summary:
            print(self.model.summary())
        self.model.fit(self.x_train_tokens, self.y_train,
                       epochs=self.epoch,
                       validation_split=self.validation_split,
                       verbose=self.verbose)

    def text_to_seq(self, x):
        temp = self.tokenizer.texts_to_sequences([x])
        return pad_sequences(temp, maxlen=self.max_tokens,
                             padding='pre', truncating='pre')

    def fit(self, x_train, y_train, epoch=5, validation_split=0.1, verbose=1):
        self.x_train = x_train
        self.y_train = y_train
        self.epoch = epoch
        self.validation_split = validation_split
        self.verbose = verbose
        self.tokenizer = Tokenizer(num_words=self.num_words)
        self.tokenizer.fit_on_texts(self.x_train)
        self.x_train_tokens = self.tokenizer.texts_to_sequences(self.x_train)
        self.x_train_tokens = pad_sequences(self.x_train_tokens,
                                            maxlen=self.max_tokens,
                                            padding='pre', truncating='pre')
        # if type(self.y_train[0]) == str or type(self.y_train[0]) == int:
        self.y_train = np.array(self.one_hot_encoder(y_train))
        self.model_gru()

    def save_model(self, filename='model'):
        model_json = self.model.to_json()
        with open(os.path.join(os.getcwd(), 'Model/Output_model/{}.json'.format(filename)), 'w') as json_file:
            json_file.write(model_json)
        self.model.save_weights(
            os.path.join(os.getcwd(), 'Model/Output_model/{}.h5'.format(filename)))
        joblib.dump(self.tokenizer,
                    os.path.join(os.getcwd(), 'Model/Output_model/{}_tokenizer.joblib'.format(filename)))
        joblib.dump(self.list_label,
                    os.path.join(os.getcwd(), 'Model/Output_model/{}_list_label.joblib'.format(filename)))
        joblib.dump(self.max_tokens,
                    os.path.join(os.getcwd(), 'Model/Output_model/{}_max_tokens.joblib'.format(filename)))

    def load_model(self, filename='model'):
        with open(os.path.join(os.getcwd(), 'Model/Output_model/{}.json'.format(filename)), 'r') as json_file:
            loaded_model_json = json_file.read()
        self.model = model_from_json(loaded_model_json)
        self.model.load_weights(
            os.path.join(os.getcwd(), 'Model/Output_model/{}.h5'.format(filename)))
        self.tokenizer = joblib.load(
            os.path.join(os.getcwd(), 'Model/Output_model/{}_tokenizer.joblib'.format(filename)))
        self.list_label = joblib.load(
            os.path.join(os.getcwd(), 'Model/Output_model/{}_list_label.joblib'.format(filename)))
        self.max_tokens = joblib.load(
            os.path.join(os.getcwd(), 'Model/Output_model/{}_max_tokens.joblib'.format(filename)))
        self.model.compile(loss='binary_crossentropy',
                           optimizer=Adam(lr=0.001),
                           metrics=['accuracy'])

    def predict(self, x):
        return self.model.predict(x)

    def predict_classes(self, x):
        return self.model.predict_classes(x)

    def score(self, x_test, y_test):
        return self.model.evaluate(x_test, y_test)
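# A hedged usage sketch for the wrapper above, with tiny illustrative data
# (assumes the same imports the class already relies on: Sequential, Embedding,
# GRU, Dense, Adam, Tokenizer, pad_sequences, numpy):
gru = GatedRecurrentUnit(max_tokens=100, num_words=5000)
gru.fit(['great product', 'terrible quality', 'love it'],
        ['pos', 'neg', 'pos'], epoch=1)
print(gru.predict(gru.text_to_seq('really great')))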
def create_tokenizer():
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(MidiParser.vocabulary().keys())
    return tokenizer
class Decomp_tokenizer(object):

    def __init__(self):
        self.Tokenizer_args = Tokenizer(num_words=100, split=' ')
        self.Tokenizer_instr = Tokenizer(num_words=100, split='\n')
        self.label_mapping = {}

    def fit_instr(self, data):
        # fdata = random.sample(data, 50)
        self.Tokenizer_instr.fit_on_texts(data)

    def fit_args(self, data):
        data = self.no_newline(data)
        self.Tokenizer_args.fit_on_texts(data)

    def fit_label(self, data):
        mapping = {}
        unique = list(set(data))
        for index, i in enumerate(unique):
            a = [0] * len(unique)
            a[index] = 1
            mapping[i] = a
        print("SAVE THIS PLEASE")
        print(mapping)
        print("_________________________________________")
        self.label_mapping = mapping

    def tokenizeLabels(self, all):
        outs = []
        for i in all:
            outs.append(self.label_mapping.get(i))
        return outs

    def tokenize_labels_to_file(self, filename, data):
        outs = []
        ddict = {}
        for i in data:
            outs.append([self.label_mapping.get(i)])
        ddict["data"] = outs
        with open(filename + ".json", "+w") as f:
            f.write(json.dumps(ddict))

    def read_data_from_file(self, filename):
        with open(filename + ".json", 'r') as f:
            jdata = json.load(f)
        keys = jdata['data']
        print("[*Reading in {}.json*]".format(filename))
        return keys

    def tokenize_data_to_file(self, filename, data):
        # The original referenced an undefined self.Tokenizer here;
        # the instruction tokenizer is assumed.
        tokens = self.Tokenizer_instr.texts_to_sequences(tqdm(data))
        out = {}
        print("writing to dict")
        out["data"] = tokens
        print("writing dataset into file: {}.json".format(filename))
        with open(filename + ".json", '+w') as f:
            f.write(json.dumps(out))

    def tokenize_args(self, data):
        data = self.no_newline(data)
        return self.Tokenizer_args.texts_to_sequences(tqdm(data))

    def tokenize_instr(self, data):
        return self.Tokenizer_instr.texts_to_sequences(tqdm(data))

    def no_newline(self, data):
        return [i.replace("\n", " ") for i in data]

    def __str__(self):
        return str(self.Tokenizer_instr.word_index)

    def save_status(self):
        with open("Tokenizer_args_data.json", '+w') as f:
            f.write(json.dumps(self.Tokenizer_args.to_json()))
        with open("Tokenizer_instr_data.json", '+w') as f:
            f.write(json.dumps(self.Tokenizer_instr.to_json()))
        with open("Label_data.json", '+w') as f:
            f.write(json.dumps(self.label_mapping))

    def recover_status(self):
        # Restore what save_status wrote (the original read a mismatched
        # "Tokenizer_data.json" into an undefined self.Tokenizer).
        with open("Label_data.json", 'r') as f:
            self.label_mapping = json.load(f)
        with open("Tokenizer_args_data.json", 'r') as f:
            self.Tokenizer_args = keras.preprocessing.text.tokenizer_from_json(json.load(f))
        with open("Tokenizer_instr_data.json", 'r') as f:
            self.Tokenizer_instr = keras.preprocessing.text.tokenizer_from_json(json.load(f))
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>'

complaints = dataset["phrase"].values
labels = dataset[["prompt"]].values

# train_test_split returns (X_train, X_test, y_train, y_test) in that order
X_train, X_test, y_train, y_test = train_test_split(complaints, labels, test_size=0.20, random_state=42)
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
dict(list(word_index.items())[0:10])

train_seq = tokenizer.texts_to_sequences(X_train)
train_padded = pad_sequences(train_seq, maxlen=max_length,
                             padding=padding_type, truncating=trunc_type)

# the validation sequences come from the held-out phrases, not the labels
validation_seq = tokenizer.texts_to_sequences(X_test)
validation_padded = pad_sequences(validation_seq, maxlen=max_length,
                                  padding=padding_type, truncating=trunc_type)
        x.append(text)
        y.append(int(record['is_sarcastic']))
    return x, y

# DO NOT CHANGE THIS CODE OR THE TESTS MAY NOT WORK
vocab_size = 1000
max_length = 120
trunc_type = 'post'
padding_type = 'post'
oov_tok = "<OOV>"
training_size = 20000

sentences = []
labels = []

x, y = fetch_data()
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(x)
x_seq = tokenizer.texts_to_sequences(x)
x_seq = pad_sequences(x_seq, maxlen=max_length, padding='post', truncating='post')

random_sel = random.sample(range(200, len(x_seq)), 2000)
x_test = x_seq[random_sel]
y_test = np.array(y)[random_sel]

model = models.load_model('sarcasm.h5')
print('testing model..')
print(model.evaluate(x_test, y_test))
class DDTokenizer:

    def __init__(self, num_words, oov_token='<UNK>'):
        self.tokenizer = Tokenizer(num_words=num_words, oov_token=oov_token,
                                   filters='!"#$%&*+,-./:;<>?\\^_`{|}~\t\n',
                                   char_level=True, lower=False)
        self.has_trained = False
        self.pad_type = 'post'
        self.trunc_type = 'post'
        # The encoded data
        self.word_index = {}

    def fit(self, train_data):
        print("Training Tokenizer...")
        self.tokenizer.fit_on_texts(train_data)
        self.has_trained = True
        print("Done training...")
        # Get our training data word index
        self.word_index = self.tokenizer.word_index

    def encode(self, data, use_padding=True, padding_size=None, normalize=False):
        # Encode training data sentences into sequences
        train_sequences = self.tokenizer.texts_to_sequences(data)
        # Get max training sequence length if there is none passed
        if padding_size is None:
            maxlen = max([len(x) for x in train_sequences])
        else:
            maxlen = padding_size
        if use_padding:
            train_sequences = pad_sequences(train_sequences, padding=self.pad_type,
                                            truncating=self.trunc_type, maxlen=maxlen)
        if normalize:
            train_sequences = np.multiply(1 / len(self.tokenizer.word_index), train_sequences)
        return train_sequences

    def pad(self, data, padding_size=None):
        # Get max sequence length if there is none passed
        if padding_size is None:
            padding_size = max([len(x) for x in data])
        return pad_sequences(data, padding=self.pad_type,
                             truncating=self.trunc_type, maxlen=padding_size)

    def decode(self, array):
        assert self.has_trained, "Train this tokenizer before decoding a string."
        return self.tokenizer.sequences_to_texts(array)

    def test(self, string):
        encoded = list(self.encode(string)[0])
        decoded = self.decode(self.encode(string))
        print("\nEncoding:")
        print("{original} -> {encoded}".format(original=string[0], encoded=encoded))
        print("\nDecoding:")
        print("{original} -> {encoded}".format(original=encoded,
                                               encoded=decoded[0].replace(" ", "")))

    def get_info(self):
        return self.tokenizer.index_word
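# A hedged usage sketch for DDTokenizer with illustrative strings (char_level=True
# means each character gets its own integer id):
dd = DDTokenizer(num_words=1000)
dd.fit(["hello world", "goodbye world"])
print(dd.encode(["hello"], padding_size=16))
dd.test(["hello"])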
from tensorflow.keras.utils import to_categorical
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
"""
Sentences differ in length, so this module pads them to a uniform length.
Batching in a neural network requires every sentence to be the same size.
"""
from tensorflow.keras.preprocessing.text import Tokenizer  # converts text to numbers

text = """경마장에 있는 말이 뛰고 있다\n 그의 말이 법이다\n 가는 말이 고와야 오는 말이 곱다"""

t = Tokenizer()
t.fit_on_texts([text])  # build the corpus, word_to_id, and id_to_word from the text

# build the n-gram training sequences, one prefix per target word
sequences = list()
for line in text.split('\n'):
    encoded = t.texts_to_sequences([line])[0]
    for i in range(1, len(encoded)):
        sequence = encoded[:i + 1]
        sequences.append(sequence)

max_len = max(len(l) for l in sequences)
sequences = np.array(pad_sequences(sequences, maxlen=max_len, padding='pre'))
X = sequences[:, :-1]
y = sequences[:, -1]
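# The to_categorical import above suggests the usual next step for this
# word-level language model: one-hot encoding the target word ids. A minimal
# sketch (+1 because Keras word indices start at 1):
vocab_size = len(t.word_index) + 1
y = to_categorical(y, num_classes=vocab_size)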
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
import tensorflow.keras.utils as ku
import numpy as np
import matplotlib.pyplot as plt

tokenizer = Tokenizer()
data = open('poems.txt', encoding="utf8").read()
corpus = data.lower().split("\n")
tokenizer.fit_on_texts(corpus)
total_words = len(tokenizer.word_index) + 1

input_sequences = []
for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i + 1]
        input_sequences.append(n_gram_sequence)

# pad sequences
max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = np.array(
    pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

# create predictors and label
predictors, label = input_sequences[:, :-1], input_sequences[:, -1]
label = ku.to_categorical(label, num_classes=total_words)

model = Sequential()
return " ".join(tokens) #학습 데이터 전처리 진행 dataset.text = dataset.text.apply(lambda x: preprocess(x)) """학습 데이터 나누기""" train, test = train_test_split(dataset, test_size=1-TRAIN_SIZE, random_state=42) print("TRAIN size:", len(train)) print("TEST size:", len(test)) documents = [_text.split() for _text in train.text] #list, 1280000*50 vocab_size = 400000 tk = Tokenizer(num_words=vocab_size) tk.fit_on_texts(train.text) x_train = tk.texts_to_sequences(train.text) x_test = tk.texts_to_sequences(test.text) labels = train.target.unique().tolist() #POSITIVE NEUTRAL NEGATIVE labels.append(NEUTRAL) print(labels) encoder = LabelEncoder() #문장 -> 숫자 자동으로 encoder.fit(train.target.tolist()) y_train = encoder.transform(train.target.tolist()) y_test = encoder.transform(test.target.tolist()) y_train = y_train.reshape(-1,1) #1열로 자동으로 만들어줍니다. y_test = y_test.reshape(-1,1)
all = x + xtest
len(all)

y = df.iloc[:, 1].values
y.shape
ytest = df.iloc[:, 1].values
ytest.shape
ytrain = pd.DataFrame(y)

# num_words replaces the old Keras 1 nb_words argument
tokenizer = Tokenizer(num_words=10000, split=' ')
tokenizer.fit_on_texts(all)
Xs = tokenizer.texts_to_sequences(all)
Xs = pad_sequences(Xs, maxlen=20, padding='post', truncating='post')
Ys = pd.get_dummies(ytrain).values

Xtrain = Xs[:16000]
Xtest = Xs[16000:]
Ytrain = Ys
Ytest = pd.get_dummies(ytest).values
def train():
    # Pre-processing
    df = pd.read_csv(DATA_PATH)
    df.drop('Unnamed: 0', axis=1, inplace=True)
    df.dropna(axis=0, inplace=True)
    # df.head(10)
    df['Party'] = pd.Categorical(df.Party)
    df['Party'] = pd.get_dummies(df['Party'], drop_first=True)
    X = df['Tweet']
    Y = df['Party']
    x, y = pre_process(X, Y=Y)
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=420)

    # Creating a tokenizer
    t = Tokenizer(oov_token="UNK")
    t.fit_on_texts(x)
    vocab_size = len(t.word_index) + 1
    print("Vocabulary size: {}".format(vocab_size))
    max_sent_len = len(max(x, key=len).split()) + 1
    print("Maximum sentence length: {}".format(max_sent_len))
    emb_dim = 75
    print("Embedding Dimensions: {}".format(emb_dim))

    padded_X_train = encode_and_pad(X_train, t, max_sent_len)
    x_train, x_val, Y_train, y_val = train_test_split(padded_X_train, y_train,
                                                      test_size=0.1, random_state=420)

    checkpoint_path = TRAINING_PATH
    cp_callback = ModelCheckpoint(checkpoint_path, verbose=1,
                                  save_weights_only=True, period=20)

    model = Sequential([
        Embedding(input_dim=vocab_size, output_dim=emb_dim,
                  input_length=max_sent_len, trainable=True),
        Bidirectional(CuDNNLSTM(64, return_sequences=False)),
        Dropout(0.5),
        Dense(2, activation='softmax')
    ])
    model.save_weights(checkpoint_path.format(epoch=0))
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    model.fit(x_train, Y_train, epochs=200, batch_size=300, shuffle=True,
              callbacks=[cp_callback], validation_data=(x_val, y_val))
    model.save_weights(WEIGHTS_PATH)

    dic = {
        'Vocab Size': vocab_size,
        'Max Sent Length': max_sent_len,
        'Emb Dim': emb_dim
    }
    meta_df = pd.DataFrame(dic, index=['Model 1'])
    meta_df.to_csv(META_PATH)

    with open(TOKENIZER_PATH, 'wb') as handle:
        pickle.dump(t, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print("Training done.")
if "all_descriptions_test.zarr" in os.listdir('data') and "embedding_test_128.zarr" in os.listdir('data'): with tf.device('/cpu:0'): embedding_test = da.from_zarr("data/embedding_test_128.zarr") desc_test = da.from_zarr("data/all_descriptions_test.zarr") else: print("Embedding and Descriptions dask array haven't been saved for testing, please run text_preprocessing.py") exit() allText = [] for desc_array in [desc_train.compute(), desc_train.compute(), desc_validation.compute()]: for descs in desc_array: for desc in descs: allText.append(desc) tokenizer = Tokenizer() tokenizer.fit_on_texts(allText) with open('tokenizer.pickle', 'wb') as handle: pkl.dump(tokenizer, handle, protocol=pkl.HIGHEST_PROTOCOL) print("Saved tokenizer file....") vocab_size = len(tokenizer.word_index) + 1 max_length = max(len(d.split()) for d in allText) def count_length(tokenizer, descriptions): try: with tf.device('/gpu:0'): Y = 0 for i in tqdm(range(len(descriptions))): for j in range(len(descriptions[i])): seq = tokenizer.texts_to_sequences([(descriptions[i][j].compute()).tolist()])[0] Y+=len(seq)-1
print("size of dictionary: {0}".format(len(embdict))) del (words) # In[ ]: MAX_NB_WORDS = 50000 MAX_SEQUENCE_LENGTH = 250 EMBEDDING_DIM = 300 # In[ ]: tokenizer = Tokenizer() tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True) tokenizer.fit_on_texts(texts_train + texts_test + texts_ev) word_index = tokenizer.word_index print('Found %s unique tokens.' % len(word_index)) # In[ ]: embedding_matrix = np.zeros((len(word_index), EMBEDDING_DIM)) for word, i in tokenizer.word_index.items(): try: embedding_vector = embdict[word] if embedding_vector is not None: embedding_matrix[i] = embedding_vector except: pass del (embdict)