class OneHot(object):

    def __init__(self, vocab, char_level=False, max_len=100):
        self.char_level = char_level
        self.max_len = max_len
        self.vocab = vocab
        # note: num_words caps the tokenizer's vocabulary size, so passing
        # max_len here only works while the vocabulary fits under that cap
        self.tokenizer = Tokenizer(num_words=self.max_len,
                                   char_level=self.char_level)
        self.tokenizer.fit_on_texts(self.vocab)

    def encode(self, s):
        # shift indices down by one so the one-hot columns start at 0
        s_int = self.tokenizer.texts_to_sequences(s)
        s_int = [[x[0] - 1] for x in s_int]
        s_oh = to_categorical(s_int)
        return s_oh

    def decode(self, arr):
        # undo the shift before mapping indices back to tokens
        s_int = np.argmax(arr, axis=1)
        s_int = [[x + 1] for x in s_int]
        s_list = self.tokenizer.sequences_to_texts(s_int)
        sep = '' if self.char_level else ' '
        return sep.join(s_list)
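A minimal usage sketch for the class above; the vocabulary string and input are hypothetical, and it assumes Tokenizer, to_categorical and numpy are imported as in the snippet.

# hypothetical character-level round trip
vocab = ['abcdefghijklmnopqrstuvwxyz ']
onehot = OneHot(vocab, char_level=True, max_len=100)

encoded = onehot.encode('hello')   # one one-hot row per character
decoded = onehot.decode(encoded)   # 'hello'
print(decoded)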
def embed_flair(texts, max_length=100, max_words=1000):
    # relies on module-level `embeddings_flair` (a flair embedding) and
    # `embedding_features` (its dimensionality) being defined elsewhere
    tokenizer = Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    texts = tokenizer.sequences_to_texts(sequences)

    sentence_embeddings = []
    padding = np.zeros(embedding_features)
    count = 0
    step = 3
    total = len(texts)
    for text in texts:
        sentence_embedding = []
        paddings = []
        sentence = Sentence(text)
        embeddings_flair.embed(sentence)
        for token in sentence:
            sentence_embedding.append(token.embedding.cpu().numpy())
        # left-pad short sentences with zero vectors, truncate long ones
        for i in range(max_length - len(sentence_embedding)):
            paddings.append(padding)
        if len(paddings) > 0:
            sentence_embedding = np.concatenate([paddings, sentence_embedding], axis=0)
        else:
            sentence_embedding = np.array(sentence_embedding[:max_length])
        count += 1
        if 100 * count / total > step:
            print(str(step) + '%')
            step += 3
        sentence_embeddings.append(sentence_embedding)
    return np.array(sentence_embeddings)
def _train_model(self, model: Word2Vec, texts):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(texts)
    texts_seq = tokenizer.sequences_to_texts(
        tokenizer.texts_to_sequences(texts))
    texts_seq = [f.split(" ") for f in texts_seq]
    print("Adding to word2vec vocabulary...")
    model.min_count = 2
    model.build_vocab(texts_seq, update=True)
    print("Training word2vec ...")
    model.train(texts_seq, total_examples=len(texts_seq), epochs=model.epochs)
def prepareData(self, List):
    allText = ""
    for episode in List:
        allText = allText + " " + self.clean(episode)
    allTextTokenized = word_tokenize(allText)

    tokenizer = Tokenizer(filters="")
    tokenizer.fit_on_texts(allTextTokenized)
    # turn words into sequences
    s = tokenizer.texts_to_sequences(allTextTokenized)
    self.word_idx = tokenizer.word_index
    self.idx_word = tokenizer.index_word
    self.num_words = len(self.word_idx) + 1

    t = tokenizer.sequences_to_texts(s)
    w = [''.join(i) for i in t]
    wseq = []
    for a in w:
        if a != '':
            wseq.append(self.word_idx[a])

    trainingdata = []
    a = 0
    while (a + 60) < len(wseq):
        x = []
        for i in range(a, a + 60):
            x.append(wseq[i])
        y = wseq[a + 60]
        temp = [x, y]
        trainingdata.append(temp)
        a = a + 1
    print("number of training pairs", len(trainingdata))
    print('number of words :', self.num_words)

    self.dataX = []
    self.dataY = []
    for e in trainingdata:
        self.dataX.append(e[0])
        self.dataY.append(e[1])
    # reshape X to be [samples, time steps, features]
    self.X = np.reshape(self.dataX, (len(trainingdata), 60, 1))
    # normalize
    self.X = self.X / float(self.num_words)
    # one hot encode the output variable
    self.y = np_utils.to_categorical(self.dataY)
class OneHot:

    def __init__(self, docs_tokens, max_doc_len):
        self.tokenizer = Tokenizer(split=None, lower=False, filters=None,
                                   oov_token=True)
        self.tokenizer.fit_on_texts(docs_tokens)
        # make '<<UNKNOWN>>' the out-of-vocabulary token at index 1
        unknown_index = self.tokenizer.word_index.get('<<UNKNOWN>>', None)
        if unknown_index is not None:
            self.tokenizer.word_index.pop('<<UNKNOWN>>', None)
        self.tokenizer.word_index.pop(True, None)
        self.tokenizer.word_index.update({'<<UNKNOWN>>': 1})
        if unknown_index is not None:
            # close the gap left by the removed duplicate entry
            for word, id in self.tokenizer.word_index.items():
                if id > unknown_index:
                    self.tokenizer.word_index[word] = id - 1
        self.tokenizer.oov_token = '<<UNKNOWN>>'
        self.max_doc_len = max_doc_len
        self.max_word_num = self.tokenizer.num_words

    def get_sequence(self, tokens):
        return pad_sequences(self.tokenizer.texts_to_sequences([tokens]),
                             maxlen=self.max_doc_len)[0]

    def get_docs_sequences(self, docs_tokens):
        return pad_sequences(self.tokenizer.texts_to_sequences(docs_tokens),
                             maxlen=self.max_doc_len)

    def get_tokens(self, sequence):
        return self.tokenizer.sequences_to_texts([sequence])[0]

    def get_docs_tokens(self, docs_sequences):
        return self.tokenizer.sequences_to_texts(docs_sequences)

    def get_word_indexes(self):
        return self.tokenizer.word_index
class DataHelper(object):

    def __init__(self, max_len=100, max_num_words=100000):
        self.max_len = max_len
        self.max_num_words = max_num_words
        self.tokenizer = Tokenizer(num_words=max_num_words)
        self.label_columns = [
            'toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
            'identity_hate'
        ]
        self.text_column = 'comment_text'

    def _get_examples(self, data_path, fit_tokenizer):
        data = pd.read_csv(data_path, encoding="utf-8")
        labels = data[self.label_columns].values
        comments = data[self.text_column].values
        if fit_tokenizer:
            self.tokenizer.fit_on_texts(comments)
        comments = self.tokenizer.texts_to_sequences(comments)
        comments = pad_sequences(comments, self.max_len)
        return comments, labels

    def sequences_to_texts(self, sequences):
        return self.tokenizer.sequences_to_texts(sequences)

    def get_data_loader(self, data_path, batch_size, shuffle=True,
                        fit_tokenizer=True, device='cuda:0'):
        comments, labels = self._get_examples(data_path, fit_tokenizer)
        comments, labels = torch.as_tensor(comments).long(), torch.as_tensor(
            labels).float()
        comments = comments.to(device)
        labels = labels.to(device)
        dataset = TensorDataset(comments, labels)
        data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
        return data_loader
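A hedged usage sketch for DataHelper; the CSV paths and batch size are hypothetical, and it assumes the files contain the toxic-comment columns listed above.

# hypothetical training / validation loaders built with the helper above
helper = DataHelper(max_len=100, max_num_words=100000)
train_loader = helper.get_data_loader('train.csv', batch_size=64,
                                      fit_tokenizer=True, device='cpu')
valid_loader = helper.get_data_loader('valid.csv', batch_size=64,
                                      shuffle=False, fit_tokenizer=False,
                                      device='cpu')

# recover readable text from a batch for inspection
comments, labels = next(iter(train_loader))
print(helper.sequences_to_texts(comments.cpu().numpy()[:2]))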
class Model:

    def __init__(self, questions: List[str]):
        self.sliding_token_size = 15
        self.path = None
        self.model = None
        self.tokens = re.sub(r'[^a-zA-Z\s]', r' \g<0> ',
                             ' newline '.join(questions)).split()
        self.tokenizer = Tokenizer(filters='')
        self.tokenizer.fit_on_texts(self.tokens)
        self.x = numpy.zeros((len(self.tokens) - self.sliding_token_size,
                              self.sliding_token_size))
        self.y = numpy.zeros((len(self.tokens) - self.sliding_token_size, 1))
        sequences = self.tokenizer.texts_to_sequences(self.tokens)
        for token_n in range(len(self.tokens) - self.sliding_token_size):
            for sliding_token_n in range(self.sliding_token_size):
                self.x[token_n][sliding_token_n] = sequences[
                    token_n + sliding_token_n][0]
            self.y[token_n] = sequences[token_n + self.sliding_token_size][0]

    def save(self):
        pickle.dump(self.model, open(self.path + 'model.bin', 'wb'))
        pickle.dump(self.tokens, open(self.path + 'tokens.bin', 'wb'))
        pickle.dump(self.tokenizer, open(self.path + 'tokenizer.bin', 'wb'))

    def load(self):
        self.model = pickle.load(open(self.path + 'model.bin', 'rb'))
        self.tokens = pickle.load(open(self.path + 'tokens.bin', 'rb'))
        self.tokenizer = pickle.load(open(self.path + 'tokenizer.bin', 'rb'))

    def generate(self, n):
        questions = random.choice(self.tokens)
        while questions.count(' newline ') <= n:
            sequences = pad_sequences(
                [self.tokenizer.texts_to_sequences([questions])[0]],
                self.sliding_token_size)
            questions += ' ' + self.tokenizer.sequences_to_texts(
                [self.model.predict_classes(sequences)])[0]
        return re.sub(r'(?<=[\d\`\'])\s(?=[\d\`\'])', '',
                      questions.replace(' newline ', '\n')
                               .replace(' ?', '?')).split('\n')[1:-1]
# tokenize the sentences
tokenizer = Tokenizer(lower=False)
tokenizer.fit_on_texts(train_text_vec)
train_text_vec = tokenizer.texts_to_sequences(train_text_vec)
test_text_vec = tokenizer.texts_to_sequences(test_text_vec)

# pad the sequences
train_text_vec = pad_sequences(train_text_vec, maxlen=MAX_SEQ_LEN)
test_text_vec = pad_sequences(test_text_vec, maxlen=MAX_SEQ_LEN)

print('Number of Tokens:', len(tokenizer.word_index))
print("Max Token Index:", train_text_vec.max(), "\n")
print('Sample Tweet Before Processing:', train["text"].values[0])
print('Sample Tweet After Processing:',
      tokenizer.sequences_to_texts([train_text_vec[0]]), '\n')
print('What the model will interpret:', train_text_vec[0].tolist())

# One Hot Encode Y values:
encoder = LabelEncoder()
y_train = encoder.fit_transform(train['sentiment'].values)
y_train = to_categorical(y_train)
y_test = encoder.fit_transform(test['sentiment'].values)
y_test = to_categorical(y_test)

# get an idea of the distribution of the text values
from collections import Counter
ctr = Counter(train['sentiment'].values)
# y_train.shape, y_test.shape: (8982,) (2246,)
print("First training news article: \n", x_train[0])  # prints only the index numbers, as a list
print("Label of the first training news article: \n", y_train[0])  # prints only the label index

# check which words the index numbers in x_train stand for
word_index = reuters.get_word_index()
print("word_index of the x data: \n", word_index)  # dictionary mapping each word to its index

# convert the indices back to words
from keras.preprocessing.text import Tokenizer
token = Tokenizer()
token.fit_on_texts(reuters.get_word_index())  # fit_on_texts works in place and returns None
word = token.sequences_to_texts(x_train[0:1])
print("First words of x_train: \n", word)

# want to check the shape of x_train?
# but it is a plain Python list, so it has no shape attribute
print(len(x_train[0]))  # 87

# print the number of categories in y
category = np.max(y_train) + 1
print("Number of labels in the y data: ", category)  # 46

# print the unique values of y
y_bunpo = np.unique(y_train)
print("Distribution of the y data: \n", y_bunpo)  # 0~45
# Distribution of the y data:
# [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
def model_word2vec(suffix=""): for round in range(0, 2): rtest = xlrd.open_workbook(filename="切割" + suffix + "test.xls") rtrain = xlrd.open_workbook(filename="切割" + suffix + "train.xls") r_vocall1 = xlrd.open_workbook(filename="pre处理" + suffix + "test.xls") r_vocall2 = xlrd.open_workbook(filename="pre处理" + suffix + "train.xls") sheet_test = rtest.sheet_by_index(0) sheet_train = rtrain.sheet_by_index(0) sheet1_vocall = r_vocall1.sheet_by_index(0) sheet2_vocall = r_vocall2.sheet_by_index(0) invocal1 = sheet1_vocall.col_values(4) invocal2 = sheet2_vocall.col_values(4) for i in range(0, len(invocal1)): if len(invocal1[i]) == 0: invocall = invocal1[:i] print("1") break for i in range(0, len(invocal2)): if len(invocal2[i]) == 0: print("1") invocal2 = invocal2[:i] break for i in invocal2: if i not in invocall: invocall.append(i) print(len(invocall)) vocall_size = len(invocall) if round == 1: ex_tag = sheet_test.col_values(6) xtrain = sheet_train.col_values(2 + round * 3) ztrain = sheet_train.col_values(0 + round * 3) ytrain = sheet_train.col_values(1 + round * 3) xtest = sheet_test.col_values(2 + round * 3) ztest = sheet_test.col_values(0 + round * 3) ytest = sheet_test.col_values(1 + round * 3) for i in range(0, len(xtrain)): if len(xtrain[i]) == 0: xtrain = xtrain[:i] ztrain = ztrain[:i] ytrain = ytrain[:i] break for i in range(0, len(xtest)): if len(xtest[i]) == 0: xtest = xtest[:i] ytest = ytest[:i] ztest = ztest[:i] break print(round * 3) print(len(xtrain), "xtrain") print(len(ytrain), "ytrain") print(len(xtest), "xtest") print(len(ytest), "ytest") if round == 1: other = sheet_train.cell(0, 13).value other = int(other) print(other) if other == 1: xtrain = xtrain + sheet_train.col_values(9) ytrain = ytrain + sheet_train.col_values(8) ztrain = ztrain + sheet_train.col_values(7) tokenizer = Tokenizer(num_words=vocall_size) tokenizer.fit_on_texts(invocall) xtrain = tokenizer.texts_to_sequences(xtrain) xtest = tokenizer.texts_to_sequences(xtest) maxlen = 0 for i in xtrain: if len(i) > maxlen: maxlen = len(i) for i in xtest: if len(i) > maxlen: maxlen = len(i) print(maxlen, "maxlen") xtrain = pad_sequences(xtrain, padding='post', maxlen=maxlen) xtest = pad_sequences(xtest, padding='post', maxlen=maxlen) print(len(ytrain), len(xtrain)) print(len(ytest), len(xtest)) for i in range(0, len(ytrain)): ytrain[i] = int(ytrain[i]) for i in range(0, len(ytest)): ytest[i] = int(ytest[i]) modelw2v = gensim.models.Word2Vec.load("word2vec_150_lstm.model") embedding_matrix = np.zeros(shape=(vocall_size + 1, 150)) for word, i in invocall(): embedding_vector = modelw2v[word] if embedding_vector is not None: embedding_matrix[i] = embedding_vector embedding_size = 150 hidden_layer_size = 64 batch_size = 128 num_epochs = 3 model = Sequential() model.add( Embedding(vocall_size + 1, embedding_size, trainable=False, weights=[embedding_matrix], input_length=maxlen)) model.add(SpatialDropout1D(0.2)) model.add(Attention()) model.add(LSTM(hidden_layer_size, dropout=0.2, recurrent_dropout=0.2)) model.add(Dense(1)) model.add(Activation("sigmoid")) model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy']) model.summary() history = model.fit(xtrain, ytrain, epochs=7 + round * 5, batch_size=64) loss, accuracy = model.evaluate(xtest, ytest) print(loss, accuracy) """ plt.subplot(211) plt.title("Accuracy"+suffix) plt.plot(history.history['acc'],color="g",label="Train") plt.legend(loc="best") plt.subplot(212) plt.title("Loss") plt.plot(history.history['loss'],color="g",label="Train") plt.legend(loc="best") 
plt.tight_layout() plt.show() """ w = xlwt.Workbook() sheet2 = w.add_sheet("准备文件", cell_overwrite_ok=True) sheet2.write(0, 0, "predict") sheet2.write(0, 1, "ytest") sheet2.write(0, 2, "xtest") sheet2.write(0, 3, "ex_tag") sheet2.write(0, 4, "loss") sheet2.write(1, 4, loss) sheet2.write(0, 5, "acc") sheet2.write(1, 5, accuracy) ypred = model.predict_classes(xtest, 1) xtest = tokenizer.sequences_to_texts(xtest) for index in range(0, len(ypred)): sheet2.write(1 + index, 0, int(ypred[index][0])) sheet2.write(1 + index, 1, ytest[index]) sheet2.write(1 + index, 2, xtest[index]) if round == 1: sheet2.write(1 + index, 3, ex_tag[index]) if round == 1: ac_sum = [] for i in range(0, len(ypred)): ac = 0 count = 1 ac = ac + int(ypred[i][0]) for i2 in range(i, len(ypred)): if i != len(ypred) - 1 and ex_tag[i] == ex_tag[i + 1]: i = i + 1 ac = ac + int(ypred[i][0]) count = count + 1 else: acca = ac / count if acca >= 0.5: ac_sum.append("1") else: ac_sum.append("0") break right = 0 refer_tag = sheet_test.col_values(1) for i in range(0, len(ac_sum)): if refer_tag[i] == ac_sum[i]: right += 1 acr = right / len(ac_sum) sheet2.write(0, 7, "acc") sheet2.write(1, 7, acr) if round == 0: w.save("result切割" + suffix + "w2v.xls") else: w.save("result扩充" + suffix + "w2v.xls")
model.fit(train_x, to_categorical(train_y),
          epochs=1,
          verbose=1,
          shuffle=True,
          validation_split=0.2)

# evaluate the model on separate validation data
loss, accuracy = model.evaluate(valid_x, to_categorical(valid_y), verbose=1)
print('Accuracy: %f' % (accuracy * 100), loss)

############################################################
############################################################

# print the precision and recall numbers
predicted = model.predict(valid_x)
predicted = np.argmax(predicted, axis=1)

# get the actual labels
predicted_cats = cats.inverse_transform(predicted)
valid_y_cats = cats.inverse_transform(valid_y)
print(predicted_cats)
print(classification_report(valid_y_cats, predicted_cats))

pred_df = pd.DataFrame()
pred_df['valid_x__post_cleaned'] = t.sequences_to_texts(valid_x)
pred_df['actual_label'] = valid_y_cats
pred_df['predictions'] = predicted_cats
pred_df.to_csv('predictions.csv')
# print(pred_df)  # if you want to see the classes next to the cleaned doc text
input()
def get_sequence_tokens(text):
    tokenizer.fit_on_texts(text)
    total_words = len(tokenizer.word_index) + 1
    input_sequences = []
    for line in text:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i + 1]
            input_sequences.append(n_gram_sequence)
    return input_sequences, total_words


text_sequences, total_words = get_sequence_tokens(clean_lines[:1000])
print(text_sequences[:5])
print(tokenizer.sequences_to_texts(text_sequences[:5]))
print(f"Length of sequences array {len(text_sequences)}")

"""As you can see from the example above, each sentence is converted into a series
of n-gram sequences. When feeding data to the network the number of inputs must be
constant, but every sentence has a different number of words. To adjust for this
varying length we pad the sequences (pre or post). The maximum sequence size is
defined by the longest sequence in input_sequences.
"""

def generate_padded_sequences(input_sequences):
    max_sequence_length = max([len(x) for x in input_sequences])
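The definition above is cut off; here is a hedged sketch of how generate_padded_sequences typically continues in this kind of n-gram language-model tutorial. The split into predictors and a one-hot label is an assumption, not shown in the excerpt, and it relies on numpy, pad_sequences, to_categorical and the total_words value computed above.

def generate_padded_sequences(input_sequences):
    # assumed continuation: pre-pad to the longest sequence, then split off
    # the last token of each sequence as the label
    max_sequence_length = max([len(x) for x in input_sequences])
    input_sequences = np.array(pad_sequences(input_sequences,
                                             maxlen=max_sequence_length,
                                             padding='pre'))
    predictors, label = input_sequences[:, :-1], input_sequences[:, -1]
    label = to_categorical(label, num_classes=total_words)
    return predictors, label, max_sequence_length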
# callbacks=[tensorboard, model_checkpoint], verbose=1)

# get the loss and metrics
result = model.evaluate(X_test, y_test)
# extract those
loss = result[0]
accuracy = result[1]
precision = result[2]
recall = result[3]

# Check for prediction result
predVal = model.predict(X_test)

# Convert question from number to text
X_test_N = tokenizer.sequences_to_texts(X_test)
# the list must be wrapped in a DataFrame before columns can be assigned to it
X_test_N = pd.DataFrame(X_test_N)
Pred_One = pd.DataFrame(data=predVal, columns=["zero", "One"])
X_test_N["OneProb"] = Pred_One["One"]
X_test_N["ZeroProb"] = Pred_One["zero"]

# total spams predicted
X_test_N[X_test_N["OneProb"] >= 0.50]
X_test_N.shape
import unittest

from keras.preprocessing.text import Tokenizer
import numpy as np

char_tokenizer = Tokenizer(char_level=True)
text = 'The quick brown fox jumped over the lazy dog'
char_tokenizer.fit_on_texts(text)

seq = char_tokenizer.texts_to_sequences(text)
print(seq)
char_tokenizer.sequences_to_texts(seq)

char_vectors = char_tokenizer.texts_to_matrix(text)
print(char_vectors)
print(char_vectors.shape)
print(char_vectors[0])
np.argmax(char_vectors[0])

print(char_tokenizer.index_word)
print(char_tokenizer.word_index)
print(char_tokenizer.index_word[np.argmax(char_vectors[0])])

# ### Adding this method below just for test case and is not present in the exercise
def get_one_hot_vector(text_, word_index):
    char_tokenizer.fit_on_texts(text_)
class NERSlotFiller(object): """NER Slot Classifier.""" def __init__(self, maxlen=50, vocab_size=10000): """Init.""" self.ner = None self.maxlen = maxlen self.vocab_size = vocab_size def fit(self, sentence_result, slot_result): """Fit model.""" self.tokenizer = Tokenizer(num_words=self.vocab_size, char_level=True, lower=False) self.tokenizer.fit_on_texts(sentence_result) seq = self.tokenizer.texts_to_sequences(sentence_result) seq_pad = pad_sequences(seq, maxlen=self.maxlen) self.tokenizer_y = Tokenizer(num_words=self.vocab_size, char_level=True, lower=False) self.tokenizer_y.fit_on_texts(slot_result) seq_y = self.tokenizer_y.texts_to_sequences(slot_result) seq_pad_y = pad_sequences(seq_y, maxlen=self.maxlen) self.ner = get_model(len(self.tokenizer_y.word_index) + 1) self.ner.model.fit(seq_pad, to_categorical(seq_pad_y), epochs=5) def predict_slot(self, nlu_obj): """Predict Slot.""" tokens = nlu_obj['tokens'] ret = self.predict([tokens]) ner_ret = get_slots_detail(tokens, ret[0][-len(tokens):]) nlu_obj['ner_slot_filler'] = {'slots': ner_ret} for slot in ner_ret: slot['from'] = 'ner_slot_filler' if len(nlu_obj['slots']) <= 0: nlu_obj['slots'] = ner_ret else: for slot in ner_ret: is_include = False for s in nlu_obj['slots']: if slot['pos'][0] >= s['pos'][0] and slot['pos'][0] <= s[ 'pos'][1]: is_include = True break elif slot['pos'][1] >= s['pos'][0] and slot['pos'][1] <= s[ 'pos'][1]: is_include = True break elif s['pos'][0] >= slot['pos'][0] and s['pos'][0] <= slot[ 'pos'][1]: is_include = True break elif s['pos'][1] >= slot['pos'][0] and s['pos'][1] <= slot[ 'pos'][1]: is_include = True break if not is_include: nlu_obj['slots'].append(slot) nlu_obj['slots'] = sorted(nlu_obj['slots'], key=lambda x: x['pos'][0]) return nlu_obj def predict(self, sentence_result): """Predict sentence.""" assert self.ner is not None, 'model not fitted' seq = self.tokenizer.texts_to_sequences(sentence_result) seq_pad = pad_sequences(seq, maxlen=self.maxlen) y_pred = self.ner.predict_proba(seq_pad).argmax(-1) y_pred = self.tokenizer_y.sequences_to_texts(y_pred) y_pred = tuple( [y.split(' ')[-len(s):] for s, y in zip(sentence_result, y_pred)]) return y_pred def eval(self, sentence_result, slot_result): """Evaluate.""" y_pred = self.predict(sentence_result) y_test = slot_result acc = 0 bad = [] for sent, real, pred in zip(sentence_result, y_test, y_pred): real_slot = get_slots(sent, real) pred_slot = get_slots(sent, pred) a = get_exact_right(real_slot, pred_slot) acc += a if not a: bad.append((sent, real, pred, real_slot, pred_slot)) acc /= len(sentence_result) return acc, bad
try:
    model.load_weights(Model_File)
except:
    print("model error")
    model.fit(x_train, y_train, epochs=100, verbose=2)
    # generate 50 tokens by repeatedly feeding the model its own argmax prediction
    test = x_train[:1]
    result = []
    i = 0
    while True:
        char = model.predict(test)
        result.append([float(np.argmax(char))])
        test = [np.append(test[:, 1:], np.argmax(char))]
        test = np.array(test)
        i += 1
        if i > 50:
            break
    ttt = tokenizer.sequences_to_texts(result)
    print(ttt)
else:
    model.fit(x_train, y_train, epochs=100, verbose=2)
    model.save_weights(Model_File)

# for diversity in [1.0]:
#     generated = ''
#     sentence = ['문재인']
#     for i in range(500):
#         preds = model.predict(x, verbose=0)[0]
#         next_index = sample(preds, diversity)
#         next_char = indices_char[next_index]
#         generated += next_char
#         sentence = sentence[1:] + next_char
class NeuralClassifier: """ """ def __init__(self): """Initializes a neural classifier's attributes """ # a list of tuples of (type, data_clean, true_label) self.labelled_data = [] self.labelled_validation_data = [] self.model = None self.tokenizer = None self.labels = [] self.label_encoder = None #force def pickle(self, fname, keep_data=False): """Pickles this classifier Parameters ---------- fname : a file name keep_data : if test/validation data should be kept (will increase size of file) """ with open(fname, 'w') as f: if keep_data: pickle.dump(self, f) else: temp_l_data = self.labelled_data temp_v_data = self.labelled_validation_data self.labelled_data = [] self.labelled_validation_data = [] pickle.dump(self, f) self.labelled_data = temp_l_data self.labelled_validation_data = temp_v_data def to_pred(self, pred): """ Parameters ---------- pred : array_like A real vector st len(pred) == len(self.labels) Returns ------- str The label string at the index of the first maximal value of pred """ maxi = 0 for i in range(1, len(pred)): if pred[i] > maxi: maxi = i return self.labels[maxi] def to_pred_comparison(self, pred): """ Parameters ---------- pred : array_like A real vector st len(pred) == len(self.labels) Returns ------- array_like An array of tuples of (labels, prediction_prob) for each value in pred, in descending order by probability """ probs = [(self.labels[i], pred[i]) for i in range(len(pred))] probs.sort(key=lambda x: x[1], reverse=True) return probs def add_data(self, file_id: str, data: str, true_label): """Adds the given data point to this model's data Parameters ---------- file_id : str an id for the the file this data point is drawn from data : str true_label The true label for this daa point """ # CURRENTLY NOT TAKING IN PRE-TOKENIZED FILE, DISCUSS WITH TEAM ABOUT ALTERING CLASSIFIER INTERFACE if true_label not in self.labels: self.labels.append(true_label) self.labelled_data.append((file_id, data, true_label)) def add_validation_data(self, file_id: str, data: str, true_label: int): """Adds the given data point to this model's validation data Parameters ---------- file_id : str an id for the the file this data point is drawn from data : str true_label The true label for this daa point """ if true_label not in self.labels: self.labels.append(true_label) self.labelled_validation_data.append((file_id, data, true_label)) def train(self, max_number_tokens=neural_constants.MAX_NUMBER_TOKENS, slice_length=neural_constants.SLICE_LENGTH, slice_overlap=neural_constants.SLICE_OVERLAP, glove_file=neural_constants.GLOVE_FILE, glove_dimensions=neural_constants.GLOVE_DIMENSIONS, diagnostic_printing=False, num_epochs=10, batch_size=5): """ Parameters ---------- max_number_tokens : int, optional The maximum number of distinct tokens allowed by the tokenizer. With more data, this value should increase slice_length : int, optional The length of the subslices sent that are sent through the model. 
With more data, this value should increase This value should probably not be greater than half the length of a typical document slice_overlap : float, optional The percent of each slice that is overlapped with its neigbors This value should be in the range [0,1), but probably not above .2 glove_file : str, optional The .txt file containing the glove embeddings to use for this classifier glove_dimensions : str, optional The number of dimensions of the given glove_file diagnostic_printing : bool, optional True to run output some statistics on all validation data num_epochs : int, optional The number of epochs to train the model for. Determined experimentally batch_size : int, optional The batch size to use when training the model Determined experimentally """ has_validation = len(self.labelled_validation_data) > 0 # create the tokenizer self.tokenizer = Tokenizer(num_words=max_number_tokens) training_data = [text for _, text, _ in self.labelled_data] self.tokenizer.fit_on_texts(training_data) self.label_encoder = LabelEncoder() self.label_encoder.fit(self.labels) self.label_encoder = LabelEncoder() self.label_encoder.fit(self.labels) # now build our training data_clean X_train = self.tokenizer.texts_to_sequences(training_data) if has_validation: X_validation = self.tokenizer.texts_to_sequences( [text for _, text, _ in self.labelled_validation_data]) X_train, y_train_labels = data_slicer.slice_data( X_train, [y for _, _, y in self.labelled_data], slice_length=slice_length, overlap_percent=slice_overlap) if has_validation: X_validation, y_validation_labels = data_slicer.slice_data( X_validation, [y for _, _, y in self.labelled_validation_data], slice_length=slice_length, overlap_percent=slice_overlap) # convert labels to 1-hots self.label_encoder = LabelEncoder() self.label_encoder.fit(self.labels) y_train = np_utils.to_categorical( self.label_encoder.transform(y_train_labels)) if has_validation: y_validation = np_utils.to_categorical( self.label_encoder.transform(y_validation_labels)) # pad them as necessary if has_validation: X_validation = np.array( pad_sequences(X_validation, padding="post", maxlen=slice_length)) X_train = pad_sequences(X_train, padding="post", maxlen=slice_length) # force change # get our glove embeddings glove = load_glove(glove_file, self.tokenizer.word_index, glove_dimensions) # compute some neural_constants vocab_size = len(self.tokenizer.word_index) + 1 # set model parameters self.model = Sequential() model_layers = [ # must have these two layers firsts layers.Embedding(vocab_size, glove_dimensions, weights=[glove], input_length=slice_length, trainable=False), # now we have some options # as more data becomes available, a more optimal sequence of inner layers # may be discoverable layers.GlobalMaxPool1D(), layers.Dense(45, activation="relu"), layers.Dense(20, activation="sigmoid"), # final layer for the output probability distribution layers.Dense(len(self.labels), activation="softmax") ] # add them in for layer in model_layers: self.model.add(layer) self.model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"]) """ print(np.shape(X_train)) print(np.shape(y_train)) print(np.shape(X_validation)) print(np.shape(y_validation)) """ #X_train, y_train = shuffle_parallel_arrays(X_train, y_train) # now we fit (can take a while) if has_validation: self.model.fit(X_train, y_train, epochs=num_epochs, verbose=False, shuffle=True, validation_data=(X_validation, y_validation), batch_size=batch_size) else: self.model.fit(X_train, y_train, 
epochs=num_epochs, verbose=False, shuffle=True, batch_size=batch_size) if diagnostic_printing and has_validation: def cm(true, pred): m = confusion_matrix(true, pred) print("Confusion matrix") print(" {0:3s} {1:3s}".format("P+", "P-")) print("T+ {0:<3d} {1:<3d}".format(m[1][1], m[0][1])) print("T- {0:<3d} {1:<3d}".format(m[1][0], m[0][0])) y_train_pred = [ x for x in list(self.model.predict(X_train, verbose=False)) ] y_validation_pred = [ x for x in list(self.model.predict(X_validation, verbose=False)) ] loss, acc = self.model.evaluate(X_train, y_train, verbose=False) print("Train L/A asd: {0:.4f} {1:.4f}".format(loss, acc)) # cm(y_train, y_train_pred) loss, acc = self.model.evaluate(X_validation, y_validation, verbose=False) print("Validation L/A: {0:.4f} {1:.4f}".format(loss, acc)) #cm(y_validation, y_validation_pred) nc = 0 for i in range(len(X_validation)): print(y_validation_labels[i], self.to_pred(y_validation_pred[i]), y_validation_pred[i]) if y_validation_labels[i] == self.to_pred( y_validation_pred[i]): nc += 1 print("acc:", nc / len(y_validation_labels)) def predict(self, str, slice_length=neural_constants.SLICE_LENGTH, slice_overlap=neural_constants.SLICE_OVERLAP): """ Parameters ---------- str : str a string of text to predict slice_length : int, optional the slice length to use. Should match the model's slice length slice_overlap : float, optional The percent of each slice that is overlapped with its neigbors This value should be in the range [0,1), but probably not above .2 Returns ------- distribution: array_like The probability distribution s.t. distribution[i] == P(label of str == self.labels[i]) Where len(distribution) == len(self.labels) And sum(distribution) == 1 And for all i distribution[i] >= 0 """ tokenized = self.tokenizer.texts_to_sequences([str]) slices, _ = data_slicer.slice_data(tokenized, None, slice_length=slice_length, overlap_percent=slice_overlap) #print(slices) X = np.array(pad_sequences(slices, padding="post", maxlen=slice_length)) #print(X) predictions = [x for x in list(self.model.predict(X, verbose=False))] s = predictions[0] for p in predictions[1:]: for i in range(len(s)): s[i] += p[i] return self.to_pred_comparison([x / sum(s) for x in s]) def slice_and_predict(self, str, slice_length=neural_constants.SLICE_LENGTH, slice_overlap=neural_constants.SLICE_OVERLAP): """Slices and predicts the input string for each slice Parameters ---------- str : str a string of text to predict slice_length : int, optional the slice length to use. Should match the model's slice length slice_overlap : float, optional The percent of each slice that is overlapped with its neigbors This value should be in the range [0,1), but probably not above .2 Returns ------- distribution: array_like The probability distribution s.t. distribution[i] == P(label of str == self.labels[i]) Where len(distribution) == len(self.labels) And sum(distribution) == 1 And for all i distribution[i] >= 0 """ tokenized = self.tokenizer.texts_to_sequences([str]) slices, _ = data_slicer.slice_data(tokenized, None, slice_length=slice_length, overlap_percent=slice_overlap) restored = self.tokenizer.sequences_to_texts(slices) #print(slices) X = np.array(pad_sequences(slices, padding="post", maxlen=slice_length)) #print(X) predictions = [x for x in list(self.model.predict(X, verbose=False))] return [(self.to_pred(predictions[i]), restored[i]) for i in range(len(slices))]
from keras.layers import Dense, Embedding, LSTM, Flatten, Conv1D, MaxPool1D, BatchNormalization, Dropout
from keras.models import Sequential
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.datasets import imdb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

(x_train, y_train), (x_test, y_test) = imdb.load_data(maxlen=2000)

words = imdb.get_word_index()
# print(words)

token = Tokenizer()
token.fit_on_texts(words)
a = token.sequences_to_texts(x_train)
print(a)

# print(x_train.shape, x_test.shape)   # (8982,) (2246,)
# print(y_train.shape, y_test.shape)   # (8982,) (2246,)
# print(x_train[0])   # (25000,) (24996,)
# print(y_train[0])   # (25000,) (24996,)
# print(len(x_train[0]))   # 218

# map the integer indices of the first review back to words
index_to_word = {index: word for word, index in words.items()}
test = []
for x in x_train[0]:
    test.append(index_to_word.get(x))
print(test)

category = np.max(y_train) + 1
# print("number of categories:", category)   # category: 2
class ConceptTokenizer:
    unused_token = ['[UNUSED]']
    mask_token = ['[MASK]']

    def __init__(self, special_tokens: Optional[Sequence[str]] = None,
                 oov_token='0'):
        self.special_tokens = special_tokens
        self.tokenizer = Tokenizer(oov_token=oov_token, filters='', lower=False)

    def fit_on_concept_sequences(self, concept_sequences):
        self.tokenizer.fit_on_texts(concept_sequences)
        self.tokenizer.fit_on_texts(self.mask_token)
        self.tokenizer.fit_on_texts(self.unused_token)
        if self.special_tokens is not None:
            self.tokenizer.fit_on_texts(self.special_tokens)

    def encode(self, concept_sequences):
        return self.tokenizer.texts_to_sequences(concept_sequences)

    def decode(self, concept_sequence_token_ids):
        return self.tokenizer.sequences_to_texts(concept_sequence_token_ids)

    def get_all_token_indexes(self):
        all_keys = set(self.tokenizer.index_word.keys())
        if self.tokenizer.oov_token is not None:
            all_keys.remove(
                self.tokenizer.word_index[self.tokenizer.oov_token])
        if self.special_tokens is not None:
            excluded = set([
                self.tokenizer.word_index[special_token]
                for special_token in self.special_tokens
            ])
            all_keys = all_keys - excluded
        return all_keys

    def get_first_token_index(self):
        return min(self.get_all_token_indexes())

    def get_last_token_index(self):
        return max(self.get_all_token_indexes())

    def get_vocab_size(self):
        # + 1 because oov_token takes the index 0
        return len(self.tokenizer.index_word) + 1

    def get_unused_token_id(self):
        unused_token_id = self.encode(self.unused_token)
        while isinstance(unused_token_id, list):
            unused_token_id = unused_token_id[0]
        return unused_token_id

    def get_mask_token_id(self):
        mask_token_id = self.encode(self.mask_token)
        while isinstance(mask_token_id, list):
            mask_token_id = mask_token_id[0]
        return mask_token_id
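A hedged usage sketch for ConceptTokenizer; the concept sequences below are hypothetical, and it assumes Optional/Sequence come from typing and Tokenizer from Keras as in the class above.

# hypothetical concept-code sequences, already split into string tokens
concept_sequences = [['c101', 'c204', 'c101'], ['c301', 'c204']]

ct = ConceptTokenizer(special_tokens=['[CLS]'])
ct.fit_on_concept_sequences(concept_sequences)

ids = ct.encode(concept_sequences)   # lists of integer token ids
print(ct.decode(ids))                # back to space-joined concept strings
print(ct.get_mask_token_id(), ct.get_unused_token_id(), ct.get_vocab_size())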
seed_text = ['reach highest level devastation']  # seed phrase
seed_tokens = tokenizer.texts_to_sequences(seed_text)[0]  # replace words with tokens

# pad the sequence with zeros so it has the length the network expects
tokens_x = pad_sequences([seed_tokens], maxlen=seq_len)
tokens_x = to_categorical(tokens_x, num_classes=vocab_size)  # one hot

pred_y = model.predict(tokens_x)  # predict probabilities for the next word

def sample_word(pred_y, temperature=1.0):
    pred_y = pred_y / temperature  # 'sharpness' of the probabilities
    pred_token = tf.random.categorical(pred_y, 1).numpy()
    return pred_token  # output token

next_token = sample_word(pred_y)
# map the token back to a word
next_word = tokenizer.sequences_to_texts(next_token)
print('Next token: ', next_token, '-->', next_word)

"""# **b)** *After training, show at least 5 examples of input text and of the
text generated next by the trained network. For each example, generate at least
10 consecutive words.*"""

def sample_word(pred_y, temperature=1.0):
    pred_y = pred_y / temperature  # 'sharpness' of the probabilities
    pred_token = tf.random.categorical(pred_y, 1).numpy()
    return pred_token  # output token

seed_text = ['true destroy unholy coalition',
             'there amazing wwii story',
             'lightning hotter surface sun',
             'mom setting bathroom curfew',
             'retweet second receive goodnews']  # seed phrases

for rep in range(5):
    for rep2 in range(10):
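One caveat worth noting: tf.random.categorical expects unnormalized log-probabilities (logits), so dividing softmax probabilities by the temperature, as the snippet does, is not the usual temperature sampling. A hedged alternative sketch, reusing the names from the snippet above (this is not the notebook's own code):

import numpy as np
import tensorflow as tf

def sample_word_from_probs(pred_y, temperature=1.0):
    # convert the softmax output to log space, scale by temperature,
    # and let tf.random.categorical treat the result as logits
    logits = np.log(pred_y + 1e-9) / temperature
    pred_token = tf.random.categorical(logits, 1).numpy()
    return pred_token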
class SimulacrumGenerator: def __init__(self, max_words=1000, max_len=50, num_epochs=10, batch_size=128): self.simulacrum_name = os.getenv("SIMULACRUM_NAME") self.num_epochs = num_epochs self.batch_size = batch_size self.max_words = max_words self.max_len = max_len self.tok = Tokenizer(num_words=max_words) self.processor = DataProcessor(os.getenv("SIMULACRUM_NAME")) self.model = self.architecture() self.model.compile(loss='binary_crossentropy', optimizer=RMSprop(), metrics=['accuracy']) def architecture(self): inputs = Input(name='inputs', shape=[self.max_len]) layer = Embedding(self.max_words, self.max_len, input_length=self.max_len)(inputs) layer = LSTM(64)(layer) layer = Dense(self.max_len, name='out_layer')(layer) layer = Activation('relu')(layer) model = Model(inputs=inputs, outputs=layer) return model def architecture2(self): inputs = Input(name='inputs', batch_shape=(self.batch_size, self.max_len)) layer = Embedding(self.max_words, self.max_len)(inputs) layer = GRU(1024, recurrent_initializer='glorot_uniform', stateful=True)(layer) layer = Dense(self.max_len, name='out_layer')(layer) # layer = Activation('relu')(layer) model = Model(inputs=inputs, outputs=layer) return model def tokenize_sentences(self, sentences): sequences = self.tok.texts_to_sequences(sentences) # sequences = [] # for vector in self.tok.texts_to_sequences(sentences): # sequences.append(np.interp(vector, (0, self.max_words), (0, 1))) return sequence.pad_sequences(sequences, maxlen=self.max_len) def detokenzie(self, vectors): return self.tok.sequences_to_texts((vectors*10000).astype("int")) # return self.tok.sequences_to_texts(np.interp(vectors, (0, 1), (0, self.max_words)).astype("int")) def create_inputs(self, sentences=None): if sentences is None: self.processor.extract() sentences = self.processor.received self.tok.fit_on_texts(sentences) # self.max_words = len(sentences) return self.tokenize_sentences(sentences) def generate(self, sentences=None): if sentences is None: inputs = self.create_inputs() else: inputs = self.create_inputs(sentences) return np.array(self.model.predict(inputs)), np.zeros(len(inputs)) def train(self, callbacks=None): # cb = [EarlyStopping(monitor='val_loss', min_delta=0.0001)] cb=[] if callbacks is not None: cb.extend(callbacks) self.processor.extract() train_X = [] train_y = [] for pair in self.processor.pairs: train_X.append(self.processor.received[pair[1]]) train_y.append(self.processor.sent[pair[0]]) self.model.fit(self.create_inputs(train_X), self.create_inputs(train_y), epochs=self.num_epochs, batch_size=self.batch_size, validation_split=0.2, callbacks=cb) # generator = SimulacrumGenerator() # outputs, y = generator.generate() # print(outputs[0], generator.tokenize_sentences(generator.processor.received)[0]) # print(generator.detokenzie(outputs))
class ComparativeNeuralClassifier(CW): def __init__(self): # a list of tuples of (type, data_clean, true_label) self.labelled_data = [] self.labelled_validation_data = [] self.labels = set() self.models = dict() self.tokenizer = None #force def add_data(self, file_id: str, tokenized_file: str, true_label: int): """ :param file_id: a hashable ID for this particular file :param tokenized_file: a :param true_label: :return: None """ # CURRENTLY NOT TAKING IN PRE-TOKENIZED FILE, DISCUSS WITH TEAM ABOUT ALTERING CLASSIFIER INTERFACES self.labels.add(true_label) self.labelled_data.append((file_id, tokenized_file, true_label)) def add_validation_data(self, file_id: str, data: str, true_label: int): """ :param file_id: :param data: :param true_label: :return: """ self.labelled_validation_data.append((file_id, data, true_label)) def get_data(self): """ :return: A structure [(file_id, tokenized_file, true_label),...] for all data_clean added to this classifier with the add_data method """ raise NotImplementedError def train(self): """ This classifier object will train on all the data_clean that has been added to it using the adddata method :return: """ # i want to use bagging # create the tokenizer self.tokenizer = Tokenizer(num_words=constants.MAX_NUMBER_TOKENS) training_data = [text for _, text, _ in self.labelled_data] self.tokenizer.fit_on_texts(training_data) # now build our training data_clean X_train = self.tokenizer.texts_to_sequences(training_data) X_validation = self.tokenizer.texts_to_sequences( [text for _, text, _ in self.labelled_validation_data]) X_train, y_train = data_slicer.slice_data( X_train, [y for _, _, y in self.labelled_data], slice_length=constants.SLICE_LENGTH, overlap_percent=constants.SLICE_OVERLAP) X_validation, y_validation = data_slicer.slice_data( X_validation, [y for _, _, y in self.labelled_validation_data], slice_length=constants.SLICE_LENGTH, overlap_percent=constants.SLICE_OVERLAP) # pad them as necessary X_train = np.array([ np.array(x) for x in pad_sequences( X_train, padding="post", maxlen=constants.SLICE_LENGTH) ]) X_validation = np.array( pad_sequences(X_validation, padding="post", maxlen=constants.SLICE_LENGTH)) # force change # get our glove embeddings glove = load_glove(constants.GLOVE_FILE, self.tokenizer.word_index) # compute some neural_constants vocab_size = len(self.tokenizer.word_index) + 1 for label in self.labels: # set model parameters self.models[label] = Sequential() model_layers = [ # must have these two layers firsts layers.Embedding(vocab_size, constants.GLOVE_DIMENSIONS, weights=[glove], input_length=constants.SLICE_LENGTH, trainable=False), layers.GlobalMaxPool1D(), # now we have some options layers.Dense(20, activation="relu"), layers.Dense(15, activation="sigmoid"), # layers.Dense(10, activation="sigmoid"), # probably want a final sigmoid layer to get smooth value in range (0, 1) layers.Dense(1, activation="sigmoid") ] # add them in for layer in model_layers: self.models[label].add(layer) self.models[label].compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"]) y_train_binary = [1 if l == label else 0 for l in y_train] # now we fit (can take a while) self.models[label].fit(X_train, y_train_binary, epochs=25, verbose=False, shuffle=True, validation_data=(X_validation, y_validation), batch_size=10) predictions = dict() stats = dict() for label in self.labels: predictions[label] = self.models[label].predict(X_validation, verbose=False) for label in self.labels: stats[label] = { "mean": np.mean(predictions[label]), 
"std": np.std(predictions[label]), "max": np.max(predictions[label]), "min": np.min(predictions[label]) } texts = self.tokenizer.sequences_to_texts(X_validation) sorted_labels = sorted(list(self.labels)) ncorrect = [0] * 4 n = 0 with open('classifiers/neural/cnc.csv', 'w', newline='\n') as csvfile: csvw = csv.writer(csvfile) for i in range(len(y_validation)): outputs = [predictions[label][i][0] for label in sorted_labels] zscores = [(predictions[label][i][0] - stats[label]["mean"]) / stats[label]["std"] for label in sorted_labels] normalized = [ (predictions[label][i][0] - stats[label]["min"]) / stats[label]["max"] for label in sorted_labels ] pred = [ np.argmax([(outputs[j] + zscores[j]) for j in range(len(outputs))]), np.argmax(outputs), np.argmax(zscores), np.argmax(normalized) ] n += 1 for j in range(len(pred)): if pred[j] == y_validation[j]: ncorrect[j] += 1 row = [y_validation[i]] \ + normalized \ + outputs \ + zscores \ + pred \ + [texts[i]] csvw.writerow(row) print(ncorrect) print([x / n for x in ncorrect]) print(n) def predict(self, tokenized_file: str, minimum_confidence=.8): """ :param tokenized_file: the array containing the ordered, sanitized word tokens from a single file :param minimum_confidence: the minimum confidence level required to the classifier to label a data_clean point as any given class. Only used by applicable classifiers. :return: a list of tuples of [(class label, confidence)] for each class label where confidence > minimum_confidence. Confidence will be 1 for classifiers where confidence is not a normally used feature. """ raise NotImplementedError
model.add(Embedding(vocab_size, embedding_size, input_length=maxlen))
model.add(LSTM(256))
model.add(Dropout(0.5))
model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.5))
model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Dense(3, activation='softmax'))

# review the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

# begin training on dataset
batch_size = 128
num_epochs = 5
model.fit(x_train1, y_train1, validation_data=(x_val, y_val),
          batch_size=batch_size, epochs=num_epochs)

# check accuracy on test data
scores = model.evaluate(x_test, y_test, verbose=0)
print("accuracy:", str(scores[1]))

u = 8
print(tk.sequences_to_texts(x_test[u:u + 1]))
print(model.predict(x_test[u:u + 1]))
class FastTextModel(SupervisedBaseModel): def __init__(self, task): super(FastTextModel, self).__init__(task) self.args = task.args self.epochs = 15 self.max_len = 50 self.batch_size = 32 self.max_features = 5000 self.embeddings_dim = self.args.embeddings_size self.embeddings_matrix = None self.ngram_range = 1 self.tokenizer = Tokenizer(num_words=self.max_features) self.model = None self.token_indice = None self.num_labels = len(self.args.labels) def build_model(self): print('Build model...') model = Sequential() # we start off with an efficient embedding layer which maps # our vocab indices into embedding_dims dimensions weights = None if self.embeddings_matrix is None else [ self.embeddings_matrix ] model.add( Embedding( self.max_features, self.embeddings_dim, input_length=self.max_len, #trainable=False, weights=weights, mask_zero=True), ) # we add a GlobalAveragePooling1D, which will average the embeddings # of all words in the document model.add(GlobalAveragePooling1D()) # We project onto a single unit output layer, and squash it with a sigmoid: model.add(Dense(2, activation='sigmoid')) model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) return model def show_text_info(self, X_text): print(len(X_text), 'text sequences') X_text_lens = list(map(len, X_text)) print('Average sequence length: {}'.format( np.mean(X_text_lens, dtype=int))) print('Max sequence length: {}'.format(np.max(X_text_lens))) def add_ngrams(self, X_text): if self.ngram_range == 1: return X_text if self.token_indice is None: print('Adding {}-gram features'.format(self.ngram_range)) # Create set of unique n-gram from the training set. ngram_set = set() for input_list in X_text: for i in range(2, self.ngram_range + 1): set_of_ngram = create_ngram_set(input_list, ngram_value=i) ngram_set.update(set_of_ngram) # Dictionary mapping n-gram token to a unique integer. # Integer values are greater than max_features in order # to avoid collision with existing features. start_index = self.max_features + 1 self.token_indice = { v: k + start_index for k, v in enumerate(ngram_set) } indice_token = {self.token_indice[k]: k for k in self.token_indice} # max_features is the highest integer that could be found in the dataset. 
self.max_features = np.max(list(indice_token.keys())) + 1 # Augmenting x_train and x_test with n-grams features X_text = add_ngram(X_text, self.token_indice, self.ngram_range) self.show_text_info(X_text) return X_text def fit_text(self, X_text, y=None): X_unlabeled = self.dataset.X_train_unlabeled.values X_unlabeled_text = X_unlabeled[:, self.args.TEXT_COL] X = np.append(X_text, X_unlabeled_text, axis=0) #X = self.preprocess_text(X) self.tokenizer.fit_on_texts(X) X = self.tokenizer.texts_to_sequences(X) X = self.tokenizer.sequences_to_texts(X) self.text_rep_model = self.build_fit_w2v(X) def transform_text(self, X_text): X = self.tokenizer.texts_to_sequences(X_text) X = self.tokenizer.sequences_to_texts(X) X = self.transform_text_to_w2v(self.text_rep_model, X) return X def preprocess_text(self, X_text): self.tokenizer.fit_on_texts(X_text) num_words = len(self.tokenizer.word_index) #self.max_features = np.minimum(self.max_features, num_words) + 1 # add padding self.max_features = num_words + 1 #add paddings self.embeddings_matrix = get_embedding_vectors( self.args.embeddings_path, self.tokenizer.word_index, self.max_features, self.embeddings_dim) X_text = self.tokenizer.texts_to_sequences(X_text) self.show_text_info(X_text) X_text = self.add_ngrams(X_text) self.max_len = int(np.max(list(map(len, X_text)))) X = sequence.pad_sequences(X_text, maxlen=self.max_len) return X def train(self, X, y): print('TRAINING') X, y = self.augment_instances(X, y) # convert to sequences X_text = X[:, self.args.TEXT_COL] X_text = self.preprocess_text(X_text) X = X_text # todo: add other features self.model = self.build_model() self.model.fit( X, y, batch_size=self.batch_size, epochs=self.epochs, #validation_data=(x_test, y_test) ) def predict(self, X): print('PREDICT') X_text = X[:, self.args.TEXT_COL] X_text = self.tokenizer.texts_to_sequences(X_text) self.show_text_info(X_text) X_text = self.add_ngrams(X_text) X = sequence.pad_sequences(X_text, maxlen=self.max_len) y = self.model.predict(X, verbose=1) y = (y > 0.5).astype(int) return y
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

num_train = int(len(encoder_inputs) * 0.9)
train_generator = SequenceGenerator(encoder_inputs[0:num_train],
                                    decoder_inputs[0:num_train],
                                    decoder_targets[0:num_train],
                                    args.batch_size, max_len_target,
                                    args.hidden_dim_decoder, num_words_output)
validation_generator = SequenceGenerator(encoder_inputs[num_train:],
                                         decoder_inputs[num_train:],
                                         decoder_targets[num_train:],
                                         args.batch_size, max_len_target,
                                         args.hidden_dim_decoder, num_words_output)

print('Start training')
callbacks = [
    TensorBoard(os.path.join(args.logs_dir, 'attention-{0}'.format(
        datetime.now().isoformat().replace(':', '-').split('.')[0]))),
    ModelCheckpoint(os.path.join(args.models_dir, 'weights.{epoch:02d}-{val_loss:.2f}.h5'),
                    save_best_only=True)
]
r = model.fit_generator(
    generator=train_generator,
    steps_per_epoch=len(train_generator),
    epochs=args.epochs,
    validation_data=validation_generator,
    validation_steps=len(validation_generator),
    callbacks=callbacks,
    initial_epoch=args.initial_epoch
)

for i in range(min(20, len(encoder_inputs))):
    output = inference_model.predict(np.array(encoder_inputs[i:i + 1]),
                                     tokenizer_outputs.word_index['<sos>'],
                                     tokenizer_outputs.word_index['<eos>'])
    output_sentences = tokenizer_outputs.sequences_to_texts([list(output)])
    print(input_texts[i], '<qa>', output_sentences[0])
tokenizer = Tokenizer()
tokenizer.fit_on_texts(review_train)
tokenizer.num_words = 2000

X_train = tokenizer.texts_to_sequences(review_train)
X_test = tokenizer.texts_to_sequences(review_test)

vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index

maxlen = 15
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

# convert the padded id sequences back to text so TF-IDF can be applied
X_train = tokenizer.sequences_to_texts(X_train)
X_test = tokenizer.sequences_to_texts(X_test)

tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(X_train)
X_train = tfidf_vectorizer.transform(X_train)
X_test = tfidf_vectorizer.transform(X_test)

X_train = X_train.toarray()
X_test = X_test.toarray()
X_train = X_train.reshape(130967, 1979, 1)
X_test = X_test.reshape(32742, 1979, 1)
# vectorizing text, turning each text into a sequence of integers
tokenizer = Tokenizer(lower=False)
tokenizer.fit_on_texts(X)
# convert to sequences of integers
X = tokenizer.texts_to_sequences(X)
# convert to numpy arrays
X = np.array(X)
y = np.array(y)
# pad each sequence at the beginning with 0's up to SEQUENCE_LENGTH
X = pad_sequences(X, maxlen=SEQUENCE_LENGTH)

y = [label2int[label] for label in y]
y = np.asarray(y, dtype=np.float32)

XSpamText = tokenizer.sequences_to_texts(X[y == 1])
XHamText = tokenizer.sequences_to_texts(X[y == 0])
analysis = Analysis(XHamText, XSpamText)

# split and shuffle
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=7)

split_frac = 0.5  # 50% validation, 50% test
split_id = int(split_frac * len(X_test))
X_val, X_test = X_test[:split_id], X_test[split_id:]
y_val, y_test = y_test[:split_id], y_test[split_id:]

train_data = TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
val_data = TensorDataset(torch.from_numpy(X_val), torch.from_numpy(y_val))
print("[ INFO ][ Features loaded ][ Length: {} ]".format(len(features))) lst = list() for v in caps.values(): [lst.append(words) for words in v] maxLen = max([len(words.split()) for words in lst]) # longest caption tknzr = Tokenizer() tknzr.fit_on_texts(lst) vocab_size = len(tknzr.word_index) + 1 encoder_in, decoder_in, decoder_out = list(), list(), list() for img, sents in caps.items(): for cap in sents: seq = tknzr.sequences_to_texts([cap])[0] for i in range(1, len(seq)): in_seq, out_seq = seq[:i], seq[i] in_seq = pad_sequences([in_seq], maxlen=maxLen)[0] out_seq = to_categorical([out_seq], num_classes=vocab_size)[0] encoder_in.append(features[img]) decoder_in.append(in_seq) decoder_out.append(out_seq) inputs1 = Input(shape=(None, 25088)) en1 = Dropout(0.3)(inputs1) en2 = Dense(latent_dims, activation='relu')(en1) encoder_outputs, state_h, state_c = LSTM(latent_dims, return_state=True)(en1) encoder_states = [state_h, state_c]
from keras.preprocessing.text import Tokenizer

text = "나는 맛있는 밥을 먹었다"

token = Tokenizer()
token.fit_on_texts([text])
print(token.word_index)

x = token.texts_to_sequences([text])
print(x)
print(token.sequences_to_texts(x))

from keras.utils import to_categorical

word_size = len(token.word_index) + 1
x = to_categorical(x, num_classes=word_size)
print(x)
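For reference, a hedged sketch of the same round trip with an English sentence, so the expected output is easier to read; the index values shown are what Keras assigns for this particular input (most frequent word first, ties by order of appearance).

from keras.preprocessing.text import Tokenizer

token = Tokenizer()
token.fit_on_texts(["the cat sat on the mat"])
print(token.word_index)
# {'the': 1, 'cat': 2, 'sat': 3, 'on': 4, 'mat': 5}

x = token.texts_to_sequences(["the cat sat on the mat"])
print(x)                              # [[1, 2, 3, 4, 1, 5]]
print(token.sequences_to_texts(x))    # ['the cat sat on the mat']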
def model(suffix="", suffix_fre=""): for round in range(0, 2): rtest = xlrd.open_workbook(filename="切割" + suffix + "test" + suffix_fre + ".xls") rtrain = xlrd.open_workbook(filename="切割" + suffix + "train" + suffix_fre + ".xls") r_vocall1 = xlrd.open_workbook(filename="pre处理" + suffix + "test" + suffix_fre + ".xls") r_vocall2 = xlrd.open_workbook(filename="pre处理" + suffix + "train" + suffix_fre + ".xls") sheet_test = rtest.sheet_by_index(0) sheet_train = rtrain.sheet_by_index(0) sheet1_vocall = r_vocall1.sheet_by_index(0) sheet2_vocall = r_vocall2.sheet_by_index(0) invocal1 = sheet1_vocall.col_values(4) invocal2 = sheet2_vocall.col_values(4) for i in range(0, len(invocal1)): if len(invocal1[i]) == 0: invocall = invocal1[:i] print("1") break if i == len(invocal1) - 1: invocall = invocal1 for i in range(0, len(invocal2)): if len(invocal2[i]) == 0: print("1") invocal2 = invocal2[:i] break for i in invocal2: if i not in invocall: invocall.append(i) print(len(invocall)) vocall_size = len(invocall) if round == 1: ex_tag = sheet_test.col_values(6) xtrain = sheet_train.col_values(2 + round * 3) ztrain = sheet_train.col_values(0 + round * 3) ytrain = sheet_train.col_values(1 + round * 3) xtest = sheet_test.col_values(2 + round * 3) ztest = sheet_test.col_values(0 + round * 3) ytest = sheet_test.col_values(1 + round * 3) for i in range(0, len(xtrain)): if len(xtrain[i]) == 0: xtrain = xtrain[:i] ztrain = ztrain[:i] ytrain = ytrain[:i] break for i in range(0, len(xtest)): if len(xtest[i]) == 0: xtest = xtest[:i] ytest = ytest[:i] ztest = ztest[:i] break print(round * 3) print(len(xtrain), "xtrain") print(len(ytrain), "ytrain") print(len(xtest), "xtest") print(len(ytest), "ytest") if round == 1: other = sheet_train.cell(0, 13).value other = int(other) print(other) if other == 1: xtrain = xtrain + sheet_train.col_values(9) ytrain = ytrain + sheet_train.col_values(8) ztrain = ztrain + sheet_train.col_values(7) for i in range(0, len(xtrain)): if len(xtrain[i]) == 0: xtrain = xtrain[:i] ztrain = ztrain[:i] ytrain = ytrain[:i] break tokenizer = Tokenizer(num_words=vocall_size) tokenizer.fit_on_texts(invocall) xtrain = tokenizer.texts_to_sequences(xtrain) xtest = tokenizer.texts_to_sequences(xtest) maxlen = 0 for i in xtrain: if len(i) > maxlen: maxlen = len(i) for i in xtest: if len(i) > maxlen: maxlen = len(i) print(maxlen, "maxlen") xtrain = pad_sequences(xtrain, padding='post', maxlen=maxlen) xtest = pad_sequences(xtest, padding='post', maxlen=maxlen) print(len(ytrain), len(xtrain)) print(len(ytest), len(xtest)) for i in range(0, len(ytrain)): ytrain[i] = int(ytrain[i]) for i in range(0, len(ytest)): ytest[i] = int(ytest[i]) embedding_size = 150 hidden_layer_size = 64 batch_size = 128 num_epochs = 3 model = Sequential() model.add(Embedding(vocall_size, embedding_size, input_length=maxlen)) model.add(AT(25, 150)) model.add(SpatialDropout1D(0.2)) model.add(LSTM(hidden_layer_size, dropout=0.2, recurrent_dropout=0.2)) model.add(Dense(10)) model.add(Dense(1)) model.add(Activation("sigmoid")) model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy']) model.summary() history = model.fit(xtrain, ytrain, epochs=7, batch_size=64) loss, accuracy = model.evaluate(xtest, ytest) print(loss, accuracy) """ plt.subplot(211) plt.title("Accuracy"+suffix) plt.plot(history.history['acc'],color="g",label="Train") plt.legend(loc="best") plt.subplot(212) plt.title("Loss") plt.plot(history.history['loss'],color="g",label="Train") plt.legend(loc="best") plt.tight_layout() plt.show() """ w = xlwt.Workbook() 
sheet2 = w.add_sheet("准备文件", cell_overwrite_ok=True) sheet2.write(0, 8, "predict") sheet2.write(0, 9, "ytest") sheet2.write(0, 10, "xtest") sheet2.write(0, 11, "ex_tag") sheet2.write(0, 4, "loss") sheet2.write(1, 4, loss) sheet2.write(0, 5, "acc") sheet2.write(1, 5, accuracy) ypred = model.predict_classes(xtest, 1) xtest = tokenizer.sequences_to_texts(xtest) for index in range(0, len(ypred)): sheet2.write(index, 0, int(ypred[index][0])) sheet2.write(index, 1, ytest[index]) sheet2.write(index, 2, xtest[index]) if round == 1: sheet2.write(index, 3, ex_tag[index]) if round == 0: w.save("result切割" + suffix + suffix_fre + "at.xls") else: w.save("result扩充" + suffix + suffix_fre + "at.xls")