def preprocess(train_content, train_label, test_content, test_label):
    tokenizer = Tokenizer(num_words=NUM_WORDS)
    # fit_on_texts (not fit_on_sequences) builds the vocabulary from raw strings
    tokenizer.fit_on_texts(train_content)
    x_train_seq = tokenizer.texts_to_sequences(train_content)
    x_test_seq = tokenizer.texts_to_sequences(test_content)
    x_train = sequence.pad_sequences(x_train_seq, maxlen=MAX_LEN)
    x_test = sequence.pad_sequences(x_test_seq, maxlen=MAX_LEN)
    y_train = np.array(train_label)
    y_test = np.array(test_label)
    return x_train, y_train, x_test, y_test, tokenizer.word_index
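# A minimal sketch (not part of the snippet above) of the distinction the fix
# above relies on: fit_on_texts() builds word_index from raw strings, while
# fit_on_sequences() only updates document counts for already-encoded integer
# sequences, which is what the 'tfidf'/'freq' matrix modes need.
from keras.preprocessing.text import Tokenizer

tok = Tokenizer(num_words=10)
tok.fit_on_texts(['the cat sat', 'the dog sat'])  # builds the vocabulary
seqs = tok.texts_to_sequences(['the cat sat'])    # e.g. [[1, 3, 2]]
tok.fit_on_sequences(seqs)                        # updates index_docs for tfidf
print(tok.word_index)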
def test_tokenizer():
    texts = ['The cat sat on the mat.',
             'The dog sat on the log.',
             'Dogs and cats living together.']
    tokenizer = Tokenizer(num_words=10)
    tokenizer.fit_on_texts(texts)

    sequences = []
    for seq in tokenizer.texts_to_sequences_generator(texts):
        sequences.append(seq)
    assert np.max(np.max(sequences)) < 10
    assert np.min(np.min(sequences)) == 1

    tokenizer.fit_on_sequences(sequences)

    for mode in ['binary', 'count', 'tfidf', 'freq']:
        matrix = tokenizer.texts_to_matrix(texts, mode)
def vectorize(self, data_set):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(data_set)
    # texts_to_sequences returns the integer sequences; the original assigned
    # the result of fit_on_sequences, which returns None
    sequences = tokenizer.texts_to_sequences(data_set)
    # word_index = tokenizer.word_index
    data_feature = pad_sequences(sequences, maxlen=self.max_len)
    return data_feature
def test_tokenizer():
    texts = [
        'The cat sat on the mat',
        'The dog sat on the log',
        'Dogs and cats living together'
    ]
    # num_words: the maximum number of words to keep; the tokenizer is
    # restricted to the n most frequent words in the dataset
    tokenizer = Tokenizer(num_words=10)
    tokenizer.fit_on_texts(texts)
    print('word_counts: ', tokenizer.word_counts)        # how often each word appeared during fitting
    print('word_docs: ', tokenizer.word_docs)            # in how many documents each word appeared
    print('word_index: ', tokenizer.word_index)          # frequency-rank index of each word
    print('document_count: ', tokenizer.document_count)  # number of documents fitted on

    # test the sequence generator
    sequences = []
    for seq in tokenizer.texts_to_sequences_generator(texts):
        sequences.append(seq)

    # test expressing the texts as feature matrices
    tokenizer.fit_on_sequences(sequences)
    for mode in ['binary', 'count', 'tfidf', 'freq']:
        matrix = tokenizer.texts_to_matrix(texts, mode)
        print(mode, " : ", matrix)
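# A small hedged sketch of what the four matrix modes above mean (output shape
# is (num_texts, num_words)): 'binary' marks word presence, 'count' gives raw
# counts, 'freq' gives counts normalized by sequence length, and 'tfidf' gives
# tf-idf weights.
from keras.preprocessing.text import Tokenizer

tok = Tokenizer(num_words=10)
tok.fit_on_texts(['a b b c', 'b c c d'])
m = tok.texts_to_matrix(['a b b c'], mode='count')
print(m.shape)  # (1, 10)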
def model_setup(self):
    """
    Sets up the model for generating a poem with a tokenizer, and splits
    the conversations into sequences.
    Void -> [Tupleof Tokenizer Int]
    """
    tokenizer = Tokenizer()
    # fit_on_texts alone is needed here; the original also called
    # fit_on_sequences on the raw strings, which only corrupts document counts
    tokenizer.fit_on_texts(self.conversations)

    # n-gram sequences
    self.input_sequences = []
    for line in self.conversations:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i + 1]
            self.input_sequences.append(n_gram_sequence)

    max_sequence_len = max([len(x) for x in self.input_sequences])
    self.input_sequences = np.array(
        pad_sequences(self.input_sequences,
                      maxlen=max_sequence_len,
                      padding='pre'))
    return (tokenizer, max_sequence_len)
publ_test = get_data()
content_test = publ_test['content']
classes_test = publ_test['class']

# - - - - - - - - - - - - - - - - - - - - - - - -
# Data preprocessing
tk = Tokenizer()
tk.fit_on_texts(content_test)
textSequences_test = tk.texts_to_sequences(content_test)

num_words = 80059
num_classes = 8

tk = Tokenizer(num_words=num_words)
tk.fit_on_sequences(textSequences_test)
X_test = tk.sequences_to_matrix(textSequences_test, mode='tfidf')
y_test = keras.utils.to_categorical(classes_test, num_classes)

# - - - - - - - - - - - - - - - - - - - - - - - -
# News classification
print('\n Results:')
for i in range(len(X_test)):
    prediction = model.predict(np.array([X_test[i]]))
    pred = np.argsort(-prediction)
    print("[", i, "] ", publ_test['title'][i])
    print("Training category: ", kfu_classes[classes_test[i]])
    print("Predicted category: ", kfu_classes[pred[0][0]])
    print("Class probability: %.2f%%" % (prediction[0][pred[0][0]] * 100))
        if label_type == "neg":
            labels.append(0)
        else:
            labels.append(1)

# b. Tokenize the data
"""
Pretrained word embeddings are especially useful when little training data is
available, so the training data is restricted to the first 200 samples: the
model learns to classify movie reviews after seeing only 200 examples.
"""
maxlen = 100                # truncate each review after its first 100 words
training_samples = 200      # train on 200 reviews
validation_samples = 10000  # validate on 10,000 reviews
max_words = 10000           # consider only the 10,000 most common words

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)  # fit_on_texts, since `texts` are raw strings
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
print("Found %s unique tokens." % (len(word_index)))

data = pad_sequences(sequences, maxlen=maxlen)
labels = np.asarray(labels)
print("Shape of data tensor:", data.shape)
print("Shape of label tensor:", labels.shape)

# Split the data into training and validation sets, but shuffle it first,
# since the raw data is ordered: all positive reviews, then all negatives
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
x_train = data[:training_samples]
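# A hedged sketch of the step this passage builds toward: mapping word_index
# into a pretrained embedding matrix. The GloVe file name and embedding_dim
# are assumptions for illustration, not taken from the snippet above.
import numpy as np

embedding_dim = 100
embeddings_index = {}
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        # first token is the word, the rest are its vector components
        embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    if i < max_words:
        vector = embeddings_index.get(word)
        if vector is not None:  # words not in GloVe stay all-zeros
            embedding_matrix[i] = vector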
def create_tokenizer(lines):
    # fit_on_texts builds the vocabulary from raw text lines;
    # fit_on_sequences would leave word_index empty
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer
          verbose=2)
end_time = time.time()
average_time_per_epoch = (end_time - start_time) / epochs
print("avg sec per epoch:", average_time_per_epoch)

# Run a simple logistic regression to compare performance,
# based on the grid search done in:
# https://github.com/rasbt/python-machine-learning-book/blob/master/code/ch08/ch08.ipynb
# The tfidf vectors capture co-occurrence statistics: think of each number as
# how many times a word occurred in a text, scaled by word frequency.
tfidfTokenizer = Tokenizer(num_words=max_features)  # num_words replaces the old nb_words argument
tfidfTokenizer.fit_on_sequences(X_train.tolist())
X_train_tfidf = np.asarray(tfidfTokenizer.sequences_to_matrix(X_train.tolist(), mode="tfidf"))
X_test_tfidf = np.asarray(tfidfTokenizer.sequences_to_matrix(X_test.tolist(), mode="tfidf"))

# check tfidf matrix
print(X_train_tfidf)
print(X_train_tfidf.shape, X_test_tfidf.shape)

from sklearn.linear_model import LogisticRegression
model_tfidf_reg = LogisticRegression(random_state=0, C=0.001, penalty='l2', verbose=1)
model_tfidf_reg.fit(X_train_tfidf, y_train)

from sklearn.metrics import accuracy_score
# calculate train accuracy (the original mistakenly compared y_test
# against the training-set predictions)
print("train acc:", accuracy_score(y_train, model_tfidf_reg.predict(X_train_tfidf)))
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras.layers import Dropout
from keras import optimizers
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from numpy import array
import string

# PRE-PROCESSING THE CLEAN DATASET
# lines = training_set.split('\n')
training_set_clean = [
    line.rstrip('\n')
    for line in open('training_set_clean.txt', encoding='ISO-8859-1')
]
# lines = [l.split('\n') for l in training_set_clean]

tokenizer = Tokenizer()
# fit_on_texts alone is needed; the original also called fit_on_sequences
# on the raw strings, which is a misuse of that method
tokenizer.fit_on_texts(training_set_clean)
sequences = tokenizer.texts_to_sequences(training_set_clean)
vocab_size = len(tokenizer.word_index) + 1

sequences = array(sequences)
X_train = sequences[:, :-1]
y_train = sequences[:, -1]
y_train = to_categorical(y_train, num_classes=vocab_size)
seq_length = X_train.shape[1]
print(X_train[0])

# TRAIN THE MODEL
regressor = Sequential()
regressor.add(Embedding(vocab_size, 20, input_length=seq_length))
# Training data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(content_train)
textSequences = tokenizer.texts_to_sequences(content_train)

X_train, y_train, X_test, y_test = split_data(textSequences, classes_train, 0.9)

total_words = len(tokenizer.word_index)
print('The vocabulary contains {} words'.format(total_words))

num_words = 80059
num_classes = 8

print(u'Vectorizing...')
tokenizer = Tokenizer(num_words=num_words)
# Fit the document counts once, on the training sequences only; refitting on
# X_test as well (as the original did) leaks test-set document counts into
# the tf-idf weights and makes the train/test features inconsistent
tokenizer.fit_on_sequences(X_train)
X_train = tokenizer.sequences_to_matrix(X_train, mode='tfidf')
X_test = tokenizer.sequences_to_matrix(X_test, mode='tfidf')

y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

epochs = 10  # number of training epochs/iterations
total_categories = 8

print(u'Building the classifier...')
model = Sequential()
model.add(Dense(256, input_shape=(num_words, )))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(total_categories))
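# A hedged sketch of the pattern used above: a fresh Tokenizer(num_words=...)
# has an empty word_index, but sequences_to_matrix(mode='tfidf') only needs
# document_count and index_docs, both of which fit_on_sequences fills in.
from keras.preprocessing.text import Tokenizer

seqs = [[1, 2, 2, 3], [2, 3, 4]]   # already-encoded documents
vec = Tokenizer(num_words=5)
vec.fit_on_sequences(seqs)         # records per-index document counts
X = vec.sequences_to_matrix(seqs, mode='tfidf')
print(X.shape)                     # (2, 5)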
def convModel(tweets, stances, tweets_test, stances_test):
    # General parameters (the original also declared a stray `global max`,
    # which shadowed the builtin and is dropped here)
    embeding_dim = 200
    dropout_prob = (0.0, 0.5)
    batch_size = 64
    num_epochs = 20

    print('Fitting tokenizer')
    tokenizer = Tokenizer()
    # fit_on_texts on the raw tweet strings; the original called
    # fit_on_sequences(tweets + tweets2), where `tweets2` is undefined and,
    # judging by the function signature, appears to mean `tweets_test`
    tokenizer.fit_on_texts(tweets + tweets_test)
    max_length = max([len(s.split()) for s in tweets + tweets_test])
    print('max_length', max_length)
    vocab_size = len(tokenizer.word_index) + 1

    # Train and test split
    print('Train and test split')
    x_train, x_test, y_train, y_test = train_test_split(tweets, stances, test_size=0.2)
    print('x_train: ', len(x_train), 'x_test', len(x_test))

    # Training data
    # traindata = np.array(x_train)
    # testdata = np.array(x_test)
    trainTokens = tokenizer.texts_to_sequences(x_train)
    Xtrain = pad_sequences(trainTokens, maxlen=max_length, padding='post')
    XtestTokens = tokenizer.texts_to_sequences(x_test)
    Xtest = pad_sequences(XtestTokens, maxlen=max_length, padding='post')

    # ============ TEST DATA =============================================
    # testgroup = np.array(tweets_test)
    # testGroupTokens = tokenizer.texts_to_sequences(tweets_test)
    # XtestGroup = pad_sequences(testGroupTokens, maxlen=max_length, padding='post')
    # print('Xtrain padding: ', len(Xtrain), 'Xtest padding: ', len(Xtest), 'XtestGroup padding: ', len(XtestGroup))

    # Convert stances to categorical output
    y_test = np_utils.to_categorical(y_test, num_classes=3)
    y_train = np_utils.to_categorical(y_train, num_classes=3)
    y_testGroup = np_utils.to_categorical(stances_test, num_classes=3)
    print('y_test: ', len(y_test), 'y_train: ', len(y_train), 'y_testGroup: ', len(stances_test))

    print('Loading embeddings..')
    # load GloVe vectors in word2vec format and create the embedding layer
    wv_from_bin = KeyedVectors.load_word2vec_format(
        datapath('E:/glove/glove.twitter.27B.200dGINSIM.txt'), binary=False)
    embedding_vectors = get_weight_matrix2(wv_from_bin, tokenizer.word_index.items())
    embedding_layer = Embedding(vocab_size, embeding_dim, weights=[embedding_vectors],
                                input_length=max_length, trainable=False)

    # Create the model
    print('Create and compile the model..')
    model = createModelC(max_length, embedding_layer)
    model.compile(loss="categorical_hinge", optimizer="adam", metrics=[f1])
    model.summary(85)

    print('Fitting the model..')
    history = model.fit(Xtrain, y_train, batch_size=batch_size, epochs=num_epochs,
                        validation_data=(Xtest, y_test), verbose=2)
    print('History', history.history)

    # evaluate; note that evaluate()[0] is the hinge loss, not an accuracy
    print('Predicting (training)..')
    ypred = model.predict(Xtest)
    print('Accuracy (TRAIN): %f' % (model.evaluate(Xtest, y_test)[0] * 100))
    print('FScore (TRAIN): %f' % (f1(y_test, ypred) * 100))
    print('Predicting (testing)..')
from keras.preprocessing.text import Tokenizer
import numpy as np

texts = [
    'The cat sat on the mat.',
    'The dog sat on the log.',
    'Dogs and cats living together.'
]

tokenizer = Tokenizer(num_words=10)
tokenizer.fit_on_texts(texts)

sequences = []
for seq in tokenizer.texts_to_sequences_generator(texts):
    sequences.append(seq)
assert np.max(np.max(sequences)) < 10
assert np.min(np.min(sequences)) == 1

tokenizer.fit_on_sequences(sequences)
for mode in ['binary', 'count', 'tfidf', 'freq']:
    matrix = tokenizer.texts_to_matrix(texts, mode)

print("texts:", texts)
print("=> Found %s unique tokens <=" % len(tokenizer.word_index))
class SentimentLSTM:
    def __init__(self):
        self.tokenizer = Tokenizer(num_words=vocab_size)
        self.stop_words = []
        self.model = None

    def load_stop_word(self, path='E:/dataset/NLP/stopwords'):
        # Python 3 text mode already yields str; the original's Python 2
        # decode('utf-8')/encode('utf-8') calls are dropped
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                self.stop_words.append(line.strip())

    def jieba_cut(self, line):
        lcut = jieba.lcut(line)
        cut = [x for x in lcut if x not in self.stop_words]
        return " ".join(cut)

    def load_cuted_corpus(self, dir, input):
        f = open(dir + '/' + input, 'r')
        lines = f.readlines()
        texts = []
        labels = []
        for line in lines:
            fields = line.split()
            rate = int(fields[0])
            if rate == 0 or rate == 3:
                continue
            elif rate < 3:
                rate = 0
            else:
                rate = 1
            texts.append(" ".join(fields[1:]))
            labels.append(rate)
        # fit_on_texts builds the vocabulary; the original called
        # fit_on_sequences on the raw strings
        self.tokenizer.fit_on_texts(texts)
        f.close()
        return texts, labels

    def load_data(self):
        x, y = self.load_cuted_corpus('corpus', 'review.csv')
        x = self.tokenizer.texts_to_sequences(x)
        x = S.pad_sequences(x, maxlen=sentence_max_len)
        y = to_categorical(y, num_classes=2)
        return ((x[0:500000], y[0:500000]), (x[500000:], y[500000:]))

    def train(self, epochs=50):
        print('building model ========================')
        self.model = SentimentLSTM.build_model()
        print('loading data ===========================')
        (text_train, rate_train), (text_test, rate_test) = self.load_data()
        print("training ===============================")
        self.model.fit(text_train, rate_train, batch_size=1000, epochs=epochs)
        self.model.save(model_savepath)  # the original passed an empty path
        score = self.model.evaluate(text_test, rate_test)
        print(score)

    def load_trained_model(self, path):
        model = SentimentLSTM.build_model()
        model.load_weights(path)
        return model

    def predict_text(self, text):
        if self.model is None:
            self.model = self.load_trained_model(model_savepath)
            self.load_stop_word()
            self.load_cuted_corpus('corpus', 'review.csv')
        vect = self.jieba_cut(text)
        vect = self.tokenizer.texts_to_sequences([vect])
        print(vect)
        # predict_classes (the original misspelled it as predict_classed)
        return self.model.predict_classes(S.pad_sequences(np.array(vect), 100))

    @staticmethod
    def build_model():
        model = Sequential()
        model.add(Embedding(vocab_size, 256, input_length=sentence_max_len))
        model.add(Bidirectional(LSTM(128, implementation=2)))
        model.add(Dropout(0.5))
        # softmax (not relu) so the two-class output is a valid distribution
        # for categorical_crossentropy
        model.add(Dense(2, activation='softmax'))
        model.compile('RMSprop', 'categorical_crossentropy', metrics=['accuracy'])
        return model
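# A hedged usage sketch for the class above; the corpus path 'corpus/review.csv'
# and the module-level globals vocab_size, sentence_max_len and model_savepath
# are assumptions carried over from the snippet, not verified here.
lstm = SentimentLSTM()
lstm.load_stop_word()          # load the stop-word list before cutting text
lstm.train(epochs=5)           # fits the tokenizer, trains and saves the model
print(lstm.predict_text('这部电影很好看'))  # predict the class of one review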
target = [d.topic for d in docarr]
# text_path = d_class.text_file
# label_path = d_class.gnd_file
# with open(text_path) as f1, open(label_path) as f2:
#     data = [text.strip() for text in f1]
#     target = [int(label.rstrip('\n')) for label in f2.readlines()]
# tokenizer = Tokenizer(char_level=False)
# tokenizer.fit_on_texts(data)
# sequences_full = tokenizer.texts_to_sequences(data)
# tokenizer.fit_on_sequences(sequences_full)
# word_index = tokenizer.word_index

tokenizer = Tokenizer(char_level=False)
tokenizer.word_index = word_index
tokenizer.fit_on_sequences(sequences_full)

seq_lens = [len(s) for s in sequences_full]
MAX_SEQ_LEN = max(seq_lens)
print("Total: %s short texts" % format(len(docarr), ","),
      ' %s unique tokens.' % len(word_index))
print("Average length: %d" % np.mean(seq_lens),
      ", Max length: %d" % max(seq_lens))

X = pad_sequences(sequences_full, maxlen=MAX_SEQ_LEN)
y = target

#################################################
# Preparing embedding matrix
#################################################
EMBED_DIM = 300
all_participants_mix_stopwords = all_participants.copy()
all_participants_mix_stopwords['answer'] = all_participants_mix_stopwords.apply(
    lambda row: text_to_wordlist(row.answer, remove_stopwords=False).split(), axis=1)

words = [w for w in all_participants_mix['answer'].tolist()]
words = set(itertools.chain(*words))
vocab_size = len(words)

words_stop = [w for w in all_participants_mix_stopwords['answer'].tolist()]
words_stop = set(itertools.chain(*words_stop))
vocab_size_stop = len(words_stop)

windows_size = WINDOWS_SIZE

tokenizer = Tokenizer(num_words=vocab_size)
# fit_on_texts is sufficient; the original also called fit_on_sequences on
# the raw answers, which would only corrupt the document counts
tokenizer.fit_on_texts(all_participants_mix['answer'])
all_participants_mix['t_answer'] = tokenizer.texts_to_sequences(all_participants_mix['answer'])

def test_model(text, model):
    word_list = text_to_wordlist(text)
    list_of_words = word_list.split(" ")
    sequences = tokenizer.texts_to_sequences([word_list])
    word_tokens = sequences[0]
    size = len(word_tokens)

    # slide a fixed-size window over the token sequence
    test_phrases = []
    for i in range(size):
        tokens = word_tokens[i:min(i + windows_size, size)]
        test_phrases.append(tokens)

    sequences_input = test_phrases
    sequences_input = pad_sequences(sequences_input, value=0,
                                    padding="post", maxlen=windows_size)
def tokenizer_fit_xvals(t_xvals):
    t = Tokenizer(num_words=None, lower=False, oov_token="_NA")
    # fit_on_texts, so that word_index and the oov_token are actually
    # populated from the raw inputs (fit_on_sequences would leave them empty)
    t.fit_on_texts(t_xvals)
    return t
test_content = d['CONTENT'].iloc[test_idx]

# In[05]: Theory Answer No. 3
import numpy as np
from sklearn.model_selection import train_test_split
X, y = np.arange(10).reshape((5, 2)), range(5)
X

# In[06]: Answer No. 5
from keras.preprocessing.text import Tokenizer
ns = Tokenizer(num_words=2)
yn = ["Jawaban No5", "yes", "Berhasil"]
# fit_on_texts builds word_index; the original also called fit_on_sequences
# on the raw strings, which is a misuse of that method
ns.fit_on_texts(yn)
ns.word_index

# In[07]: Question No. 6
d_train_inputs = tokenizer.texts_to_matrix(train_content, mode='tfidf')
d_test_inputs = tokenizer.texts_to_matrix(test_content, mode='tfidf')

# In[08]: Answer No. 7
d_train_inputs = d_train_inputs / np.amax(np.absolute(d_train_inputs))
d_test_inputs = d_test_inputs / np.amax(np.absolute(d_test_inputs))

# In[09]: Answer No. 8