import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


def prepare_data():
    data = pd.read_csv("dataset.csv")
    X = data.loc[:, 'sub']
    y = data.loc[:, 'severity']
    # train_test_split returns (X_train, X_test, y_train, y_test) in that order
    train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)

    t = Tokenizer()
    t.fit_on_texts(train_X)
    vocab_size = len(t.word_index) + 1
    encoded_train_X = t.texts_to_sequences(train_X)
    encoded_test_X = t.texts_to_sequences(test_X)
    max_len = 20
    padded_train_X = pad_sequences(encoded_train_X, maxlen=max_len, padding='post')
    padded_test_X = pad_sequences(encoded_test_X, maxlen=max_len, padding='post')

    # custom embedding matrix: each line of the file is a word followed by its vector
    embedding_index = dict()
    with open('all_embeddings.txt') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embedding_index[word] = coefs

    # the matrix is (vocab_size, embedding_dim); the dimension comes from the file, not from max_len
    embedding_dim = len(next(iter(embedding_index.values())))
    embedding_matrix = np.zeros([vocab_size, embedding_dim])
    for word, i in t.word_index.items():
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return vocab_size, padded_train_X, train_y, padded_test_X, test_y, embedding_matrix
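A minimal usage sketch (not part of the original snippet) showing how the returned embedding_matrix could seed a frozen Keras Embedding layer; the layer sizes simply mirror the function above.

from keras.layers import Embedding

vocab_size, train_X, train_y, test_X, test_y, embedding_matrix = prepare_data()
embedding_layer = Embedding(vocab_size,
                            embedding_matrix.shape[1],   # embedding_dim read from the file
                            weights=[embedding_matrix],  # initialise with the custom vectors
                            input_length=20,             # max_len used in prepare_data
                            trainable=False)             # keep the pretrained vectors fixed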
data.head(5)

# Extract keywords
def key_word_extract(row):
    return " ".join(
        analyse.extract_tags(row, topK=50, withWeight=False, allowPOS=()))

data['key_extract'] = data.Job_Description.apply(key_word_extract)
data.head(5)
# -------------------------- 3. Tokenize and extract keywords --------------------------------
# -------------------------- 4. Build the dictionary and use it ------------------------------
token = Tokenizer(num_words=2000)
token.fit_on_texts(data['key_extract'])  # words are ranked by frequency; only the top 2000 enter the dictionary
Job_Description_Seq = token.texts_to_sequences(
    data['key_extract'])  # convert each text into a list of word indices
Job_Description_Seq_Padding = sequence.pad_sequences(
    Job_Description_Seq, maxlen=50)  # truncate/pad so every sequence has length 50
x_train = Job_Description_Seq_Padding  # feed the padded sequences to the model
y_train = data['label'].tolist()  # convert to a plain list
# -------------------------- 4. Build the dictionary and use it ------------------------------
# -------------------------- 5. Train the model ----------------------------------------------
#/--------------------------method1-class-----------------------------------------/#
class Model1(nn.Module):
    def __init__(self):
        super(Model1, self).__init__()
        # PyTorch's Embedding takes (num_embeddings, embedding_dim); there is no input_length argument
        self.embedding = nn.Embedding(num_embeddings=2000, embedding_dim=32)
import pickle
import random

import numpy as np
import keras
from keras.layers import Bidirectional, Concatenate, Dense, Embedding, Input, LSTM
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer


class Network:
    def __init__(self, env, maxlen=50, sample_size=300000, num_words=100000):
        texts = env.env_core.get_all_snippets()
        self.tokenizer = Tokenizer(num_words=num_words,
                                   filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~',
                                   oov_token='UNK')
        if len(texts) < sample_size:
            sample_size = min(50000, len(texts))  # never sample more snippets than exist
        self.tokenizer.fit_on_texts(random.sample(texts, sample_size))
        self.maxlen = maxlen
        self.voca_size = num_words
        self.voca = self.tokenizer.word_index.keys()
        #self.tensor_shape = tensor_shape
        self.model = self._create_model(self.maxlen, self.voca_size)
        self.model.compile(loss=keras.losses.mean_squared_error,
                           optimizer=keras.optimizers.Adam(),
                           metrics=['accuracy'])

    #TODO PA: we can play with the NN model. The model used in the MIT paper is: linear, ReLU, linear, ReLU, linear
    """ our NN model for predicting Q(s,a):
    Layer (type)                 Output Shape              Param #
    =================================================================
    input_1 (InputLayer)         (None, 28)                0
    _________________________________________________________________
    dense_1 (Dense)              (None, 10)                280
    _________________________________________________________________
    dropout_1 (Dropout)          (None, 10)                0
    _________________________________________________________________
    dense_2 (Dense)              (None, 10)                110
    _________________________________________________________________
    dropout_2 (Dropout)          (None, 10)                0
    _________________________________________________________________
    dense_3 (Dense)              (None, 1)                 11
    =================================================================
    """

    @staticmethod
    def _create_model(maxlen, voca_size):
        input_ = Input(shape=(maxlen, ), dtype='int32')
        action_ = Input(shape=(5, ))
        state_plus_ = Input(shape=(5, ))
        embeddings_ = Embedding(voca_size, 64)(input_)
        lstm_ = Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2))(embeddings_)
        dense_1 = Dense(32)(lstm_)
        concat_ = Concatenate(axis=-1)([dense_1, state_plus_])
        dense_2 = Dense(32)(concat_)
        concat_2 = Concatenate(axis=-1)([dense_2, action_])
        dense_3 = Dense(16)(concat_2)
        outputs_ = Dense(1, activation='linear')(dense_3)
        return Model(inputs=[input_, state_plus_, action_], outputs=outputs_)

    def pad_sequence(self, text):
        x_train = self.tokenizer.texts_to_sequences([text])
        return pad_sequences(x_train, maxlen=self.maxlen)

    def fit(self, texts_actions, y_train, epochs, batch_size, callbacks=None):
        texts, state_plus, actions = texts_actions
        x_train = self.tokenizer.texts_to_sequences(texts)
        x_train = pad_sequences(x_train, maxlen=self.maxlen)
        hist = self.model.fit(
            [np.array(x_train), np.array(state_plus), np.array(actions)],
            np.array(y_train),
            # batch_size=batch_size,
            epochs=epochs,
            callbacks=callbacks,  # forward the callbacks instead of discarding them
            verbose=1)
        return hist.history

    def fit_generator(self, gen, steps_per_epoch, epochs):
        self.model.fit_generator(
            generator=gen,
            # steps_per_epoch=steps_per_epoch,
            # epochs=epochs,
            verbose=1)

    # returns (loss, accuracy)
    def evaluate(self, x_test, y_test):
        return self.model.evaluate(x_test, y_test, verbose=0)

    def save_weights(self, path):
        self.model.save_weights(filepath=path)
        pickle.dump(self.tokenizer, open("tokenizer.pickle", "wb"))

    def load_weights(self, path):
        self.model.load_weights(filepath=path, by_name=False)
        self.tokenizer = pickle.load(open("tokenizer.pickle", "rb"))

    def save_model(self, path):
        self.model.save(path)

    def load_model(self, path):
        self.model = keras.models.load_model(path)

    def predict(self, text_action):
        text, state_plus, action = text_action
        x_train = self.tokenizer.texts_to_sequences(text)
        x_train = pad_sequences(x_train, maxlen=self.maxlen)
        return self.model.predict(
            [np.array(x_train), np.array(state_plus), np.array(action)])
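A short usage sketch (an assumption, not part of the original class): `env` is taken to be some object exposing `env_core.get_all_snippets()` as the constructor expects, and the array shapes follow `_create_model` (5-dimensional state and action vectors, a scalar Q-value target per sample).

net = Network(env, maxlen=50)                          # `env` is hypothetical here
texts = ["def foo(): pass", "x = bar(1, 2)"]           # example code snippets
state_plus = np.random.rand(2, 5)                      # 5-dim auxiliary state per sample
actions = np.random.rand(2, 5)                         # 5-dim action encoding per sample
q_targets = np.array([0.1, 0.7])                       # scalar Q-value targets
net.fit((texts, state_plus, actions), q_targets, epochs=1, batch_size=32)
q_pred = net.predict((texts, state_plus, actions))     # shape (2, 1)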
y = train_df[classes_to_predict].values
#y_test_predicted = test_df[classes_to_predict].values

processed_train_comments = []
for comment in raw_train_comments:
    processed_train_comments.append(preprocess_text(comment))
processed_test_comments = []
for comment in raw_test_comments:
    processed_test_comments.append(preprocess_text(comment))

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(processed_train_comments + processed_test_comments)
train_sequences = tokenizer.texts_to_sequences(processed_train_comments)
test_sequences = tokenizer.texts_to_sequences(processed_test_comments)
print('found %s unique tokens in text.' % (len(tokenizer.word_index)))

train_data = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
final_test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('shape of train_data (to be split further into final_train_data + final_validation_data) ready for feeding to the network is %s' % (train_data.shape,))
print('shape of final_test_data ready for feeding to the network is %s' % (final_test_data.shape,))
print('shape of label (y) is %s' % (y.shape,))

##################################################
## preparing word embeddings.
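The embedding-preparation step itself is not shown above. A common sketch under stated assumptions: the file name 'glove.6B.100d.txt', the EMBEDDING_DIM constant, and `import numpy as np` are all assumptions here rather than part of the original snippet.

EMBEDDING_DIM = 100  # assumed; must match the vector length in the embeddings file
embeddings_index = {}
with open('glove.6B.100d.txt') as f:  # hypothetical embeddings file
    for line in f:
        values = line.split()
        embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

num_words = min(MAX_NB_WORDS, len(tokenizer.word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in tokenizer.word_index.items():
    if i >= num_words:
        continue
    vector = embeddings_index.get(word)
    if vector is not None:
        embedding_matrix[i] = vector  # words without a pretrained vector stay all-zeros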
# Extract keywords
def key_word_extract(row):
    return " ".join(
        analyse.extract_tags(row, topK=50, withWeight=False, allowPOS=()))

job_detail_pd['Job_Description_key_word'] = job_detail_pd.Job_Description.apply(
    key_word_extract)
# -------------------------- 3. Tokenize and extract keywords ---------------------------
# -------------------------- 4. Build the dictionary and use it -------------------------
token = Tokenizer(num_words=2000)
token.fit_on_texts(
    job_detail_pd['Job_Description_key_word'])  # words are ranked by frequency; only the top 2000 enter the dictionary
Job_Description_Seq = token.texts_to_sequences(
    job_detail_pd['Job_Description_key_word'])  # convert each text into a list of word indices
Job_Description_Seq_Padding = sequence.pad_sequences(
    Job_Description_Seq, maxlen=50)  # truncate/pad so every sequence has length 50
x_train = Job_Description_Seq_Padding  # feed the padded sequences to the model
y_train = job_detail_pd['label'].tolist()  # convert to a plain list
# -------------------------- 4. Build the dictionary and use it ----------------------------
# -------------------------- 5. Train the model ---------------------------------------------
#/--------------------------method1-API-------------------------------------
inputs = Input(shape=(50, ))  # layer instances are callable: they take a tensor and return a tensor; shape matches the padded length of 50
x = Embedding(input_dim=2000, output_dim=32, input_length=50)(inputs)
x = Conv1D(256, 3, padding='same', activation='relu')(x)
x = MaxPool1D(3, 3, padding='same')(x)
x = Conv1D(3, 3, padding='same', activation='relu')(x)
x = Flatten()(x)
x = Dropout(0.3)(x)
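The functional-API snippet cuts off after the Dropout layer. One plausible completion, assuming `from keras.models import Model`, an output of 10 label classes, and integer labels in y_train (all assumptions, not part of the original):

x = Dense(256, activation='relu')(x)            # assumed hidden width
outputs = Dense(10, activation='softmax')(x)    # assumed number of label classes
model = Model(inputs=inputs, outputs=outputs)
model.compile(loss='sparse_categorical_crossentropy',  # y_train holds integer class labels
              optimizer='adam',
              metrics=['accuracy'])
model.summary()
# model.fit(x_train, np.array(y_train), batch_size=256, epochs=5, validation_split=0.2)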
from keras.preprocessing.text import Tokenizer

samples = ['I study at CityU', 'I study at CityU at Seattle']

tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(samples)
sequences = tokenizer.texts_to_sequences(samples)
one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary')
word_index = tokenizer.word_index
print('Found %s unique tokens: ' % len(word_index))
print('Sequences: ', sequences, '\n')
print('word_index: ', tokenizer.word_index)
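As a quick sanity check (added here, not in the original; `sequences_to_texts` is available in recent Keras versions), the integer sequences can be mapped back to lower-cased text, and the binary matrix has one row per sample and one column per word index:

print('Recovered texts: ', tokenizer.sequences_to_texts(sequences))
# one_hot_results has shape (num_samples, num_words); entry [i, j] is 1 if word index j occurs in sample i
print('one_hot_results shape: ', one_hot_results.shape)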
import keras.preprocessing.text as T
from keras.preprocessing.text import Tokenizer

text1 = 'some thing to eat'
text2 = 'some thing to drink'
texts = [text1, text2]

print(T.text_to_word_sequence(text1))  # ['some', 'thing', 'to', 'eat']
print(T.one_hot(text1, 10))            # [7, 9, 3, 4]
print(T.one_hot(text2, 10))            # [7, 9, 3, 1]

tokenizer = Tokenizer(num_words=10)
tokenizer.fit_on_texts(texts)
print(tokenizer.word_counts)  # OrderedDict([('some', 2), ('thing', 2), ('to', 2), ('eat', 1), ('drink', 1)])
print(tokenizer.word_index)   # {'some': 1, 'thing': 2, 'to': 3, 'eat': 4, 'drink': 5}
print(tokenizer.word_docs)    # {'some': 2, 'thing': 2, 'to': 2, 'drink': 1, 'eat': 1}
print(tokenizer.index_docs)   # {1: 2, 2: 2, 3: 2, 4: 1, 5: 1}
print(tokenizer.texts_to_sequences(texts))  # [[1, 2, 3, 4], [1, 2, 3, 5]]
print(tokenizer.texts_to_matrix(texts))
# [[ 0., 1., 1., 1., 1., 0., 0., 0., 0., 0.],
#  [ 0., 1., 1., 1., 0., 1., 0., 0., 0., 0.]]

import keras.preprocessing.sequence as S
S.pad_sequences([[1, 2, 3]], 10, padding='post')
# Source: vivian_ll, CSDN, https://blog.csdn.net/vivian_ll/article/details/80795139
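For reference (a small addition, not in the original post): pad_sequences pads at the front by default, padding='post' as above appends zeros instead, and truncating behaves analogously when a sequence is longer than maxlen.

print(S.pad_sequences([[1, 2, 3]], 10))                          # pre-padding (default): [[0 0 0 0 0 0 0 1 2 3]]
print(S.pad_sequences([[1, 2, 3]], 10, padding='post'))          # post-padding: [[1 2 3 0 0 0 0 0 0 0]]
print(S.pad_sequences([[1, 2, 3, 4, 5]], 3, truncating='post'))  # keep the first 3 entries: [[1 2 3]]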