def build_model(vocabFile, model_type='bilstm'):
    processor = VocabularyProcessor.restore(vocabFile)
    n_words = len(processor.vocabulary_)
    net = tflearn.input_data([None, 300])
    net = tflearn.embedding(net, input_dim=n_words, output_dim=200)
    if model_type == 'bilstm':
        net = tflearn.bidirectional_rnn(net, tflearn.BasicLSTMCell(200),
                                        tflearn.BasicLSTMCell(200))
        net = dropout(net, 0.5)
    elif model_type == 'lstm':
        net = tflearn.lstm(net, 200, dropout=0.5)
        net = dropout(net, 0.5)
    elif model_type == 'cnn':
        net = conv_model(net)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.05,
                             loss='categorical_crossentropy')
    return net
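# A minimal usage sketch for build_model, assuming trainX/trainY are padded
# word-ID sequences of length 300 with one-hot labels and 'vocab.pickle' is a
# saved VocabularyProcessor; these names are placeholders, not from the original code.
net = build_model('vocab.pickle', model_type='lstm')
model = tflearn.DNN(net, tensorboard_verbose=0)
model.fit(trainX, trainY, validation_set=0.1, show_metric=True, batch_size=32)
model.save('sentiment.tfl')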
class SentimentLookup:
    net = tflearn.input_data([None, 40])
    net = tflearn.embedding(net, input_dim=12495, output_dim=128)
    net = tflearn.lstm(net, 128, dropout=0.8)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                             loss='categorical_crossentropy')

    model = tflearn.DNN(net, tensorboard_verbose=0)
    model.load(MODEL)
    vp = VocabularyProcessor.restore(VOCAB)

    def _process_tweet(self, tweet=""):
        cleaned = str(tweet).upper()
        cleaned = re.sub(r'&\w+;', '', cleaned)
        cleaned = re.sub(r"'", '', cleaned)
        cleaned = re.sub(r'@\w+ ', 'USERNAME ', cleaned)
        cleaned = re.sub(r'[^A-Z ]', '', cleaned)
        cleaned = re.sub(r'[ ]+', ' ', cleaned)
        return cleaned.strip()

    def sentiment(self, data):
        if isinstance(data, str):
            query = [x for x in SentimentLookup.vp.transform([self._process_tweet(data)])]
            bad, good = SentimentLookup.model.predict(query).tolist()[0]
            return good
        data = map(self._process_tweet, data)
        query = [x for x in SentimentLookup.vp.transform(data)]
        return SentimentLookup.model.predict(query)[:, 1]
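# A small usage sketch for SentimentLookup, assuming MODEL and VOCAB point at a
# saved tflearn model and VocabularyProcessor; the example tweets are made up.
lookup = SentimentLookup()
print(lookup.sentiment("@someone loved the new update!"))    # single positive-class score
print(lookup.sentiment(["great game", "terrible service"]))  # array of positive-class scores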
def load_vocab_processor(name, max_length, min_frequency):
    ''' load model '''
    print('Loading vocabulary model from {}'.format(name))
    vp = VocabularyProcessor(max_length, min_frequency=min_frequency)
    vp = vp.restore(name)
    return vp
def __init__(self):
    if not exists(VOCABULARY_PATH):
        self._vocab = self._create_vocab()
        self._vocab.save(VOCABULARY_PATH)
    else:
        self._vocab = VocabularyProcessor.restore(VOCABULARY_PATH)
    self._model = self._create_model()
    if exists(MODEL_PATH + '.meta'):
        self._model.load(MODEL_PATH, True)
def text2npy(inFile, outFile, vocabFile, dtype):
    processor = VocabularyProcessor.restore(vocabFile)
    doc = MySentences(inFile, dtype, 'get_content')
    train_doc = list(processor.transform(doc))
    # to_categorical could be used here for one-hot encoding:
    # to_categorical(np.array(label), NB_CLASSES)
    if dtype == 'train':
        # Convert the labels to integers
        label = []
        for y in MySentences(inFile, dtype, 'get_info'):
            label.append(int(y))
        y = np.array(label)
        # Save features and labels to file
        np.save(outFile, np.column_stack([train_doc, y]))
    elif dtype == 'test':
        np.save(outFile, train_doc)
        fw = open(outFile + "_doc.txt", 'w', encoding='utf8')
        for y in MySentences(inFile, dtype, 'get_info'):
            fw.write(y + "\n")
        fw.close()
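# Hypothetical invocation of text2npy for both splits; the file names here are
# placeholders, not taken from the original script.
text2npy('train.txt', 'train.npy', 'vocab.pickle', dtype='train')
text2npy('test.txt', 'test.npy', 'vocab.pickle', dtype='test')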
        text = HanziConv.toSimplified(document)
        # Lowercase any English text
        text = text.lower()
        # Tokenize
        yield list(cut(text))

# Pad or truncate sequences to length 100; drop words with frequency <= 2
vocab = VocabularyProcessor(100, 2, tokenizer_fn=chinese_tokenizer)
# Build the vocabulary; it cannot be changed after creation
vocab.fit(DOCUMENTS)
# Save and reload the vocabulary
vocab.save('vocab.pickle')
vocab = VocabularyProcessor.restore('vocab.pickle')
# Convert text to sequences of word IDs; unknown and padding words get ID 0
id_documents = list(vocab.transform(DOCUMENTS))
for id_document in id_documents:
    print(id_document)
# [2 3 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
#  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
#  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
# [2 3 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
#  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
#  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
# [2 3 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
#  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
#  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
# [2 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
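# A minimal sketch of the tokenizer the fragment above belongs to, assuming
# jieba's cut() for word segmentation and hanziconv for the traditional-to-
# simplified conversion; the function name matches the tokenizer_fn used above.
from hanziconv import HanziConv
from jieba import cut

def chinese_tokenizer(documents):
    for document in documents:
        # Convert traditional characters to simplified ones
        text = HanziConv.toSimplified(document)
        # Lowercase any English text
        text = text.lower()
        # Yield the list of segmented tokens for this document
        yield list(cut(text))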
def restore(self):
    self.vocabulary = VocabularyProcessor.restore('{}/vocab'.format(self.model_dir))
class CNN(object):
    def __init__(self, batch_size=64):
        self.batch_size = batch_size
        self.number_of_classes = 2
        self.X_train = []
        self.X_test = []
        self.Y_train = []
        self.Y_test = []
        self.max_words = None
        self.vocabProcessor = None
        self.cnn_model = None
        self.model = None
        self.test_x = []
        self.test_y = []

    def load_dataset_training(self, vocab_name, filename='datasetWithoutNeutral'):
        """ Load the dataset """
        X, Y = load_csv('datasets/' + filename, target_column=2, columns_to_ignore=[0])

        """ Count max words from the longest sentence """
        self.max_words = max([len(x[0].split(" ")) for x in X])

        """ Get vocabulary size from longest sentence """
        self.vocabProcessor = VocabularyProcessor(self.max_words)

        """ Encode pos, neu and neg to numbers """
        labelEncoder = LabelEncoder()
        labelEncoder.fit(Y)
        Y = labelEncoder.transform(Y)

        """ Change the list of sentences to a list of sequences of words """
        X = np.array(list(self.vocabProcessor.fit_transform([x[0] for x in X])))

        """ Split the dataset into a training set and a test set """
        self.X_train, self.X_test, self.Y_train, self.Y_test = train_test_split(
            X, Y, test_size=0.10, random_state=7)

        """ Pad the sequences to fit the longest sentence """
        self.X_train = pad_sequences(self.X_train, maxlen=self.max_words, value=0.)
        self.X_test = pad_sequences(self.X_test, maxlen=self.max_words, value=0.)

        """ Convert labels to binary vectors """
        self.Y_train = to_categorical(self.Y_train, nb_classes=self.number_of_classes)
        self.Y_test = to_categorical(self.Y_test, nb_classes=self.number_of_classes)

        self.vocabProcessor.save(vocab_name)

    def create_cnn_architecture_two_layers(
            self, model_name, outputDim=300, number_of_filters=60,
            filterSize=[3, 4], padding='same',
            activation_function_convLayer='relu', regularizer='L2',
            dropouts=0.5, activation_function_fc='softmax', optimizer='adam',
            learning_rate=0.001, loss_function='categorical_crossentropy'):
        if len(filterSize) == 0:
            filterSize = [3, 4]

        """ Define input shape and create word embedding """
        self.cnn_model = input_data(shape=[None, self.max_words], name='input')
        self.cnn_model = tflearn.embedding(
            self.cnn_model, input_dim=len(self.vocabProcessor.vocabulary_),
            output_dim=outputDim)

        """ Add two convolutional layers (a third is commented out). Set number of
        filters and filter sizes, then merge together """
        conv1 = conv_1d(self.cnn_model, nb_filter=number_of_filters,
                        filter_size=filterSize[0], padding=padding,
                        activation=activation_function_convLayer,
                        regularizer=regularizer)
        conv2 = conv_1d(self.cnn_model, nb_filter=number_of_filters,
                        filter_size=filterSize[1], padding=padding,
                        activation=activation_function_convLayer,
                        regularizer=regularizer)
        #conv3 = conv_1d(cnn_model, nb_filter = 128, filter_size = 5, padding = 'same',
        #                activation = 'relu', regularizer = 'L2')
        self.cnn_model = merge([conv1, conv2], mode='concat', axis=1)

        """ Expand one dimension to fit the max_pooling layer """
        self.cnn_model = tf.expand_dims(self.cnn_model, 1)
        self.cnn_model = global_max_pool(self.cnn_model)

        """ Instantiate dropout layer and specify dropout parameter """
        self.cnn_model = dropout(self.cnn_model, dropouts)

        """ Instantiate fully connected layer and regression layer. """
        self.cnn_model = fully_connected(self.cnn_model, self.number_of_classes,
                                         activation=activation_function_fc)
        self.cnn_model = regression(self.cnn_model, optimizer=optimizer,
                                    learning_rate=learning_rate, loss=loss_function,
                                    name='models/' + model_name)

    def train_and_save(self, model_name, tensorboard_verbose=0, tensorboard_dir='/logs/',
                       nb_epochs=5, shuffle=True, show_metric=True):
        """ Instantiate deep neural network model and start the training """
        self.model = tflearn.DNN(self.cnn_model, tensorboard_verbose=tensorboard_verbose,
                                 tensorboard_dir=tensorboard_dir)
        self.model.fit(self.X_train, self.Y_train, n_epoch=nb_epochs,
                       validation_set=(self.X_test, self.Y_test), shuffle=shuffle,
                       show_metric=show_metric, batch_size=self.batch_size,
                       run_id=model_name)

        """ Save the model """
        self.model.save('models/' + model_name)

    def load_model(self, model_name, outputDim=300, number_of_filters=60,
                   filterSize=[3, 4], padding='same',
                   activation_function_convLayer='relu', regularizer='L2',
                   dropouts=0.5, activation_function_fc='softmax', optimizer='adam',
                   learning_rate=0.001, loss_function='categorical_crossentropy',
                   tensorboard_verbose=0, tensorboard_dir='/logs/'):
        """ Has to be passed the same values that the model was trained with.
        If the model was trained on default values, the parameters are passed
        automatically. """
        self.create_cnn_architecture_two_layers(
            model_name, outputDim, number_of_filters, filterSize, padding,
            activation_function_convLayer, regularizer, dropouts,
            activation_function_fc, optimizer, learning_rate, loss_function)
        self.model = tflearn.DNN(self.cnn_model, tensorboard_verbose=tensorboard_verbose,
                                 tensorboard_dir=tensorboard_dir)
        self.model.load('models/' + model_name)

    def load_test_dataset(self, filename='testDatasetWithOutNeuTwo', vocab_name='vocabProc'):
        """ Something is wrong with this function. Does not get the same result
        as before when loading in the new data... """

        """ Load test dataset """
        self.test_x, self.test_y = load_csv('datasets/' + filename, target_column=1)

        """ Get restored vocabulary processor """
        self.vocabProcessor = VocabularyProcessor(self.max_words)
        self.vocabProcessor = self.vocabProcessor.restore(vocab_name)

        """ Encode pos, neu and neg to numbers """
        labelEncoder = LabelEncoder()
        labelEncoder.fit(self.test_y)
        self.test_y = labelEncoder.transform(self.test_y)

        """ Change the list of sentences to a list of sequences of words """
        self.test_x = np.array(
            list(self.vocabProcessor.transform([x[0] for x in self.test_x])))

        """ Pad the sequences to fit the longest sentence """
        self.test_x = pad_sequences(self.test_x, maxlen=self.max_words, value=0.)

        """ Convert labels to binary vectors """
        self.test_y = to_categorical(self.test_y, nb_classes=self.number_of_classes)

    def evaluate_model_performance(self):
        metric_score = self.model.evaluate(self.test_x, self.test_y,
                                           batch_size=self.batch_size)
        return metric_score

    def predict_one_sentence(self, sentence=[['']], vocab_name='vocabProc'):
        """ Load vocabulary processor """
        self.vocabProcessor = VocabularyProcessor(self.max_words)
        self.vocabProcessor = self.vocabProcessor.restore(vocab_name)

        """ Transform sentence to matrix of numbers """
        sentence = np.array(
            list(self.vocabProcessor.transform([x[0] for x in sentence])))
        sentence = pad_sequences(
            sentence, maxlen=self.vocabProcessor.max_document_length, value=0.)

        """ Predict sentence """
        pred_score = self.model.predict(sentence)
        return pred_score

    def predict_list(self, list_of_sentences=[[''], ['']], vocab_name='vocabProc'):
        """ Load vocabulary processor """
        self.vocabProcessor = VocabularyProcessor(self.max_words)
        self.vocabProcessor = self.vocabProcessor.restore(vocab_name)

        """ Transform sentences to matrix of numbers """
        sentence = np.array(
            list(self.vocabProcessor.transform([x[0] for x in list_of_sentences])))
        sentence = pad_sequences(
            sentence, maxlen=self.vocabProcessor.max_document_length, value=0.)

        """ Predict sentences """
        pred_score = self.model.predict(sentence)
        return pred_score
""" Load dataset and load the model for evaluation """ tf.reset_default_graph() """ Load the dataset """ #test_x, test_y = load_csv('testDatasetWithNeuOne', target_column = 1) #test_x, test_y = load_csv('testDatasetWithNeuTwo', target_column = 1) #test_x, test_y = load_csv('testDatasetWithOutNeuOne', target_column = 1) test_x, test_y = load_csv('testDatasetWithOutNeuTwo', target_column=1) """ Count max words from the longest sentence """ #max_words = max([len(x[0].split(" ")) for x in test_x]) max_words = 2132 """ Get vocabulare size from longest sentence """ vocab = VocabularyProcessor(max_words) vocab = vocab.restore('vocabProc') """ Encode pos, neu and neg to numbers """ labelEncoder = LabelEncoder() labelEncoder.fit(test_y) test_y = labelEncoder.transform(test_y) """ Change the list of sentences to a list of sequence of words """ test_x = np.array(list(vocab.transform([x[0] for x in test_x]))) """ Pad the sequences to fit the longest sentence """ test_x = pad_sequences(test_x, maxlen=max_words, value=0.) """ Convert labels to binary vector """ test_y = to_categorical(test_y, nb_classes=2) #test_y= to_categorical(test_y, nb_classes = 3) """ Create the same neural network as the one that is going to be loaded. """ cnn_model = input_data(shape=[None, max_words], name='input') cnn_model = tflearn.embedding(cnn_model, input_dim=len(vocab.vocabulary_),
import tflearn
from tflearn.data_utils import VocabularyProcessor
import sys

# Get all arguments
games_to_predict = sys.argv[1:]
if len(games_to_predict) == 0:
    print("Type games to predict when you run the script as arguments")
    exit()

# Create and load vocab processor
word_processor = VocabularyProcessor(15)
word_processor = word_processor.restore("wordprocessor")

# Create and load ML model
net = tflearn.input_data([None, 15])
net = tflearn.embedding(net, input_dim=10000, output_dim=128)
net = tflearn.lstm(net, 128, dropout=.8)
net = tflearn.fully_connected(net, 11, activation='softmax')
net = tflearn.regression(net)  # adam, 0.001
model = tflearn.DNN(net, tensorboard_verbose=0)
model.load("model.tfl")

# Use labels for output
ratings = [
    'Unbearable',  # 72
    'Disaster',    # 4
    'Awful',       # 664
    'Painful',     # 340