def build_vocabulary(inFile, dtype, vocabFile):
    # Maximum text length
    MAX_LENGTH = 300
    NB_CLASSES = 2
    # Read in the pre-tokenized text
    doc = MySentences(inFile, dtype, 'get_content')
    # Map the raw text to word indices
    processor = VocabularyProcessor(MAX_LENGTH, min_frequency=5)
    processor.fit(doc)
    processor.save(vocabFile)
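A minimal usage sketch for the function above, assuming VocabularyProcessor is the tflearn/tf.contrib.learn preprocessing class and that a vocabulary has already been saved to vocabFile; the path and documents below are illustrative only.

# Hedged sketch: restore the vocabulary saved by build_vocabulary() and map new
# documents to fixed-length id sequences. 'vocab.model' and new_docs are invented.
import numpy as np
from tflearn.data_utils import VocabularyProcessor  # assumed source of the class

processor = VocabularyProcessor.restore('vocab.model')
new_docs = ['this is a short example document']
ids = np.array(list(processor.transform(new_docs)))
print(ids.shape)   # (1, max_document_length)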
def make_vocab_processor(name, text, max_length, min_frequency):
    '''Generate a vocabulary model.'''
    print('Making vocabulary model...')
    vp = VocabularyProcessor(max_length, min_frequency=min_frequency)
    vp = vp.fit(text)
    if name is None:
        return vp
    else:
        print('Saving vocabulary model to {}'.format(name))
        vp.save(name)
        return vp
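A small usage sketch for make_vocab_processor, assuming text is an iterable of whitespace-tokenized strings and that VocabularyProcessor is already imported; the corpus and file name are invented for illustration.

# Hedged example: build and persist a vocabulary model with the helper above.
corpus = ['the cat sat on the mat', 'the dog barked at the cat']
vp = make_vocab_processor('vocab.model', corpus, max_length=20, min_frequency=0)
print(next(vp.transform(['the cat barked'])))   # fixed-length array of word ids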
class DatasetVectorizer:

    def __init__(self, model_dir, raw_sentence_pairs=None, save_vocab=True):
        self.model_dir = model_dir
        os.makedirs(self.model_dir, exist_ok=True)
        if raw_sentence_pairs is None:
            self.restore()
        else:
            raw_sentence_pairs = raw_sentence_pairs.ravel()
            raw_sentence_pairs = [str(x) for x in list(raw_sentence_pairs)]
            self.sentences_lengths = [
                len(str(x).split(' ')) for x in list(raw_sentence_pairs)
            ]
            max_sentence_length = max(self.sentences_lengths)
            self.vocabulary = VocabularyProcessor(max_sentence_length)
            self.vocabulary.fit(raw_sentence_pairs)
            if save_vocab:
                self.vocabulary.save('{}/vocab'.format(self.model_dir))

    @property
    def max_sentence_len(self):
        return self.vocabulary.max_document_length

    @property
    def vocabulary_size(self):
        return len(self.vocabulary.vocabulary_._mapping)

    def restore(self):
        self.vocabulary = VocabularyProcessor.restore(
            '{}/vocab'.format(self.model_dir))

    def vectorize(self, sentence):
        return np.array(list(self.vocabulary.transform([sentence])))

    def vectorize_2d(self, raw_sentence_pairs):
        num_instances, num_classes = raw_sentence_pairs.shape
        raw_sentence_pairs = raw_sentence_pairs.ravel()
        for i, v in enumerate(raw_sentence_pairs):
            if v is np.nan:
                print(i, v)
        vectorized_sentence_pairs = np.array(
            list(self.vocabulary.transform(raw_sentence_pairs)))
        vectorized_sentence_pairs = vectorized_sentence_pairs.reshape(
            num_instances, num_classes, self.max_sentence_len)
        vectorized_sentence1 = vectorized_sentence_pairs[:, 0, :]
        vectorized_sentence2 = vectorized_sentence_pairs[:, 1, :]
        return vectorized_sentence1, vectorized_sentence2
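A hedged usage sketch for the pair-based DatasetVectorizer above; the two-column numpy array of sentence pairs and the model directory are invented for illustration.

# Hedged example: fit the vectorizer on sentence pairs and vectorize them.
import numpy as np

pairs = np.array([['a cat sits', 'a cat is sitting'],
                  ['it is raining', 'the sun is out']])
vectorizer = DatasetVectorizer('model_dir', raw_sentence_pairs=pairs)  # 'model_dir' is illustrative
s1, s2 = vectorizer.vectorize_2d(pairs)
print(s1.shape, s2.shape)   # each is (2, max_sentence_len)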
class DatasetVectorizer:

    def __init__(self, raw_sentence, model_dir, save_vocab=True):
        self.model_dir = model_dir
        os.makedirs(model_dir, exist_ok=True)
        raw_sentence = raw_sentence.ravel()
        raw_sentence = [str(x) for x in list(raw_sentence)]
        self.sentence_length = [
            len(str(x).split(' ')) for x in list(raw_sentence)
        ]
        max_sentence_length = max(self.sentence_length)
        self.vocabulary = VocabularyProcessor(max_sentence_length)
        # Fit before saving so the saved vocabulary actually contains the corpus.
        self.vocabulary.fit(raw_sentence)
        if save_vocab:
            self.vocabulary.save('{}/vocab'.format(model_dir))

    @property
    def max_sentence_len(self):
        return self.vocabulary.max_document_length

    @property
    def vocabulary_size(self):
        return len(self.vocabulary.vocabulary_._mapping)

    def restore(self):
        self.vocabulary = VocabularyProcessor.restore(
            '{}/vocab'.format(self.model_dir))

    def vectorize(self, sentence):
        return np.array(list(self.vocabulary.transform([sentence])))

    def vectorize_2d(self, raw_sentence):
        # Treat the input as a single column of sentences.
        num_instances = raw_sentence.shape[0]
        num_classes = 1
        raw_sentence = raw_sentence.ravel()
        for i, v in enumerate(raw_sentence):
            if v is np.nan:
                print(i, v)
        vectorized_sentence = np.array(
            list(self.vocabulary.transform(raw_sentence)))
        vectorized_sentence = vectorized_sentence.reshape(
            num_instances, num_classes, self.max_sentence_len)
        return vectorized_sentence[:, 0, :]
from hanziconv import HanziConv
from jieba import cut

def chinese_tokenizer(documents):
    for document in documents:
        # Convert Traditional Chinese to Simplified Chinese
        text = HanziConv.toSimplified(document)
        # Lowercase English characters
        text = text.lower()
        # Word segmentation
        yield list(cut(text))

# Pad or truncate sequences to length 100; drop words with frequency <= 2
vocab = VocabularyProcessor(100, 2, tokenizer_fn=chinese_tokenizer)

# Build the vocabulary; it cannot be changed afterwards
vocab.fit(DOCUMENTS)

# Save and reload the vocabulary
vocab.save('vocab.pickle')
vocab = VocabularyProcessor.restore('vocab.pickle')

# Convert text to sequences of word IDs; unknown and padding words get ID 0
id_documents = list(vocab.transform(DOCUMENTS))
for id_document in id_documents:
    print(id_document)
# Example output (three documents, each padded with zeros to length 100):
# [2 3 1 0 0 0 ... 0]
# [2 3 1 0 0 0 ... 0]
# [2 3 1 0 0 0 ... 0]
class DatasetVectorizer:

    def __init__(self, model_dir, char_embeddings, raw_sentence_pairs=None,
                 save_vocab=True):
        self.model_dir = model_dir
        os.makedirs(self.model_dir, exist_ok=True)
        if raw_sentence_pairs is None:
            self.restore()
        else:
            raw_sentence_pairs = raw_sentence_pairs.ravel()
            raw_sentence_pairs = [str(x) for x in list(raw_sentence_pairs)]
            if char_embeddings:
                log('Chosen char embeddings.')
                self.sentences_lengths = [
                    len(list(str(x))) for x in list(raw_sentence_pairs)
                ]
            else:
                log('Chosen word embeddings.')
                self.sentences_lengths = [
                    len(str(x).split(' ')) for x in list(raw_sentence_pairs)
                ]
            max_sentence_length = max(self.sentences_lengths)
            log('Maximum sentence length : {}'.format(max_sentence_length))

            if char_embeddings:
                log('Processing sentences with char embeddings...')
                self.vocabulary = VocabularyProcessor(
                    max_document_length=max_sentence_length,
                    tokenizer_fn=char_tokenizer,
                )
            else:
                log('Processing sentences with word embeddings...')
                self.vocabulary = VocabularyProcessor(
                    max_document_length=max_sentence_length,
                )
            log('Sentences have been successfully processed.')
            self.vocabulary.fit(raw_sentence_pairs)
            if save_vocab:
                self.vocabulary.save('{}/vocab'.format(self.model_dir))

    @property
    def max_sentence_len(self):
        return self.vocabulary.max_document_length

    @property
    def vocabulary_size(self):
        return len(self.vocabulary.vocabulary_._mapping)

    def restore(self):
        self.vocabulary = VocabularyProcessor.restore(
            '{}/vocab'.format(self.model_dir))

    def vectorize(self, sentence):
        return np.array(list(self.vocabulary.transform([sentence])))

    def vectorize_2d(self, raw_sentence_pairs):
        num_instances, num_classes = raw_sentence_pairs.shape
        raw_sentence_pairs = raw_sentence_pairs.ravel()
        for i, v in enumerate(raw_sentence_pairs):
            if v is np.nan:
                print(i, v)
        vectorized_sentence_pairs = np.array(
            list(self.vocabulary.transform(raw_sentence_pairs)))
        vectorized_sentence_pairs = vectorized_sentence_pairs.reshape(
            num_instances, num_classes, self.max_sentence_len)
        vectorized_sentence1 = vectorized_sentence_pairs[:, 0, :]
        vectorized_sentence2 = vectorized_sentence_pairs[:, 1, :]
        return vectorized_sentence1, vectorized_sentence2
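The char-embedding branch above relies on a char_tokenizer that is not shown in this snippet; a minimal sketch of such a generator, matching the tokenizer_fn contract of VocabularyProcessor (yield one token list per document), could look like this.

# Hedged sketch of a character-level tokenizer_fn; the actual implementation
# used by the class above is not included in this snippet.
def char_tokenizer(documents):
    for document in documents:
        yield list(str(document))   # one token per character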
class CNN(object):

    def __init__(self, batch_size=64):
        self.batch_size = batch_size
        self.number_of_classes = 2
        self.X_train = []
        self.X_test = []
        self.Y_train = []
        self.Y_test = []
        self.max_words = None
        self.vocabProcessor = None
        self.cnn_model = None
        self.model = None
        self.test_x = []
        self.test_y = []

    def load_dataset_training(self, vocab_name, filename='datasetWithoutNeutral'):
        """ Load the dataset """
        X, Y = load_csv('datasets/' + filename, target_column=2,
                        columns_to_ignore=[0])

        """ Count the max words from the longest sentence """
        self.max_words = max([len(x[0].split(" ")) for x in X])

        """ Create a vocabulary processor sized to the longest sentence """
        self.vocabProcessor = VocabularyProcessor(self.max_words)

        """ Encode pos, neu and neg to numbers """
        labelEncoder = LabelEncoder()
        labelEncoder.fit(Y)
        Y = labelEncoder.transform(Y)

        """ Change the list of sentences to a list of sequences of words """
        X = np.array(list(self.vocabProcessor.fit_transform([x[0] for x in X])))

        """ Split the dataset into a training set and a test set """
        self.X_train, self.X_test, self.Y_train, self.Y_test = train_test_split(
            X, Y, test_size=0.10, random_state=7)

        """ Pad the sequences to fit the longest sentence """
        self.X_train = pad_sequences(self.X_train, maxlen=self.max_words, value=0.)
        self.X_test = pad_sequences(self.X_test, maxlen=self.max_words, value=0.)

        """ Convert labels to binary vectors """
        self.Y_train = to_categorical(self.Y_train, nb_classes=self.number_of_classes)
        self.Y_test = to_categorical(self.Y_test, nb_classes=self.number_of_classes)

        self.vocabProcessor.save(vocab_name)

    def create_cnn_architecture_two_layers(
            self, model_name, outputDim=300, number_of_filters=60,
            filterSize=[3, 4], padding='same',
            activation_function_convLayer='relu', regularizer='L2',
            dropouts=0.5, activation_function_fc='softmax', optimizer='adam',
            learning_rate=0.001, loss_function='categorical_crossentropy'):
        if len(filterSize) == 0:
            filterSize = [3, 4]

        """ Define the input shape and create the word embedding """
        self.cnn_model = input_data(shape=[None, self.max_words], name='input')
        self.cnn_model = tflearn.embedding(
            self.cnn_model,
            input_dim=len(self.vocabProcessor.vocabulary_),
            output_dim=outputDim)

        """ Add two convolutional layers (a third is commented out). Set the
            number of filters and the filter sizes, then merge them together """
        conv1 = conv_1d(self.cnn_model,
                        nb_filter=number_of_filters,
                        filter_size=filterSize[0],
                        padding=padding,
                        activation=activation_function_convLayer,
                        regularizer=regularizer)
        conv2 = conv_1d(self.cnn_model,
                        nb_filter=number_of_filters,
                        filter_size=filterSize[1],
                        padding=padding,
                        activation=activation_function_convLayer,
                        regularizer=regularizer)
        #conv3 = conv_1d(cnn_model, nb_filter=128, filter_size=5, padding='same',
        #                activation='relu', regularizer='L2')
        self.cnn_model = merge([conv1, conv2], mode='concat', axis=1)

        """ Expand one dimension to fit the max pooling layer """
        self.cnn_model = tf.expand_dims(self.cnn_model, 1)
        self.cnn_model = global_max_pool(self.cnn_model)

        """ Instantiate the dropout layer and specify the dropout parameter """
        self.cnn_model = dropout(self.cnn_model, dropouts)

        """ Instantiate the fully connected layer and the regression layer """
        self.cnn_model = fully_connected(self.cnn_model,
                                         self.number_of_classes,
                                         activation=activation_function_fc)
        self.cnn_model = regression(self.cnn_model,
                                    optimizer=optimizer,
                                    learning_rate=learning_rate,
                                    loss=loss_function,
                                    name='models/' + model_name)

    def train_and_save(self, model_name, tensorboard_verbose=0,
                       tensorboard_dir='/logs/', nb_epochs=5, shuffle=True,
                       show_metric=True):
        """ Instantiate the deep neural network model and start the training """
        self.model = tflearn.DNN(self.cnn_model,
                                 tensorboard_verbose=tensorboard_verbose,
                                 tensorboard_dir=tensorboard_dir)
        self.model.fit(self.X_train, self.Y_train,
                       n_epoch=nb_epochs,
                       validation_set=(self.X_test, self.Y_test),
                       shuffle=shuffle,
                       show_metric=show_metric,
                       batch_size=self.batch_size,
                       run_id=model_name)

        """ Save the model """
        self.model.save('models/' + model_name)

    def load_model(self, model_name, outputDim=300, number_of_filters=60,
                   filterSize=[3, 4], padding='same',
                   activation_function_convLayer='relu', regularizer='L2',
                   dropouts=0.5, activation_function_fc='softmax',
                   optimizer='adam', learning_rate=0.001,
                   loss_function='categorical_crossentropy',
                   tensorboard_verbose=0, tensorboard_dir='/logs/'):
        """ The same values the model was trained with have to be passed in.
            If the model was trained on default values, the parameters are
            passed automatically. """
        self.create_cnn_architecture_two_layers(
            model_name, outputDim, number_of_filters, filterSize, padding,
            activation_function_convLayer, regularizer, dropouts,
            activation_function_fc, optimizer, learning_rate, loss_function)
        self.model = tflearn.DNN(self.cnn_model,
                                 tensorboard_verbose=tensorboard_verbose,
                                 tensorboard_dir=tensorboard_dir)
        self.model.load('models/' + model_name)

    def load_test_dataset(self, filename='testDatasetWithOutNeuTwo',
                          vocab_name='vocabProc'):
        """ Something is wrong with this function. It does not get the same
            result as before when loading in the new data... """

        """ Load the test dataset """
        self.test_x, self.test_y = load_csv('datasets/' + filename,
                                            target_column=1)

        """ Restore the saved vocabulary processor """
        self.vocabProcessor = VocabularyProcessor.restore(vocab_name)

        """ Encode pos, neu and neg to numbers """
        labelEncoder = LabelEncoder()
        labelEncoder.fit(self.test_y)
        self.test_y = labelEncoder.transform(self.test_y)

        """ Change the list of sentences to a list of sequences of words """
        self.test_x = np.array(
            list(self.vocabProcessor.transform([x[0] for x in self.test_x])))

        """ Pad the sequences to fit the longest sentence """
        self.test_x = pad_sequences(self.test_x, maxlen=self.max_words, value=0.)

        """ Convert labels to binary vectors """
        self.test_y = to_categorical(self.test_y,
                                     nb_classes=self.number_of_classes)

    def evaluate_model_performance(self):
        metric_score = self.model.evaluate(self.test_x, self.test_y,
                                           batch_size=self.batch_size)
        return metric_score

    def predict_one_sentence(self, sentence=[['']], vocab_name='vocabProc'):
        """ Load the vocabulary processor """
        self.vocabProcessor = VocabularyProcessor.restore(vocab_name)

        """ Transform the sentence to a matrix of numbers """
        sentence = np.array(
            list(self.vocabProcessor.transform([x[0] for x in sentence])))
        sentence = pad_sequences(
            sentence,
            maxlen=self.vocabProcessor.max_document_length,
            value=0.)

        """ Predict the sentence """
        pred_score = self.model.predict(sentence)
        return pred_score

    def predict_list(self, list_of_sentences=[[''], ['']],
                     vocab_name='vocabProc'):
        """ Load the vocabulary processor """
        self.vocabProcessor = VocabularyProcessor.restore(vocab_name)

        """ Transform the sentences to a matrix of numbers """
        sentences = np.array(
            list(self.vocabProcessor.transform([x[0] for x in list_of_sentences])))
        sentences = pad_sequences(
            sentences,
            maxlen=self.vocabProcessor.max_document_length,
            value=0.)

        """ Predict the sentences """
        pred_score = self.model.predict(sentences)
        return pred_score
labelEncoder.fit(Y)
Y = labelEncoder.transform(Y)

""" Change the list of sentences to a list of sequences of words """
X = np.array(list(vocab.fit_transform([x[0] for x in X])))

""" Split the dataset into a training set and a test set """
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
                                                    test_size=0.10,
                                                    random_state=7)

""" Pad the sequences to fit the longest sentence """
X_train = pad_sequences(X_train, maxlen=max_words, value=0.)
X_test = pad_sequences(X_test, maxlen=max_words, value=0.)

""" Convert labels to binary vectors """
Y_train = to_categorical(Y_train, nb_classes=2)
Y_test = to_categorical(Y_test, nb_classes=2)

vocab.save('vocabProc')

""" Begin the creation of the convolutional model """

""" Define the input shape and create the word embedding """
cnn_model = input_data(shape=[None, max_words], name='input')
cnn_model = tflearn.embedding(cnn_model,
                              input_dim=len(vocab.vocabulary_),
                              output_dim=300)

""" Add the convolutional layers. Set the number of filters and the filter
    sizes, then merge them together """
conv1 = conv_1d(cnn_model,
                nb_filter=60,
                filter_size=3,
                padding='same',
                activation='relu',
                regularizer='L2')
    'Masterpiece',  # 55
]

# Select the title and score label from the data and use them for training
x = data['title']

# Game titles are transformed into lists of 15 numbers (one per word);
# nltk could be used for better transformation and predictions
word_processor = VocabularyProcessor(15)
x = np.array(list(word_processor.fit_transform(x)))

y = []
for label in data['score_phrase']:
    y.append(ratings.index(label))

# Save the trained word model to reuse it in the prediction program
word_processor.save("wordprocessor")

# Count the number of items in each category
def statistics():
    numbersInCategory = [0] * len(ratings)
    for labelNo in y:
        numbersInCategory[labelNo] += 1
    return numbersInCategory

# EXPERIMENT
# Duplicate game titles in rating categories with few examples.
# In my understanding of the math, well-known examples (categories with many
# examples) get a stronger weight and push the others down because of the
# partial derivatives.
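A hedged sketch of the duplication idea described in the EXPERIMENT comment: oversample titles from rating categories with few examples until every category is roughly the size of the largest one. The target-count heuristic is an assumption, not part of the original script.

# Hedged sketch of the oversampling experiment; padding every category up to
# the size of the largest one is an assumed heuristic.
import random

counts = statistics()
target = max(counts)
x_list, y_list = list(x), list(y)
for label_no, count in enumerate(counts):
    if count == 0:
        continue
    members = [i for i, lab in enumerate(y) if lab == label_no]
    for _ in range(target - count):
        i = random.choice(members)      # duplicate a random title from this category
        x_list.append(x[i])
        y_list.append(label_no)
x = np.array(x_list)
y = y_list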