Example #1
import os

import numpy as np
from tflearn.data_utils import VocabularyProcessor  # import assumed (see Example #7)


class DatasetVectorizer:
    def __init__(self, model_dir, raw_sentence_pairs=None, save_vocab=True):
        self.model_dir = model_dir
        os.makedirs(self.model_dir, exist_ok=True)
        if raw_sentence_pairs is None:
            self.restore()
        else:
            raw_sentence_pairs = raw_sentence_pairs.ravel()
            raw_sentence_pairs = [str(x) for x in list(raw_sentence_pairs)]
            self.sentences_lengths = [
                len(str(x).split(' ')) for x in list(raw_sentence_pairs)
            ]
            max_sentence_length = max(self.sentences_lengths)
            self.vocabulary = VocabularyProcessor(max_sentence_length)
            self.vocabulary.fit(raw_sentence_pairs)
            if save_vocab:
                self.vocabulary.save('{}/vocab'.format(self.model_dir))

    @property
    def max_sentence_len(self):
        return self.vocabulary.max_document_length

    @property
    def vocabulary_size(self):
        return len(self.vocabulary.vocabulary_._mapping)

    def restore(self):
        self.vocabulary = VocabularyProcessor.restore('{}/vocab'.format(
            self.model_dir))

    def vectorize(self, sentence):
        return np.array(list(self.vocabulary.transform([sentence])))

    def vectorize_2d(self, raw_sentence_pairs):
        num_instances, num_classes = raw_sentence_pairs.shape
        raw_sentence_pairs = raw_sentence_pairs.ravel()

        for i, v in enumerate(raw_sentence_pairs):
            if v is np.nan:
                print(i, v)

        vectorized_sentence_pairs = np.array(
            list(self.vocabulary.transform(raw_sentence_pairs)))

        vectorized_sentence_pairs = vectorized_sentence_pairs.reshape(
            num_instances, num_classes, self.max_sentence_len)

        vectorized_sentence1 = vectorized_sentence_pairs[:, 0, :]
        vectorized_sentence2 = vectorized_sentence_pairs[:, 1, :]
        return vectorized_sentence1, vectorized_sentence2
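A minimal usage sketch for the class above, assuming a toy two-column NumPy array of sentence pairs and a made-up model directory:

import numpy as np

# Hypothetical 2-column array of sentence pairs (one pair per row).
pairs = np.array([
    ['how old are you', 'what is your age'],
    ['is it raining', 'will it rain today'],
])

vectorizer = DatasetVectorizer('model_dir', raw_sentence_pairs=pairs)
sen1, sen2 = vectorizer.vectorize_2d(pairs)  # each has shape (num_pairs, max_sentence_len)
print(vectorizer.vocabulary_size, sen1.shape, sen2.shape)

# A later run can restore the saved vocabulary instead of refitting.
restored = DatasetVectorizer('model_dir')
print(restored.vectorize('how old are you'))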
Example #2
import os

import numpy as np
from tflearn.data_utils import VocabularyProcessor  # import assumed (see Example #7)


class DatasetVectorizer:
    def __init__(self, raw_sentence, model_dir, save_vocab=True):
        self.model_dir = model_dir
        os.makedirs(model_dir, exist_ok=True)
        raw_sentence = raw_sentence.ravel()
        raw_sentence = [str(x) for x in list(raw_sentence)]
        self.sentence_length = [
            len(str(x).split(' ')) for x in list(raw_sentence)
        ]
        max_sentence_length = max(self.sentence_length)
        self.vocabulary = VocabularyProcessor(max_sentence_length)
        self.vocabulary.fit(raw_sentence)

        if save_vocab:
            self.vocabulary.save('{}/vocab'.format(model_dir))

    @property
    def max_sentence_len(self):
        return self.vocabulary.max_document_length

    @property
    def vocabulary_size(self):
        return len(self.vocabulary.vocabulary_._mapping)

    def restore(self):
        self.vocabulary = VocabularyProcessor.restore('{}/vocab'.format(
            self.model_dir))

    def vectorize(self, sentence):
        return np.array(list(self.vocabulary.transform([sentence])))

    def vectorize_2d(self, raw_sentence):
        #num_instances, num_classes = raw_sentence.shape
        num_instances = raw_sentence.shape[0]
        num_classes = 1
        raw_sentence = raw_sentence.ravel()

        for i, v in enumerate(raw_sentence):
            if v is np.nan:
                print(i, v)

        vectorized_sentence = np.array(
            list(self.vocabulary.transform(raw_sentence)))

        vectorized_sentence = vectorized_sentence.reshape(
            num_instances, num_classes, self.max_sentence_len)

        return vectorized_sentence[:, 0, :]
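This variant vectorizes a single column of sentences instead of pairs. A short sketch under the same toy-data assumptions as above:

import numpy as np

sentences = np.array(['good morning', 'see you tomorrow', 'thanks a lot'])

vectorizer = DatasetVectorizer(sentences, 'model_dir')
vectorized = vectorizer.vectorize_2d(sentences)  # shape (3, max_sentence_len)
print(vectorizer.vocabulary_size, vectorized.shape)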
Example #3
from tflearn.data_utils import VocabularyProcessor  # import assumed (see Example #7)
from jieba import cut  # assumed: the original snippet does not show where `cut` comes from


def chinese_tokenizer(documents):
    for text in documents:
        # Tokenize
        yield list(cut(text))


# Pad or truncate each sequence to length 100; drop words with frequency <= 2
vocab = VocabularyProcessor(100, 2, tokenizer_fn=chinese_tokenizer)

# Build the vocabulary; it cannot be changed once created
vocab.fit(DOCUMENTS)

# Save and reload the vocabulary
vocab.save('vocab.pickle')
vocab = VocabularyProcessor.restore('vocab.pickle')

# Convert text to sequences of word IDs; unknown words and padding use ID 0
id_documents = list(vocab.transform(DOCUMENTS))
for id_document in id_documents:
    print(id_document)
# [2 3 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
#  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
#  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
# [2 3 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
#  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
#  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
# [2 3 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
#  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
#  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
# [2 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
#  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
#  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
Example #4

import os

import numpy as np
from tflearn.data_utils import VocabularyProcessor  # import assumed (see Example #7)
# `log` and `char_tokenizer` are assumed to be provided by the surrounding project.


class DatasetVectorizer:
    
    def __init__(self, model_dir, char_embeddings, raw_sentence_pairs=None, save_vocab=True):
        self.model_dir = model_dir
        os.makedirs(self.model_dir, exist_ok=True)
        if raw_sentence_pairs is None:
            self.restore()
        else:
            raw_sentence_pairs = raw_sentence_pairs.ravel()
            raw_sentence_pairs = [str(x) for x in list(raw_sentence_pairs)]
            if char_embeddings:
                log('Chosen char embeddings.')
                self.sentences_lengths = [len(list(str(x))) for x in list(raw_sentence_pairs)]
            else:
                log('Chosen word embeddings.')
                self.sentences_lengths = [len(str(x).split(' ')) for x in list(raw_sentence_pairs)]
            max_sentence_length = max(self.sentences_lengths)
            log('Maximum sentence length : {}'.format(max_sentence_length))
            
            if char_embeddings:
                log('Processing sentences with char embeddings...')
                self.vocabulary = VocabularyProcessor(
                    max_document_length=max_sentence_length,
                    tokenizer_fn=char_tokenizer,
                )
            else:
                log('Processing sentences with word embeddings...')
                self.vocabulary = VocabularyProcessor(
                    max_document_length=max_sentence_length,
                )
            log('Sentences have been successfully processed.')
            self.vocabulary.fit(raw_sentence_pairs)
            if save_vocab:
                self.vocabulary.save('{}/vocab'.format(self.model_dir))
    
    @property
    def max_sentence_len(self):
        return self.vocabulary.max_document_length
    
    @property
    def vocabulary_size(self):
        return len(self.vocabulary.vocabulary_._mapping)
    
    def restore(self):
        self.vocabulary = VocabularyProcessor.restore('{}/vocab'.format(self.model_dir))
    
    def vectorize(self, sentence):
        return np.array(list(self.vocabulary.transform([sentence])))
    
    def vectorize_2d(self, raw_sentence_pairs):
        num_instances, num_classes = raw_sentence_pairs.shape
        raw_sentence_pairs = raw_sentence_pairs.ravel()
        
        for i, v in enumerate(raw_sentence_pairs):
            if v is np.nan:
                print(i, v)
        
        vectorized_sentence_pairs = np.array(list(self.vocabulary.transform(raw_sentence_pairs)))
        
        vectorized_sentence_pairs = vectorized_sentence_pairs.reshape(num_instances, num_classes,
                                                                      self.max_sentence_len)
        
        vectorized_sentence1 = vectorized_sentence_pairs[:, 0, :]
        vectorized_sentence2 = vectorized_sentence_pairs[:, 1, :]
        return vectorized_sentence1, vectorized_sentence2
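Compared with Example #1, this version switches between character-level and word-level vocabularies. A hedged sketch of how the flag is used (toy pairs and directory names are invented; `log` and `char_tokenizer` are assumed to come from the surrounding project):

import numpy as np

pairs = np.array([
    ['how old are you', 'what is your age'],
    ['is it raining', 'will it rain today'],
])

# Word-level: sentence length is counted in whitespace-separated tokens.
word_vec = DatasetVectorizer('model_word', char_embeddings=False, raw_sentence_pairs=pairs)

# Character-level: sentence length is counted in characters and char_tokenizer is used.
char_vec = DatasetVectorizer('model_char', char_embeddings=True, raw_sentence_pairs=pairs)

print(word_vec.max_sentence_len, char_vec.max_sentence_len)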
Example #5

import numpy as np
import tensorflow as tf
import tflearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tflearn.data_utils import (VocabularyProcessor, load_csv, pad_sequences,
                                to_categorical)
from tflearn.layers.conv import conv_1d, global_max_pool
from tflearn.layers.core import dropout, fully_connected, input_data
from tflearn.layers.estimator import regression
from tflearn.layers.merge_ops import merge


class CNN(object):
    def __init__(self, batch_size=64):
        self.batch_size = batch_size
        self.number_of_classes = 2
        self.X_train = []
        self.X_test = []
        self.Y_train = []
        self.Y_test = []
        self.max_words = None
        self.vocabProcessor = None
        self.cnn_model = None
        self.model = None
        self.test_x = []
        self.test_y = []

    def load_dataset_training(self,
                              vocab_name,
                              filename='datasetWithoutNeutral'):
        """ Load the dataset """
        X, Y = load_csv('datasets/' + filename,
                        target_column=2,
                        columns_to_ignore=[0])
        """ Count max words from the longest sentence """
        self.max_words = max([len(x[0].split(" ")) for x in X])
        """ Get vocabulare size from longest sentence """
        self.vocabProcessor = VocabularyProcessor(self.max_words)
        """ Encode pos, neu and neg to numbers """
        labelEncoder = LabelEncoder()
        labelEncoder.fit(Y)
        Y = labelEncoder.transform(Y)
        """ Change the list of sentences to a list of sequence of words """
        X = np.array(list(self.vocabProcessor.fit_transform([x[0]
                                                             for x in X])))
        """ Split the datasets to training set and test test """
        self.X_train, self.X_test, self.Y_train, self.Y_test = train_test_split(
            X, Y, test_size=0.10, random_state=7)
        """ Pad the sequences to fit the longest sentence """
        self.X_train = pad_sequences(self.X_train,
                                     maxlen=self.max_words,
                                     value=0.)
        self.X_test = pad_sequences(self.X_test,
                                    maxlen=self.max_words,
                                    value=0.)
        """ Convert labels to binary vector """
        self.Y_train = to_categorical(self.Y_train,
                                      nb_classes=self.number_of_classes)
        self.Y_test = to_categorical(self.Y_test,
                                     nb_classes=self.number_of_classes)
        self.vocabProcessor.save(vocab_name)

    def create_cnn_architecture_two_layers(
            self,
            model_name,
            outputDim=300,
            number_of_filters=60,
            filterSize=[3, 4],
            padding='same',
            activation_function_convLayer='relu',
            regularizer='L2',
            dropouts=0.5,
            activation_function_fc='softmax',
            optimizer='adam',
            learning_rate=0.001,
            loss_function='categorical_crossentropy'):
        if len(filterSize) == 0:
            filterSize = [3, 4]
        """ Define input shape and create word embedding """
        self.cnn_model = input_data(shape=[None, self.max_words], name='input')
        self.cnn_model = tflearn.embedding(
            self.cnn_model,
            input_dim=len(self.vocabProcessor.vocabulary_),
            output_dim=outputDim)
        """ Add three/two convolutional layer. Set number of filters and filter sizes and then merge together """
        conv1 = conv_1d(self.cnn_model,
                        nb_filter=number_of_filters,
                        filter_size=filterSize[0],
                        padding=padding,
                        activation=activation_function_convLayer,
                        regularizer=regularizer)
        conv2 = conv_1d(self.cnn_model,
                        nb_filter=number_of_filters,
                        filter_size=filterSize[1],
                        padding=padding,
                        activation=activation_function_convLayer,
                        regularizer=regularizer)
        #conv3 = conv_1d(cnn_model, nb_filter = 128,  filter_size = 5, padding = 'same',
        #                 activation = 'relu', regularizer = 'L2')
        self.cnn_model = merge([conv1, conv2], mode='concat', axis=1)
        """ Expand one dimension to fit the max_pooling layer """
        self.cnn_model = tf.expand_dims(self.cnn_model, 1)
        self.cnn_model = global_max_pool(self.cnn_model)
        """ Instantiate dropout layer and specify dropout parameter """
        self.cnn_model = dropout(self.cnn_model, dropouts)
        """ Instantiate fully connected layer and regression layer. """
        self.cnn_model = fully_connected(self.cnn_model,
                                         self.number_of_classes,
                                         activation=activation_function_fc)
        self.cnn_model = regression(self.cnn_model,
                                    optimizer=optimizer,
                                    learning_rate=learning_rate,
                                    loss=loss_function,
                                    name='models/' + model_name)

    def train_and_save(self,
                       model_name,
                       tensorboard_verbose=0,
                       tensorboard_dir='/logs/',
                       nb_epochs=5,
                       shuffle=True,
                       show_metric=True):
        """ Instantiate Deep neural network model and start the training """
        self.model = tflearn.DNN(self.cnn_model,
                                 tensorboard_verbose=tensorboard_verbose,
                                 tensorboard_dir=tensorboard_dir)
        self.model.fit(self.X_train,
                       self.Y_train,
                       n_epoch=nb_epochs,
                       validation_set=(self.X_test, self.Y_test),
                       shuffle=shuffle,
                       show_metric=show_metric,
                       batch_size=self.batch_size,
                       run_id=model_name)
        """ Save the model """
        self.model.save('models/' + model_name)

    def load_model(self,
                   model_name,
                   outputDim=300,
                   number_of_filters=60,
                   filterSize=[3, 4],
                   padding='same',
                   activation_function_convLayer='relu',
                   regularizer='L2',
                   dropouts=0.5,
                   activation_function_fc='softmax',
                   optimizer='adam',
                   learning_rate=0.001,
                   loss_function='categorical_crossentropy',
                   tensorboard_verbose=0,
                   tensorboard_dir='/logs/'):
        """
            Has to pass the same values that the models were trained with. If the
            model was trained on default values, the parameters will pass it automatically.
        """

        self.create_cnn_architecture_two_layers(
            model_name, outputDim, number_of_filters, filterSize, padding,
            activation_function_convLayer, regularizer, dropouts,
            activation_function_fc, optimizer, learning_rate, loss_function)

        self.model = tflearn.DNN(self.cnn_model,
                                 tensorboard_verbose=tensorboard_verbose,
                                 tensorboard_dir=tensorboard_dir)
        self.model.load('models/' + model_name)

    def load_test_dataset(self,
                          filename='testDatasetWithOutNeuTwo',
                          vocab_name='vocabProc'):
        """
            Something is wrong with this function. Does not get the same result
            as before when loading in the new data...
        """
        """ Load test dataset """
        self.test_x, self.test_y = load_csv('datasets/' + filename,
                                            target_column=1)
        """ Get restored vocabulary processor """
        self.vocabProcessor = VocabularyProcessor(self.max_words)
        self.vocabProcessor.restore(vocab_name)
        """ Encode pos, neu and neg to numbers  """
        labelEncoder = LabelEncoder()
        labelEncoder.fit(self.test_y)
        self.test_y = labelEncoder.transform(self.test_y)
        """ Change the list of sentences to a list of sequence of words """
        self.test_x = np.array(
            list(self.vocabProcessor.transform([x[0] for x in self.test_x])))
        """ Pad the sequences to fit the longest sentence """
        self.test_x = pad_sequences(self.test_x,
                                    maxlen=self.max_words,
                                    value=0.)
        """ Convert labels to binary vector """
        self.test_y = to_categorical(self.test_y,
                                     nb_classes=self.number_of_classes)

    def evaluate_model_performance(self):
        metrix_score = self.model.evaluate(self.test_x,
                                           self.test_y,
                                           batch_size=self.batch_size)
        return metrix_score

    def predict_one_sentence(self, sentence=[['']], vocab_name='vocabProc'):
        """ Load vocabulary processor """
        self.vocabProcessor = VocabularyProcessor(self.max_words)
        self.vocabProcessor.restore(vocab_name)
        """ Transorm sentence to matrix of numbers """
        sentence = np.array(
            list(self.vocabProcessor.transform([x[0] for x in sentence])))
        sentence = pad_sequences(
            sentence,
            max_len=self.vocabProcessor.max_document_length,
            value=0.)
        """ Predict sentence """
        pred_score = self.model.predict(sentence)
        return pred_score

    def predict_list(self,
                     list_of_sentences=[[''], ['']],
                     vocab_name='vocabProc'):
        """ Load vocabulary processor """
        self.vocabProcessor = VocabularyProcessor(self.max_words)
        self.vocabProcessor.restore(vocab_name)
        """ Transorm sentence to matrix of numbers """
        sentence = np.array(
            list(self.vocabProcessor.transform([x[0] for x in sentece])))
        sentence = pad_sequences(
            sentence,
            max_len=self.vocabProcessor.max_document_length,
            value=0.)
        """ Predict sentence """
        pred_score = self.model.predict(list_of_sentences)
        return pred_score
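A sketch of how the class above might be driven end to end. The model name is a placeholder; the dataset and vocabulary file names are the defaults used elsewhere in this example, and the CSV layout must match what `load_dataset_training` expects (label in column 2, column 0 ignored):

cnn = CNN(batch_size=64)
cnn.load_dataset_training(vocab_name='vocabProc', filename='datasetWithoutNeutral')
cnn.create_cnn_architecture_two_layers(model_name='sentiment_cnn')
cnn.train_and_save(model_name='sentiment_cnn', nb_epochs=5)

# Score held-out data with the freshly trained model.
cnn.load_test_dataset(filename='testDatasetWithOutNeuTwo', vocab_name='vocabProc')
print(cnn.evaluate_model_performance())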
Example #6
#test_x, test_y = load_csv('testDatasetWithNeuOne', target_column = 1)
#test_x, test_y = load_csv('testDatasetWithNeuTwo', target_column = 1)
#test_x, test_y = load_csv('testDatasetWithOutNeuOne', target_column = 1)
test_x, test_y = load_csv('testDatasetWithOutNeuTwo', target_column=1)
""" Count max words from the longest sentence """
#max_words = max([len(x[0].split(" ")) for x in test_x])
max_words = 2132
""" Get vocabulare size from longest sentence """
vocab = VocabularyProcessor(max_words)
vocab = vocab.restore('vocabProc')
""" Encode pos, neu and neg to numbers """
labelEncoder = LabelEncoder()
labelEncoder.fit(test_y)
test_y = labelEncoder.transform(test_y)
""" Change the list of sentences to a list of sequence of words """
test_x = np.array(list(vocab.transform([x[0] for x in test_x])))
""" Pad the sequences to fit the longest sentence """
test_x = pad_sequences(test_x, maxlen=max_words, value=0.)
""" Convert labels to binary vector """
test_y = to_categorical(test_y, nb_classes=2)
#test_y= to_categorical(test_y, nb_classes = 3)
""" Create the same neural network as the one that is going to be loaded. """
cnn_model = input_data(shape=[None, max_words], name='input')
cnn_model = tflearn.embedding(cnn_model,
                              input_dim=len(vocab.vocabulary_),
                              output_dim=300)
""" Add three/two convolutional layer. Set number of filters and filter sizes and then merge together """
conv1 = conv_1d(cnn_model,
                nb_filter=60,
                filter_size=3,
                padding='same',
                activation='relu',
                regularizer='L2')
Example #7
from tflearn.data_utils import VocabularyProcessor

vocab = {'hello':3, '.':5, 'world':20, '/' : 10}
sentences = ['hello world . / hello', 'hello']

vocab_processor = VocabularyProcessor(max_document_length=6, vocabulary=vocab)
encoded = list(vocab_processor.transform(sentences))
print(encoded)
Example #8
net = tflearn.regression(net) # adam, 0.001

model = tflearn.DNN(net, tensorboard_verbose=0)
model.load("model.tfl")

# use labels for output
ratings = [
    'Unbearable',  # 72
    'Disaster', # 4
    'Awful', # 664
    'Painful', # 340
    'Bad', # 1269
    'Mediocre', # 1959
    'Okay', # 2945
    'Good', # 4741
    'Great', # 4773
    'Amazing', # 1804
    'Masterpiece', # 55
]

# make a prediction for each passed-in argument, and find the label the model is most confident in
for title in games_to_predict:

    predicted = model.predict(list(word_processor.transform([title])))[0]
    max_value = max(predicted)
    max_index = predicted.index(max_value)
    # Code for debugging
    # print(predicted, max_value, max_index)

    print("I think "+title+" is a "+ratings[max_index].lower()+" game!")