예제 #1
0
def build_vocabulary(inFile, dtype, vocabFile, max_length=300, min_frequency=5):
    """Fit a vocabulary on a corpus and save it to ``vocabFile``.

    Args:
        inFile: Corpus location, passed through to ``MySentences``.
        dtype: Passed through to ``MySentences`` unchanged.
        vocabFile: Path the fitted ``VocabularyProcessor`` is saved to.
        max_length: Maximum document length handed to the processor.
            Defaults to 300, the value that used to be hard-coded.
        min_frequency: Minimum token frequency handed to the processor.
            Defaults to 5, the value that used to be hard-coded.
    """
    # Read the text (per the original note, it is already word-segmented).
    doc = MySentences(inFile, dtype, 'get_content')
    # Build the token -> index mapping from the corpus and persist it.
    processor = VocabularyProcessor(max_length, min_frequency=min_frequency)
    processor.fit(doc)
    processor.save(vocabFile)
def make_vocab_processor(name, text, max_length, min_frequency):
    """Fit a vocabulary processor on ``text`` and optionally save it.

    Args:
        name: Path to save the fitted processor to, or ``None`` to skip saving.
        text: Documents the vocabulary is fitted on.
        max_length: Maximum document length passed to ``VocabularyProcessor``.
        min_frequency: Minimum token frequency passed to ``VocabularyProcessor``.

    Returns:
        Whatever ``VocabularyProcessor.fit`` returns for ``text``.
    """
    print('Making vocabulary model...')
    vp = VocabularyProcessor(max_length, min_frequency=min_frequency)
    vp = vp.fit(text)
    # Identity check: ``== None`` can misfire on objects overriding __eq__.
    if name is not None:
        print('Saving vocabulary model to {}'.format(name))
        vp.save(name)
    return vp
예제 #3
0
class DatasetVectorizer:
    """Turn sentence pairs into fixed-length id arrays via a VocabularyProcessor.

    The vocabulary is either fitted on ``raw_sentence_pairs`` or, when none
    are given, restored from ``<model_dir>/vocab``.
    """

    def __init__(self, model_dir, raw_sentence_pairs=None, save_vocab=True):
        """Fit a new vocabulary or restore a saved one.

        Args:
            model_dir: Directory holding the ``vocab`` file; created if missing.
            raw_sentence_pairs: Array of sentences (anything with ``ravel()``),
                or ``None`` to restore the vocabulary from ``model_dir``.
            save_vocab: When fitting, also save the vocabulary to ``model_dir``.
        """
        self.model_dir = model_dir
        os.makedirs(self.model_dir, exist_ok=True)
        if raw_sentence_pairs is None:
            self.restore()
            return

        sentences = [str(x) for x in raw_sentence_pairs.ravel()]
        # Sentence length is the number of space-separated tokens.
        self.sentences_lengths = [len(s.split(' ')) for s in sentences]
        self.vocabulary = VocabularyProcessor(max(self.sentences_lengths))
        self.vocabulary.fit(sentences)
        if save_vocab:
            self.vocabulary.save('{}/vocab'.format(self.model_dir))

    @property
    def max_sentence_len(self):
        """Document length the vocabulary pads or truncates to."""
        return self.vocabulary.max_document_length

    @property
    def vocabulary_size(self):
        """Number of entries in the fitted vocabulary mapping."""
        return len(self.vocabulary.vocabulary_._mapping)

    def restore(self):
        """Load the vocabulary previously saved under ``model_dir``."""
        self.vocabulary = VocabularyProcessor.restore('{}/vocab'.format(
            self.model_dir))

    def vectorize(self, sentence):
        """Return the id array for a single sentence, shaped (1, max_len)."""
        return np.array(list(self.vocabulary.transform([sentence])))

    @staticmethod
    def _is_nan(value):
        """Return True for any float NaN, not only the ``np.nan`` singleton."""
        return isinstance(value, (float, np.floating)) and np.isnan(value)

    def vectorize_2d(self, raw_sentence_pairs):
        """Vectorize a 2-column array of sentence pairs.

        Args:
            raw_sentence_pairs: 2-D array; column 0 holds the first sentence
                of each pair and column 1 the second.

        Returns:
            Tuple ``(first, second)`` of id arrays, each shaped
            ``(num_instances, max_sentence_len)``.
        """
        num_instances, num_classes = raw_sentence_pairs.shape
        flat = raw_sentence_pairs.ravel()

        # Diagnostic only: report the positions of missing (NaN) sentences.
        for i, v in enumerate(flat):
            if self._is_nan(v):
                print(i, v)

        vectorized = np.array(list(self.vocabulary.transform(flat)))
        vectorized = vectorized.reshape(
            num_instances, num_classes, self.max_sentence_len)

        return vectorized[:, 0, :], vectorized[:, 1, :]
예제 #4
0
class DatasetVectorizer:
    """Turn single sentences into fixed-length id arrays via a VocabularyProcessor."""

    def __init__(self, raw_sentence, model_dir, save_vocab=True):
        """Fit a vocabulary on ``raw_sentence`` and optionally save it.

        Args:
            raw_sentence: Array of sentences (anything with ``ravel()``).
            model_dir: Directory holding the ``vocab`` file; created if missing.
            save_vocab: Also save the fitted vocabulary to ``model_dir``.
        """
        # Kept on the instance because restore() reads it.
        self.model_dir = model_dir
        os.makedirs(model_dir, exist_ok=True)

        sentences = [str(x) for x in raw_sentence.ravel()]
        # Sentence length is the number of space-separated tokens.
        self.sentence_length = [len(s.split(' ')) for s in sentences]
        self.vocabulary = VocabularyProcessor(max(self.sentence_length))
        # Fit before saving; otherwise the saved vocabulary has no entries.
        self.vocabulary.fit(sentences)

        if save_vocab:
            self.vocabulary.save('{}/vocab'.format(model_dir))

    @property
    def max_sentence_len(self):
        """Document length the vocabulary pads or truncates to."""
        return self.vocabulary.max_document_length

    @property
    def vocabulary_size(self):
        """Number of entries in the fitted vocabulary mapping."""
        return len(self.vocabulary.vocabulary_._mapping)

    def restore(self):
        """Load the vocabulary previously saved under ``model_dir``."""
        self.vocabulary = VocabularyProcessor.restore('{}/vocab'.format(
            self.model_dir))

    def vectorize(self, sentence):
        """Return the id array for a single sentence, shaped (1, max_len)."""
        return np.array(list(self.vocabulary.transform([sentence])))

    @staticmethod
    def _is_nan(value):
        """Return True for any float NaN, not only the ``np.nan`` singleton."""
        return isinstance(value, (float, np.floating)) and np.isnan(value)

    def vectorize_2d(self, raw_sentence):
        """Vectorize an array holding one sentence per row.

        Args:
            raw_sentence: Array whose first dimension indexes the sentences.

        Returns:
            Id array shaped ``(num_instances, max_sentence_len)``.
        """
        num_instances = raw_sentence.shape[0]
        flat = raw_sentence.ravel()

        # Diagnostic only: report the positions of missing (NaN) sentences.
        for i, v in enumerate(flat):
            if self._is_nan(v):
                print(i, v)

        vectorized = np.array(list(self.vocabulary.transform(flat)))
        return vectorized.reshape(num_instances, self.max_sentence_len)
예제 #5
0
        # 繁体转简体
        text = HanziConv.toSimplified(document)
        # 英文转小写
        text = text.lower()
        # 分词
        yield list(cut(text))


# Pad or truncate every sequence to length 100; drop words with frequency <= 2.
vocab = VocabularyProcessor(100, 2, tokenizer_fn=chinese_tokenizer)

# Build the vocabulary; it cannot be changed once it has been built.
vocab.fit(DOCUMENTS)

# Save the vocabulary and load it back.
vocab.save('vocab.pickle')
vocab = VocabularyProcessor.restore('vocab.pickle')

# Convert the text to word-id sequences; unknown words and padding use id 0.
id_documents = list(vocab.transform(DOCUMENTS))
for id_document in id_documents:
    print(id_document)
# [2 3 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
#  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
#  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
# [2 3 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
#  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
#  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
# [2 3 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
#  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
#  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
class DatasetVectorizer:
    """Turn sentence pairs into fixed-length id arrays, by word or by character.

    The vocabulary is either fitted on ``raw_sentence_pairs`` or, when none
    are given, restored from ``<model_dir>/vocab``.
    """

    def __init__(self, model_dir, char_embeddings, raw_sentence_pairs=None, save_vocab=True):
        """Fit a new vocabulary or restore a saved one.

        Args:
            model_dir: Directory holding the ``vocab`` file; created if missing.
            char_embeddings: Truthy to tokenize with ``char_tokenizer`` and
                measure length in characters; falsy to use the processor's
                default tokenizer and measure length in space-separated words.
            raw_sentence_pairs: Array of sentences (anything with ``ravel()``),
                or ``None`` to restore the vocabulary from ``model_dir``.
            save_vocab: When fitting, also save the vocabulary to ``model_dir``.
        """
        self.model_dir = model_dir
        os.makedirs(self.model_dir, exist_ok=True)
        if raw_sentence_pairs is None:
            self.restore()
            return

        sentences = [str(x) for x in raw_sentence_pairs.ravel()]
        if char_embeddings:
            log('Chosen char embeddings.')
            self.sentences_lengths = [len(s) for s in sentences]
        else:
            log('Chosen word embeddings.')
            self.sentences_lengths = [len(s.split(' ')) for s in sentences]
        max_sentence_length = max(self.sentences_lengths)
        log('Maximum sentence length : {}'.format(max_sentence_length))

        if char_embeddings:
            log('Processing sentences with char embeddings...')
            self.vocabulary = VocabularyProcessor(
                max_document_length=max_sentence_length,
                tokenizer_fn=char_tokenizer,
            )
        else:
            log('Processing sentences with word embeddings...')
            self.vocabulary = VocabularyProcessor(
                max_document_length=max_sentence_length,
            )
        self.vocabulary.fit(sentences)
        # Report success only after the fit has actually run.
        log('Sentences have been successfully processed.')
        if save_vocab:
            self.vocabulary.save('{}/vocab'.format(self.model_dir))

    @property
    def max_sentence_len(self):
        """Document length the vocabulary pads or truncates to."""
        return self.vocabulary.max_document_length

    @property
    def vocabulary_size(self):
        """Number of entries in the fitted vocabulary mapping."""
        return len(self.vocabulary.vocabulary_._mapping)

    def restore(self):
        """Load the vocabulary previously saved under ``model_dir``."""
        self.vocabulary = VocabularyProcessor.restore('{}/vocab'.format(self.model_dir))

    def vectorize(self, sentence):
        """Return the id array for a single sentence, shaped (1, max_len)."""
        return np.array(list(self.vocabulary.transform([sentence])))

    @staticmethod
    def _is_nan(value):
        """Return True for any float NaN, not only the ``np.nan`` singleton."""
        return isinstance(value, (float, np.floating)) and np.isnan(value)

    def vectorize_2d(self, raw_sentence_pairs):
        """Vectorize a 2-column array of sentence pairs.

        Args:
            raw_sentence_pairs: 2-D array; column 0 holds the first sentence
                of each pair and column 1 the second.

        Returns:
            Tuple ``(first, second)`` of id arrays, each shaped
            ``(num_instances, max_sentence_len)``.
        """
        num_instances, num_classes = raw_sentence_pairs.shape
        flat = raw_sentence_pairs.ravel()

        # Diagnostic only: report the positions of missing (NaN) sentences.
        for i, v in enumerate(flat):
            if self._is_nan(v):
                print(i, v)

        vectorized = np.array(list(self.vocabulary.transform(flat)))
        vectorized = vectorized.reshape(num_instances, num_classes,
                                        self.max_sentence_len)

        return vectorized[:, 0, :], vectorized[:, 1, :]
class CNN(object):
    """Two-branch 1-D convolutional text classifier built with tflearn.

    Typical flow: ``load_dataset_training`` -> ``create_cnn_architecture_two_layers``
    -> ``train_and_save``; or ``load_model`` followed by the evaluation and
    prediction helpers.
    """

    def __init__(self, batch_size=64):
        """Create an empty classifier; data and model are attached later."""
        self.batch_size = batch_size
        self.number_of_classes = 2
        self.X_train = []
        self.X_test = []
        self.Y_train = []
        self.Y_test = []
        self.max_words = None
        self.vocabProcessor = None
        self.cnn_model = None
        self.model = None
        self.test_x = []
        self.test_y = []

    def _restore_vocab_processor(self, vocab_name):
        """Load the saved vocabulary processor into ``self.vocabProcessor``."""
        # restore() hands back the loaded processor (it is used that way as a
        # constructor elsewhere), so the result must be kept. Calling it on a
        # fresh instance and dropping the result leaves an unfitted processor.
        self.vocabProcessor = VocabularyProcessor.restore(vocab_name)

    def _vectorize_for_prediction(self, rows):
        """Map ``rows`` (each row's first item is the text) to padded id arrays."""
        ids = np.array(
            list(self.vocabProcessor.transform([row[0] for row in rows])))
        return pad_sequences(ids,
                             maxlen=self.vocabProcessor.max_document_length,
                             value=0.)

    def load_dataset_training(self,
                              vocab_name,
                              filename='datasetWithoutNeutral'):
        """Load the training CSV, vectorize it, split it and save the vocabulary.

        Args:
            vocab_name: Path the fitted vocabulary processor is saved to.
            filename: File name under ``datasets/`` to load.
        """
        # Load the dataset.
        X, Y = load_csv('datasets/' + filename,
                        target_column=2,
                        columns_to_ignore=[0])
        # Word count of the longest sentence (split on single spaces).
        self.max_words = max(len(x[0].split(" ")) for x in X)
        # The vocabulary processor uses that count as its document length.
        self.vocabProcessor = VocabularyProcessor(self.max_words)
        # Encode the string labels as integers.
        labelEncoder = LabelEncoder()
        labelEncoder.fit(Y)
        Y = labelEncoder.transform(Y)
        # Turn the list of sentences into a list of word-id sequences.
        X = np.array(list(self.vocabProcessor.fit_transform([x[0]
                                                             for x in X])))
        # Split into a training set and a test set.
        self.X_train, self.X_test, self.Y_train, self.Y_test = train_test_split(
            X, Y, test_size=0.10, random_state=7)
        # Pad the sequences to the length of the longest sentence.
        self.X_train = pad_sequences(self.X_train,
                                     maxlen=self.max_words,
                                     value=0.)
        self.X_test = pad_sequences(self.X_test,
                                    maxlen=self.max_words,
                                    value=0.)
        # Convert the labels to binary vectors.
        self.Y_train = to_categorical(self.Y_train,
                                      nb_classes=self.number_of_classes)
        self.Y_test = to_categorical(self.Y_test,
                                     nb_classes=self.number_of_classes)
        self.vocabProcessor.save(vocab_name)

    def create_cnn_architecture_two_layers(
            self,
            model_name,
            outputDim=300,
            number_of_filters=60,
            filterSize=[3, 4],
            padding='same',
            activation_function_convLayer='relu',
            regularizer='L2',
            dropouts=0.5,
            activation_function_fc='softmax',
            optimizer='adam',
            learning_rate=0.001,
            loss_function='categorical_crossentropy'):
        """Build the network graph and store it in ``self.cnn_model``.

        Reads ``self.max_words`` and ``self.vocabProcessor``, so both must be
        set before this is called.

        Args:
            model_name: Used to name the regression layer (``models/<name>``).
            outputDim: Embedding dimension.
            number_of_filters: Filters per convolutional branch.
            filterSize: Two filter widths, one per branch; an empty list falls
                back to ``[3, 4]``. The default list is never mutated.
            padding: Padding mode for both convolutions.
            activation_function_convLayer: Activation of both convolutions.
            regularizer: Regularizer of both convolutions.
            dropouts: Value passed to the dropout layer.
            activation_function_fc: Activation of the fully connected layer.
            optimizer: Optimizer of the regression layer.
            learning_rate: Learning rate of the regression layer.
            loss_function: Loss of the regression layer.
        """
        if not filterSize:
            filterSize = [3, 4]
        # Define the input shape and create the word embedding.
        self.cnn_model = input_data(shape=[None, self.max_words], name='input')
        self.cnn_model = tflearn.embedding(
            self.cnn_model,
            input_dim=len(self.vocabProcessor.vocabulary_),
            output_dim=outputDim)
        # Two parallel convolutions over the embedding, merged afterwards.
        conv1 = conv_1d(self.cnn_model,
                        nb_filter=number_of_filters,
                        filter_size=filterSize[0],
                        padding=padding,
                        activation=activation_function_convLayer,
                        regularizer=regularizer)
        conv2 = conv_1d(self.cnn_model,
                        nb_filter=number_of_filters,
                        filter_size=filterSize[1],
                        padding=padding,
                        activation=activation_function_convLayer,
                        regularizer=regularizer)
        self.cnn_model = merge([conv1, conv2], mode='concat', axis=1)
        # Add one dimension so the tensor fits the max-pooling layer.
        self.cnn_model = tf.expand_dims(self.cnn_model, 1)
        self.cnn_model = global_max_pool(self.cnn_model)
        # Dropout layer.
        self.cnn_model = dropout(self.cnn_model, dropouts)
        # Fully connected layer followed by the regression layer.
        self.cnn_model = fully_connected(self.cnn_model,
                                         self.number_of_classes,
                                         activation=activation_function_fc)
        self.cnn_model = regression(self.cnn_model,
                                    optimizer=optimizer,
                                    learning_rate=learning_rate,
                                    loss=loss_function,
                                    name='models/' + model_name)

    def train_and_save(self,
                       model_name,
                       tensorboard_verbose=0,
                       tensorboard_dir='/logs/',
                       nb_epochs=5,
                       shuffle=True,
                       show_metric=True):
        """Train on the loaded training split and save to ``models/<model_name>``.

        Args:
            model_name: Run id for training and file name for the saved model.
            tensorboard_verbose: Passed to ``tflearn.DNN``.
            tensorboard_dir: Passed to ``tflearn.DNN``.
            nb_epochs: Number of training epochs.
            shuffle: Passed to ``fit``.
            show_metric: Passed to ``fit``.
        """
        self.model = tflearn.DNN(self.cnn_model,
                                 tensorboard_verbose=tensorboard_verbose,
                                 tensorboard_dir=tensorboard_dir)
        self.model.fit(self.X_train,
                       self.Y_train,
                       n_epoch=nb_epochs,
                       validation_set=(self.X_test, self.Y_test),
                       shuffle=shuffle,
                       show_metric=show_metric,
                       batch_size=self.batch_size,
                       run_id=model_name)
        self.model.save('models/' + model_name)

    def load_model(self,
                   model_name,
                   outputDim=300,
                   number_of_filters=60,
                   filterSize=[3, 4],
                   padding='same',
                   activation_function_convLayer='relu',
                   regularizer='L2',
                   dropouts=0.5,
                   activation_function_fc='softmax',
                   optimizer='adam',
                   learning_rate=0.001,
                   loss_function='categorical_crossentropy',
                   tensorboard_verbose=0,
                   tensorboard_dir='/logs/'):
        """Rebuild the architecture and load weights from ``models/<model_name>``.

        The architecture arguments must be the values the model was trained
        with; the defaults match the defaults of
        ``create_cnn_architecture_two_layers``.
        """
        self.create_cnn_architecture_two_layers(
            model_name, outputDim, number_of_filters, filterSize, padding,
            activation_function_convLayer, regularizer, dropouts,
            activation_function_fc, optimizer, learning_rate, loss_function)

        self.model = tflearn.DNN(self.cnn_model,
                                 tensorboard_verbose=tensorboard_verbose,
                                 tensorboard_dir=tensorboard_dir)
        self.model.load('models/' + model_name)

    def load_test_dataset(self,
                          filename='testDatasetWithOutNeuTwo',
                          vocab_name='vocabProc'):
        """Load and vectorize a test CSV into ``self.test_x`` / ``self.test_y``.

        Args:
            filename: File name under ``datasets/`` to load.
            vocab_name: Path of the saved vocabulary processor to restore.
        """
        self.test_x, self.test_y = load_csv('datasets/' + filename,
                                            target_column=1)
        # Use the vocabulary fitted at training time.
        self._restore_vocab_processor(vocab_name)
        # Encode the string labels as integers.
        # NOTE(review): the encoder is re-fitted on the test labels; confirm
        # this yields the same label -> integer mapping as in training.
        labelEncoder = LabelEncoder()
        labelEncoder.fit(self.test_y)
        self.test_y = labelEncoder.transform(self.test_y)
        # Turn the list of sentences into a list of word-id sequences.
        self.test_x = np.array(
            list(self.vocabProcessor.transform([x[0] for x in self.test_x])))
        # Pad the sequences to the length of the longest training sentence.
        self.test_x = pad_sequences(self.test_x,
                                    maxlen=self.max_words,
                                    value=0.)
        # Convert the labels to binary vectors.
        self.test_y = to_categorical(self.test_y,
                                     nb_classes=self.number_of_classes)

    def evaluate_model_performance(self):
        """Return ``self.model.evaluate`` on the loaded test set."""
        metrix_score = self.model.evaluate(self.test_x,
                                           self.test_y,
                                           batch_size=self.batch_size)
        return metrix_score

    def predict_one_sentence(self, sentence=[['']], vocab_name='vocabProc'):
        """Predict class scores for ``sentence``.

        Args:
            sentence: List of rows whose first item is the sentence text.
                The default list is never mutated.
            vocab_name: Path of the saved vocabulary processor to restore.

        Returns:
            The output of ``self.model.predict`` on the vectorized input.
        """
        self._restore_vocab_processor(vocab_name)
        return self.model.predict(self._vectorize_for_prediction(sentence))

    def predict_list(self,
                     list_of_sentences=[[''], ['']],
                     vocab_name='vocabProc'):
        """Predict class scores for every row of ``list_of_sentences``.

        Args:
            list_of_sentences: List of rows whose first item is the sentence
                text. The default list is never mutated.
            vocab_name: Path of the saved vocabulary processor to restore.

        Returns:
            The output of ``self.model.predict`` on the vectorized input.
        """
        self._restore_vocab_processor(vocab_name)
        # Predict on the vectorized rows, not on the raw strings.
        return self.model.predict(
            self._vectorize_for_prediction(list_of_sentences))
예제 #8
0
# Encode the labels as integers.
labelEncoder.fit(Y)
Y = labelEncoder.transform(Y)

# Turn the list of sentences into a list of word-id sequences.
X = np.array(list(vocab.fit_transform([row[0] for row in X])))

# Split into a training set and a test set.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.10, random_state=7)

# Pad the sequences to the length of the longest sentence.
X_train = pad_sequences(X_train, maxlen=max_words, value=0.)
X_test = pad_sequences(X_test, maxlen=max_words, value=0.)

# Convert the labels to binary vectors.
Y_train = to_categorical(Y_train, nb_classes=2)
Y_test = to_categorical(Y_test, nb_classes=2)
vocab.save('vocabProc')

# --- Convolutional model ---------------------------------------------------

# Define the input shape and create the word embedding.
cnn_model = input_data(shape=[None, max_words], name='input')
cnn_model = tflearn.embedding(
    cnn_model, input_dim=len(vocab.vocabulary_), output_dim=300)

# First convolutional branch (60 filters of width 3).
conv1 = conv_1d(
    cnn_model,
    nb_filter=60,
    filter_size=3,
    padding='same',
    activation='relu',
    regularizer='L2',
)
예제 #9
0
    'Masterpiece', # 55
]

# Use the game title as the feature and the score phrase as the training label.
# Each title is transformed into a list of 15 numbers (one number per word).
# NLTK could be used for better transformation and predictions.
word_processor = VocabularyProcessor(15)
x = np.array(list(word_processor.fit_transform(data['title'])))

# Encode each score phrase as its position in `ratings`.
y = [ratings.index(label) for label in data['score_phrase']]

# Save the fitted word model so the predict program can reuse it.
word_processor.save("wordprocessor")



# Count how many items fall into each rating category.
def statistics():
    """Return a list with one item count per entry of ``ratings``.

    Position ``i`` holds the number of values in the module-level ``y``
    that are equal to ``i``.
    """
    histogram = [0 for _ in range(len(ratings))]
    for category_index in y:
        histogram[category_index] = histogram[category_index] + 1
    return histogram


# EXPERIMENT
# duplicate game titles in rating categories with few examples
# in my understanding of the math, well-known examples (categories with many examples) will have a stronger
#   weight and push the others down because of the partial derivative