Example #1
    # Assumes: from keras.preprocessing.text import Tokenizer; import numpy as np
    def tokenize(self, writeDictionaryToCsv=False):
        print('Tokenizing')
        print('adding n-grams')
        self.trainData = self.addNgGrams(self.trainData)
        self.testData = self.addNgGrams(self.testData)

        # Fit the vocabulary on train and test reviews together
        # (pandas Series.append; use pd.concat on pandas >= 2.0).
        all_reviews = self.trainData.append(self.testData)
        tokenizer = Tokenizer(num_words=30000)
        print('fitting')
        # fit_on_texts alone is sufficient here: the default 'binary' mode of
        # sequences_to_matrix does not need the document counts that
        # fit_on_sequences collects (and fit_on_sequences expects integer
        # sequences, not raw texts, so calling it on all_reviews was a bug).
        tokenizer.fit_on_texts(all_reviews)

        print('texts_to_sequences')
        self.trainData = tokenizer.texts_to_sequences(self.trainData)
        self.testData = tokenizer.texts_to_sequences(self.testData)
        print('sequences_to_matrix')
        self.trainData = tokenizer.sequences_to_matrix(self.trainData)
        self.testData = tokenizer.sequences_to_matrix(self.testData)

        all_reviews = np.vstack((self.trainData, self.testData))  # combined matrix (not used below)

        # Row-normalize so each review's features sum to 1.
        # Note: this normalization does not work with the SVM path.
        self.trainData = self.trainData / self.trainData.sum(axis=1)[:, None]
        self.testData = self.testData / self.testData.sum(axis=1)[:, None]

        if writeDictionaryToCsv:
            self.ExportFeatureSpace(tokenizer)

        print('Finished tokenizing')
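
For reference, a minimal standalone sketch of the same pipeline (fit_on_texts, texts_to_sequences, sequences_to_matrix in the default 'binary' mode, then the row normalization); the toy corpus here is made up purely for illustration:

from keras.preprocessing.text import Tokenizer
import numpy as np

# Toy corpus standing in for trainData/testData.
train_texts = ['great movie great cast', 'boring plot']
test_texts = ['great plot twist']

tokenizer = Tokenizer(num_words=30000)
tokenizer.fit_on_texts(train_texts + test_texts)  # build the vocabulary once over all reviews

train_seqs = tokenizer.texts_to_sequences(train_texts)
test_seqs = tokenizer.texts_to_sequences(test_texts)

# Default mode='binary': one row per review, one column per word index.
train_mat = tokenizer.sequences_to_matrix(train_seqs)
test_mat = tokenizer.sequences_to_matrix(test_seqs)

# Row-normalize so each review's features sum to 1, as in tokenize() above.
train_mat = train_mat / train_mat.sum(axis=1)[:, None]
print(train_mat.shape)  # (2, 30000)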
Example #2
    # Assumes: from keras.datasets import imdb
    #          from keras.preprocessing.text import Tokenizer
    #          from sklearn.decomposition import PCA
    def closure(mu):
        (x_train, y_train), (_, _) = imdb.load_data()
        tokenizer = Tokenizer(num_words=5000)
        # The IMDB data is already integer sequences; fitting on them collects
        # the document counts that tf-idf weighting needs.
        tokenizer.fit_on_sequences(x_train)
        x_train = tokenizer.sequences_to_matrix(x_train, mode='tfidf')
        # Note: svd_solver='full' is needed on the GPU server.
        x_train = PCA(n_components=100,
                      svd_solver='full').fit_transform(x_train)
        ds = {"data": x_train, "target": y_train}

        # Apply noise (preprocess_and_noise is defined elsewhere) and return.
        res = preprocess_and_noise(dataset=ds, mu=mu)
        return res
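
The 'tfidf' mode is the reason fit_on_sequences is called first in Example #2: without the per-word document counts it collects, sequences_to_matrix raises an error in that mode. A minimal sketch on made-up integer sequences:

from keras.preprocessing.text import Tokenizer

# Tiny integer sequences standing in for the IMDB reviews.
seqs = [[1, 2, 2, 3], [2, 4], [1, 3, 3]]

tokenizer = Tokenizer(num_words=5)
tokenizer.fit_on_sequences(seqs)  # sets document_count/index_docs, required for tf-idf
mat = tokenizer.sequences_to_matrix(seqs, mode='tfidf')
print(mat.shape)  # (3, 5): one row per sequence, one column per word index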