Example #1
def create_model_pca(arg, devices_list, eval=False, inverse=False):
    import os
    from models import PCA
    # kp_num and load_weights are project-level helpers assumed to be in scope.
    resume_dataset = arg.eval_dataset_pca if eval else arg.dataset
    resume_split = arg.eval_split_pca if eval else arg.split
    resume_split_source = arg.eval_split_source_pca if eval else arg.split_source

    pca = PCA(in_size=2 * kp_num[arg.dataset], pca_size=arg.pca_components)

    suffix = '_pca_inverse' if inverse else '_pca'

    load_path_first = arg.resume_folder + resume_dataset + '_' + resume_split + '+' + resume_split_source + suffix + '.pth'
    load_path_second = arg.resume_folder + resume_dataset + '_' + resume_split_source + '+' + resume_split + suffix + '.pth'

    if os.path.exists(load_path_first):
        print('Loading PCA from ' + load_path_first)
        pca = load_weights(pca, load_path_first, devices_list[0])

    if os.path.exists(load_path_second):
        print('Loading PCA from ' + load_path_second)
        pca = load_weights(pca, load_path_second, devices_list[0])

    if arg.cuda:
        pca = pca.cuda(device=devices_list[0])

    return pca
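A minimal usage sketch (not from the source repository): the namespace fields below simply mirror the attributes that create_model_pca reads, and the dataset name is assumed to be a key of the project's kp_num table.

from types import SimpleNamespace

# Hypothetical configuration; every field mirrors an attribute read by create_model_pca.
arg = SimpleNamespace(resume_folder='./checkpoints/', dataset='my_dataset',
                      split='train', split_source='test',
                      eval_dataset_pca='my_dataset', eval_split_pca='train',
                      eval_split_source_pca='test',
                      pca_components=20, cuda=False)

pca = create_model_pca(arg, devices_list=[0])                        # forward PCA
pca_inverse = create_model_pca(arg, devices_list=[0], inverse=True)  # inverse mapping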
Example #2
def save(result, method):
    if method == "Z-score":
        Zscore.save(result)
    elif method == "PCA":
        PCA.save(result)
    else:
        autoencoder.save(result)
Example #3
    def run(self):
        if self.modeltype == "Z-score":
            return Zscore.run(self)
        elif self.modeltype == "PCA":
            return PCA.run(self)
        else:
            return autoencoder.run(self)
Example #4
def build_pca(y, q):
    """
    Quick wrapper to build a PCA model instead of a GP-LVM
    :param y: data ("high dimension" n_xi x n_s)
    :param q: dimensions (int)
    :return: PCA model constructed from y with q latent dimensions
    """
    return PCA(y, q, rowvar=False)
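A short usage sketch for build_pca, with a random matrix standing in for the n_xi x n_s data described in the docstring:

import numpy as np

y = np.random.randn(200, 10)  # 200 observations in 10 dimensions
model = build_pca(y, q=2)     # wrap them in a 2-dimensional PCA model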
Example #5
    def perform_kpca_lda_grid_search(self, kernel_kpca, gamma_kpca,
                                     number_of_components_kpca, solver_lda,
                                     shrinkage_lda):
        acc_train_kpca_lda = {}
        acc_test_kpca_lda = {}
        experiment_number = len(kernel_kpca) * len(gamma_kpca) * len(number_of_components_kpca) * \
                            len(solver_lda) * len(shrinkage_lda)
        if "svd" in solver_lda:
            experiment_number = len(kernel_kpca) * len(gamma_kpca) * len(number_of_components_kpca) * \
                                ((len(solver_lda) - 1) * len(shrinkage_lda) + 1)
        progress_bar = tqdm(total=experiment_number,
                            desc='Grid searching for best kpca+lda ')
        for kernel in kernel_kpca:
            acc_train_kpca_lda[kernel] = []
            acc_test_kpca_lda[kernel] = []
            for gamma in gamma_kpca:
                for com_num in number_of_components_kpca:
                    reduced_train_data, reduced_validation_data = PCA.KPCA_fun(
                        self.train_data.copy(),
                        self.validation_data.copy(),
                        kernel=kernel,
                        gamma=gamma,
                        components=com_num)
                    log_message = "Used dimensionality reduction, via kernel PCA with kernel: {}, \t gamma: {}, " \
                                  "\t number of components: {}\n".format(kernel, gamma, com_num)
                    util.logger(log_message,
                                self.log_folder,
                                change_classifier=False)
                    for solver in solver_lda:
                        if solver == "svd":
                            print("HEY")

                            acc1, acc2 = LDA.lda_classifier(
                                reduced_train_data.copy(),
                                reduced_validation_data.copy(), solver)
                            log_message = ("LDA solver: {}\n".format(solver))
                            log_message = log_message + (
                                "Training accuracy: {},\t Validation accuracy: {}\n"
                                .format(acc1, acc2))
                            util.logger(log_message, self.log_folder)
                        else:
                            for shrinkage in shrinkage_lda:
                                print("HEY2")
                                acc1, acc2 = LDA.lda_classifier(
                                    reduced_train_data.copy(),
                                    reduced_validation_data.copy(), solver,
                                    shrinkage)
                                log_message = (
                                    "LDA solver: {} \t shrinkage: {} \n".
                                    format(solver, shrinkage))
                                log_message = log_message + (
                                    "Training accuracy: {},\t Validation accuracy: {}\n"
                                    .format(acc1, acc2))
                                util.logger(log_message, self.log_folder)
                        acc_train_kpca_lda[kernel].append(acc1)
                        acc_test_kpca_lda[kernel].append(acc2)
                        progress_bar.update(1)
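For reference, a call that matches the default search space used by the train() method in Example #7 below might look like this; experiment stands for an instance of the surrounding class and is only illustrative:

experiment.perform_kpca_lda_grid_search(kernel_kpca=['rbf'],
                                        gamma_kpca=[None],
                                        number_of_components_kpca=[None],
                                        solver_lda=['svd'],
                                        shrinkage_lda=['auto'])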
Example #6
def main_pca_to_net(arg):
    # Note: this excerpt assumes numpy (np), torch, sklearn.decomposition.PCA and the
    # project's PCAModule, kp_num, get_annotations_list and init_aligned_shapes are in scope.
    anno_list = get_annotations_list(arg.dataset_route,
                                     arg.dataset,
                                     arg.split,
                                     arg.crop_size,
                                     ispdb=arg.PDB)
    anno_list_src = get_annotations_list(arg.dataset_route,
                                         arg.dataset,
                                         arg.split_source,
                                         arg.crop_size,
                                         ispdb=arg.PDB)

    shapes, aligned_shapes = init_aligned_shapes(arg.dataset, anno_list,
                                                 arg.crop_size)
    shapes_src, aligned_shapes_src = init_aligned_shapes(
        arg.dataset, anno_list_src, arg.crop_size)

    shapes = np.concatenate((shapes, shapes_src), axis=0)

    pca = PCA(n_components=arg.pca_components, svd_solver='full')
    pca.fit(shapes)

    model_pca = PCAModule(2 * kp_num[arg.dataset], arg.pca_components)
    model_pca.load_parameters(pca.components_, pca.mean_)

    model_pca_inv = PCAModule(2 * kp_num[arg.dataset], arg.pca_components)
    model_pca_inv.load_parameters(pca.components_, pca.mean_, inverse=True)

    pose_params = pca.transform(shapes)
    pose_params_model = model_pca(torch.FloatTensor(shapes)).detach().numpy()

    inv_shapes = pca.inverse_transform(pose_params)
    inv_model_shapes = model_pca_inv(
        torch.FloatTensor(pose_params_model)).detach().numpy()

    pass
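The function above computes the scikit-learn projections and the torch-module projections side by side but never compares them; assuming PCAModule is meant to reproduce the scikit-learn transform, the trailing pass could be replaced by a check along these lines:

    # Illustrative check only: the torch modules should agree with scikit-learn.
    assert np.allclose(pose_params, pose_params_model, atol=1e-4)
    assert np.allclose(inv_shapes, inv_model_shapes, atol=1e-4)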
Example #7
    def train(self):
        # TODO: a wrapper could remove this if/else chain
        if self.dataset == "CIFAR10":
            dataloader = CIFAR_10.cifar_dataloader(
                self.train_path,
                self.validation_percentage,
                i_normalize=self.normalize_data,
                i_reduced_training_dataset=self.reduced_training_dataset,
                i_raw_images=self.load_raw_images)
            self.train_data, self.test_data, self.validation_data, self.classes = dataloader.get_cifar_10(
            )
            self.svm_type = "classification"
        elif self.dataset == "IMDB_WIKI" and not self.bin_ages:
            dataloader = IMDB_Wiki.imdb_wiki_dataloader(
                self.train_path,
                self.validation_percentage,
                i_normalize=self.normalize_data,
                i_reduced_training_dataset=self.reduced_training_dataset,
                i_raw_images=self.load_raw_images,
                i_feature_type=self.feature_extraction)
            self.train_data, self.test_data, self.validation_data = dataloader.get_imdb_wiki(
            )
            self.svm_type = "regression"
        elif self.dataset == "IMDB_WIKI" and self.bin_ages:
            dataloader = IMDB_Wiki.imdb_wiki_dataloader(
                self.train_path,
                self.validation_percentage,
                i_normalize=self.normalize_data,
                i_reduced_training_dataset=self.reduced_training_dataset,
                i_raw_images=self.load_raw_images,
                i_feature_type=self.feature_extraction,
                bin_ages=True)
            self.train_data, self.test_data, self.validation_data = dataloader.get_imdb_wiki(
            )
            self.svm_type = "classification"
        elif self.dataset == "MNIST":
            dataloader = MNIST.MNIST_dataloader(
                i_normalize=self.normalize_data,
                i_reduced_training_dataset=self.reduced_training_dataset,
                i_raw_images=self.load_raw_images)
            self.train_data, self.test_data, self.validation_data, self.classes = dataloader.get_MNIST(
            )
            self.svm_type = "classification"

        else:
            print("Selected dataset: {} is not implemented".format(
                self.dataset))
            raise NotImplementedError
        if self.feature_extraction != "off":
            train_feature, validation_feature = feature_extraction.get_features(
                self.train_data,
                self.validation_data,
                feature_type=self.feature_extraction,
                feature_layer=self.feature_layer)
            self.train_data["data"] = np.stack(train_feature, axis=0)
            self.validation_data["data"] = np.stack(validation_feature, axis=0)
        if self.dimentionality_reduction == "PCA":
            self.train_data, self.validation_data = PCA.PCA_fun(
                self.train_data, self.validation_data)
        elif self.dimentionality_reduction == "KPCA":
            log_message = "Used dimensionality reduction, via kernel PCA with kernel {}\n".format(
                "rbf")
            util.logger(log_message, self.log_folder, change_classifier=False)
            self.train_data, self.validation_data = PCA.KPCA_fun(
                self.train_data, self.validation_data)
        elif self.dimentionality_reduction == "Isomap" or self.dimentionality_reduction == "LLE" \
                or self.dimentionality_reduction == "TSNE" or self.dimentionality_reduction == "modified_LLE"\
                or self.dimentionality_reduction == "hessian_LLE" or self.dimentionality_reduction == "laplacian_eigenmaps":
            log_message = "Used dimensionality reduction, method: {}\n".format(
                self.dimentionality_reduction)
            util.logger(log_message, self.log_folder, change_classifier=False)
            self.train_data, self.validation_data = spectral_graph_analysis.spectral_embedding(
                train=self.train_data,
                val=self.validation_data,
                classes=self.classes,
                method=self.dimentionality_reduction,
                plot_folder=self.plot_folder,
                neighbors=self.number_of_neighbors)
        else:
            if self.dimentionality_reduction != "off":
                print(
                    "Selected dimensionality reduction: {} is not implemented".
                    format(self.dimentionality_reduction))
                raise NotImplementedError
        if self.grid_search:
            if self.classifier_type == "svm":
                c_svm = [0.01, 0.1, 1, 10, 100]
                kernel_svm = ['linear', 'poly', 'rbf', 'sigmoid']
                self.perform_svm_grid_search(c_svm, kernel_svm)
            elif self.classifier_type == "lda":
                self.perform_lda_grid_search()
            elif self.classifier_type == "kpca_lda":
                kernel_kpca = ['rbf']  # ['poly', 'rbf', 'sigmoid']
                gamma_kpca = [None]  # [None, 0.01, 0.1, 1]
                number_of_components_kpca = [None]  # [None, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 150, 200] or [None, 10, 20, 30, 40, 50]
                solver_lda = ['svd']  # ['svd', 'lsqr', 'eigen']
                shrinkage_lda = ['auto']  # ['auto', 0, 1, 0.01]
                self.perform_kpca_lda_grid_search(kernel_kpca, gamma_kpca,
                                                  number_of_components_kpca,
                                                  solver_lda, shrinkage_lda)
            elif self.classifier_type == "nearest_neighbor":
                k = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
                self.perform_nearest_neightbor_grid_search(k)
            elif self.classifier_type == "nearest_centroid":
                acc1, acc2 = nearest_neigh.nearest_centroid_classifier(
                    self.train_data, self.validation_data)
                log_message = ("Neareset Centroid:")
                log_message = log_message + (
                    "Training accuracy: {},\t Validation accuracy: {}\n".
                    format(acc1, acc2))
                util.logger(log_message, self.log_folder)
        else:
            if self.classifier_type == "kmeans" or self.classifier_type == "spectral_clustering":
                # for now only clustering is used in a non-grid search way
                accuracies = clustering.cluster(train=self.train_data,
                                                val=self.validation_data,
                                                type=self.classifier_type,
                                                number_of_clusters=10,
                                                plot_folder=self.plot_folder,
                                                classes=self.classes)
                log_message = "Dimensionality reduction: {},\t Clustering: {}\n".format(
                    self.dimentionality_reduction, self.classifier_type)
                for key in accuracies.keys():
                    log_message = log_message + "Metric: {} : {}\n".format(
                        key, accuracies[key])
                util.logger(log_message, self.log_folder)
            elif self.classifier_type == "kmeans_spectral":
                types = ["kmeans", "spectral_clustering"]
                for type in types:
                    accuracies = clustering.cluster(
                        train=self.train_data,
                        val=self.validation_data,
                        type=type,
                        number_of_clusters=10,
                        plot_folder=self.plot_folder,
                        classes=self.classes)
                    log_message = "Dimensionality reduction: {},\t Clustering: {}\n".format(
                        self.dimentionality_reduction, type)
                    for key in accuracies.keys():
                        log_message = log_message + "Metric: {} : {}\n".format(
                            key, accuracies[key])
                    util.logger(log_message, self.log_folder)
Example #8
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__author__ = 'Shilin He'

from models import PCA as PCA
from utils import data_loader as data_loader

para = {
    'path': '../../Data/SOSP_data/',  # directory for input data
    'log_seq_file_name': 'rm_repeat_rawTFVector.txt',  # filename for log sequence data file
    'label_file_name': 'rm_repeat_mlabel.txt',  # filename for label data file
    'fraction': 0.95
}


if __name__ == '__main__':
	raw_data, label_data = data_loader.hdfs_data_loader(para)
	weigh_data = PCA.weighting(raw_data)
	threshold, C = PCA.get_threshold(para, weigh_data)
	PCA.anomaly_detection(weigh_data, label_data, C, threshold)
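The models.PCA module used here belongs to the surrounding log-analysis project; for readers without it, a minimal NumPy sketch of the underlying idea (project each event-count vector onto the residual subspace and flag large squared prediction errors) could look like the following, with a simple percentile cut-off standing in for the statistical threshold that PCA.get_threshold computes:

import numpy as np

def pca_anomaly_scores(event_count_matrix, fraction=0.95):
    # Center the event-count vectors and take an SVD to obtain principal directions.
    x = event_count_matrix - event_count_matrix.mean(axis=0)
    _, s, vt = np.linalg.svd(x, full_matrices=False)
    # Keep enough components to explain `fraction` of the variance.
    explained = np.cumsum(s ** 2) / np.sum(s ** 2)
    k = int(np.searchsorted(explained, fraction)) + 1
    # Residual of each vector after removing the principal subspace.
    p = vt[:k].T
    residual = x - x @ p @ p.T
    return np.sum(residual ** 2, axis=1)  # squared prediction error per log sequence

# Hypothetical usage: scores = pca_anomaly_scores(weigh_data)
#                     anomalies = scores > np.percentile(scores, 95)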


Example #9
import sys
sys.path.append('../')
from models import PCA as PCA
from utils import data_loader as data_loader

para = {
    'path': '../../Data/BGL_data/',  # directory for input data
    'log_file_name': 'BGL_MERGED.log',  # filename for log data file
    'log_event_mapping': 'logTemplateMap.csv',  # log-event mapping: one event index per raw log line
    'save_path': '../time_windows/',  # directory for saving sliding-window data files to avoid re-splitting
    'select_column': [0, 4],  # columns (label and time) to select from the raw log file
    'window_size': 3,  # time window (unit: hour)
    'step_size': 1,  # step size (unit: hour)
    'tf-idf': False,  # tf-idf should be set to False for BGL data, since that gives better accuracy
    'fraction': 0.95
}

if __name__ == '__main__':
    raw_data, event_mapping_data = data_loader.bgl_data_loader(para)
    event_count_matrix, labels = data_loader.bgl_preprocess_data(
        para, raw_data, event_mapping_data)
    weigh_data = PCA.weighting(para, event_count_matrix)
    threshold, C = PCA.get_threshold(para, weigh_data)
    PCA.anomaly_detection(weigh_data, labels, C, threshold)
Example #10
"""
@author: Paris
"""

import numpy as np
import matplotlib.pyplot as plt

import sklearn.datasets

from models import PCA

if __name__ == "__main__":

    X = sklearn.datasets.load_iris().data
    y = sklearn.datasets.load_iris().target

    Z_dim = 2
    model = PCA(X, Z_dim)

    model.fit()

    Z = model.encode(X)
    X_star = model.decode(Z)

    error = np.linalg.norm(X - X_star, 2) / np.linalg.norm(X, 2)

    # Plot the projection onto the first two principal components
    plt.figure(1)
    plt.scatter(Z[:, 0], Z[:, 1], c=y)
    plt.xlabel('$z_1$')
    plt.ylabel('$z_2$')
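The excerpt stops before the figure is shown; when run as a standalone script, a final call (not part of the original excerpt) is needed to display the plot:

    plt.show()  # not in the original excerpt; displays the scatter plot when run as a script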
Example #11
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
__author__ = 'Shilin He'

from models import PCA as PCA
from utils import data_loader as data_loader

para = {
    'path': '../../Data/BGL_data/',  # directory for input data
    'log_file_name': 'BGL_MERGED.log',  # filename for log data file
    'log_event_mapping': 'logTemplateMap.csv',  # log-event mapping: one event index per raw log line
    'save_path': '../time_windows/',  # directory for saving sliding-window data files to avoid re-splitting
    'select_column': [0, 4],  # columns (label and time) to select from the raw log file
    'window_size': 3,  # time window (unit: hour)
    'step_size': 1,  # step size (unit: hour)
    'tf-idf': False,  # tf-idf should be set to False for BGL data, since that gives better accuracy
    'fraction': 0.95
}

if __name__ == '__main__':
	raw_data, event_mapping_data = data_loader.bgl_data_loader(para)
	event_count_matrix, labels = data_loader.bgl_preprocess_data(para, raw_data, event_mapping_data)
	weigh_data = PCA.weighting(para, event_count_matrix)
	threshold, C = PCA.get_threshold(para, weigh_data)
	PCA.anomaly_detection(weigh_data, labels, C, threshold)


Example #12
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__author__ = 'Shilin He'

from models import PCA as PCA
from utils import data_loader as data_loader

para = {
    'path': '../../Data/SOSP_data/',  # directory for input data
    'log_seq_file_name': 'rm_repeat_rawTFVector.txt',  # filename for log sequence data file
    'label_file_name': 'rm_repeat_mlabel.txt',  # filename for label data file
    'fraction': 0.95
}

if __name__ == '__main__':
    raw_data, label_data = data_loader.hdfs_data_loader(para)
    weigh_data = PCA.weighting(raw_data)
    threshold, C = PCA.get_threshold(para, weigh_data)
    PCA.anomaly_detection(weigh_data, label_data, C, threshold)