def fit(self, X, y=None):
    # validate inputs X and y (optional)
    X = check_array(X)
    self._set_n_classes(y)

    # PCA is recommended to be used on standardized data (zero mean and
    # unit variance).
    if self.standardization:
        self.scaler_ = StandardScaler().fit(X)
        X = self.scaler_.transform(X)

    self.detector_ = sklearn_PCA(n_components=self.n_components,
                                 copy=self.copy,
                                 whiten=self.whiten,
                                 svd_solver=self.svd_solver,
                                 tol=self.tol,
                                 iterated_power=self.iterated_power,
                                 random_state=self.random_state)
    self.detector_.fit(X=X, y=y)

    # copy the attributes from the sklearn PCA object
    self.n_components_ = self.detector_.n_components_
    self.components_ = self.detector_.components_

    # validate the number of components to be used for outlier detection
    if self.n_selected_components is None:
        self.n_selected_components_ = self.n_components_
    else:
        self.n_selected_components_ = self.n_selected_components
    check_parameter(self.n_selected_components_, 1, self.n_components_,
                    include_left=True, include_right=True,
                    param_name='n_selected_components_')

    # use eigenvalues as the weights of eigenvectors
    self.w_components_ = np.ones([self.n_components_, ])
    if self.weighted:
        self.w_components_ = self.detector_.explained_variance_ratio_

    # outlier scores are the sum of the weighted distances between each
    # sample and the eigenvectors. The eigenvectors with smaller
    # eigenvalues have more influence.
    # Not all eigenvectors are used; only the n_selected_components_
    # smallest are used since they better reflect the variance change.
    self.selected_components_ = self.components_[
                                -1 * self.n_selected_components_:, :]
    self.selected_w_components_ = self.w_components_[
                                  -1 * self.n_selected_components_:]

    self.decision_scores_ = np.sum(
        cdist(X, self.selected_components_) / self.selected_w_components_,
        axis=1).ravel()

    self._process_decision_scores()
    return self
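# --- Hedged usage sketch (not part of the original snippet) ---
# The fit() above follows the PyOD detector convention: after fitting,
# decision_scores_ holds the raw outlier scores and labels_ the binary
# outlier flags produced by _process_decision_scores(). Assuming this
# method belongs to a PyOD-style PCA detector (pyod.models.pca.PCA is
# used here only as a stand-in with the same interface), usage could
# look like this:
import numpy as np
from pyod.models.pca import PCA

X_train = np.random.rand(200, 5)          # toy data
detector = PCA(n_components=3)
detector.fit(X_train)
print(detector.decision_scores_[:5])      # raw outlier scores
print(detector.labels_[:5])               # 0 = inlier, 1 = outlier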
def fit(self, X, y=None):
    """Fit the model using X as training data.

    :param X: Training data. If array or matrix, shape [n_samples, n_features],
        or [n_samples, n_samples] if metric='precomputed'.
    :type X: {array-like, sparse matrix, BallTree, KDTree}

    :return: self
    :rtype: object
    """
    # validate inputs X and y (optional)
    X = check_array(X)
    self._set_n_classes(y)

    self.detector_ = sklearn_PCA(n_components=self.n_components,
                                 copy=self.copy,
                                 whiten=self.whiten,
                                 svd_solver=self.svd_solver,
                                 tol=self.tol,
                                 iterated_power=self.iterated_power,
                                 random_state=self.random_state)
    self.detector_.fit(X=X, y=y)

    # self.decision_scores_ =
    # self._process_decision_scores()
    return self
def fit(self, X, y=None):
    """Fit detector. y is ignored in unsupervised methods.

    Parameters
    ----------
    X : numpy array of shape (n_samples, nx, ny)
        The input samples; each sample is flattened to nx * ny features.

    y : Ignored
        Not used, present for API consistency by convention.

    Returns
    -------
    self : object
        Fitted estimator.
    """
    # flatten image-like input into a 2D array
    nsamples, nx, ny = X.shape
    X = X.reshape((nsamples, nx * ny))

    self.detector_ = sklearn_PCA(n_components=self.n_components,
                                 copy=self.copy,
                                 whiten=self.whiten,
                                 svd_solver=self.svd_solver,
                                 tol=self.tol,
                                 iterated_power=self.iterated_power,
                                 random_state=self.random_state)
    self.detector_.fit(X=X, y=y)

    # copy the attributes from the sklearn PCA object
    self.n_components_ = self.detector_.n_components_
    self.components_ = self.detector_.components_

    # validate the number of components to be used for outlier detection
    if self.n_selected_components is None:
        self.n_selected_components_ = self.n_components_
    else:
        self.n_selected_components_ = self.n_selected_components

    self.w_components_ = np.ones([self.n_components_, ])
    if self.weighted:
        self.w_components_ = self.detector_.explained_variance_ratio_

    # outlier scores are the sum of the weighted distances between each
    # sample and the eigenvectors. The eigenvectors with smaller
    # eigenvalues have more influence.
    # Not all eigenvectors are used; only the n_selected_components_
    # smallest are used since they better reflect the variance change.
    self.selected_components_ = self.components_[
                                -1 * self.n_selected_components_:, :]
    self.selected_w_components_ = self.w_components_[
                                  -1 * self.n_selected_components_:]

    self.decision_scores_ = np.sum(
        cdist(X, self.selected_components_) / self.selected_w_components_,
        axis=1).ravel()

    self._process_decision_scores()
    return self
def analysisPCA(cryo_data, normalize=True):
    ### Get results from my own PCA on this dataset
    new_data = PCA(cryo_data, normalize=normalize)
    plotResults_2D(new_data, cryo_data.iloc[:, -1],
                   'Custom PCA Results on cryo Dataset - Normalized = ' + str(normalize))

    ### Get results to compare against using the sklearn version of PCA on this dataset
    pca = sklearn_PCA(n_components=2)
    if normalize:
        sklearn_data = sklearn_SS().fit_transform(cryo_data.iloc[:, :-1])
        sklearn_new_data = pca.fit_transform(sklearn_data)
    else:
        sklearn_new_data = pca.fit_transform(cryo_data.iloc[:, :-1])
    plotResults_2D(pd.DataFrame(sklearn_new_data), cryo_data.iloc[:, -1],
                   'Sklearn PCA Results on cryo Dataset - Normalized = ' + str(normalize))
def train(self, data, labels=None):
    # stacking input data
    data = np.vstack(data)

    # checking to make sure that enough components are specified
    if data.shape[1] < self.n_components:
        self.logger.warning("more components specified than features!")
        self.logger.warning("truncating n_components({}) to num_features({})"
                            .format(self.n_components, data.shape[1]))
        # reinstantiating the class with fewer components
        # this is done to make sure that self.io_map is accurate
        self.__init__(data.shape[1], self.random_state)

    self.pca = sklearn_PCA(self.n_components)
    self.pca.fit(data)
def fit(self, X):
    self.X = X  # store X in the output
    sample_ids = X.index
    feature_ids = X.columns
    X = X.values  # pandas DataFrame.as_matrix() is deprecated; use .values

    nuee_PCA = sklearn_PCA(n_components=self.n_components,
                           copy=self.copy,
                           whiten=self.whiten,
                           svd_solver=self.svd_solver,
                           tol=self.tol,
                           iterated_power=self.iterated_power,
                           random_state=self.random_state)
    nuee_PCA.fit(X)

    ordi_column_names = [
        'PCA%d' % (i + 1)
        for i in range(nuee_PCA.explained_variance_.shape[0])
    ]

    # prepare output
    eigenvalues = nuee_PCA.explained_variance_
    p_explained = pd.Series(eigenvalues / eigenvalues.sum(),
                            index=ordi_column_names)

    if self.scaling == 1:
        sample_scores = nuee_PCA.transform(X)
        biplot_scores = nuee_PCA.components_.T
    elif self.scaling == 2 or self.scaling == 'correlation':
        sample_scores = nuee_PCA.transform(X).dot(
            np.diag(eigenvalues ** (-0.5)))
        biplot_scores = nuee_PCA.components_.dot(
            np.diag(eigenvalues ** 0.5)).T

    # Add PCA ordination object names to self
    self.ordiobject_type = 'PCA'
    self.method_name = 'Principal Components Analysis'
    self.ordi_fitted = nuee_PCA
    self.eigenvalues = eigenvalues
    self.proportion_explained = p_explained
    self.sample_scores = pd.DataFrame(sample_scores,
                                      index=sample_ids,
                                      columns=ordi_column_names)
    self.sample_scores.index.name = 'ID'
    self.biplot_scores = pd.DataFrame(biplot_scores,
                                      index=feature_ids,
                                      columns=ordi_column_names)
    self.biplot_scores.index.name = 'ID'

    return self
def sklearn_reduceData(data, dim):
    # Requires sklearn_PCA to be imported
    model = sklearn_PCA(dim, svd_solver="full")
    reduced = model.fit_transform(data)
    return reduced, model.components_.T, model.mean_
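# --- Hedged usage sketch (not part of the original snippet) ---
# sklearn_reduceData() returns the reduced coordinates together with the
# principal axes (transposed) and the data mean, which is enough to
# approximately reconstruct the input as X_hat = reduced @ components.T + mean.
# The example data below is illustrative only.
import numpy as np
from sklearn.decomposition import PCA as sklearn_PCA

data = np.random.rand(50, 8)
reduced, components, mean = sklearn_reduceData(data, dim=3)
X_hat = reduced @ components.T + mean     # rank-3 reconstruction
print(np.abs(data - X_hat).mean())        # mean reconstruction error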
def generate_meta_features(X):
    """Get the meta-features of a dataset X

    Parameters
    ----------
    X : numpy array of shape (n_samples, n_features)
        Input array

    Returns
    -------
    meta_features : numpy array of shape (1, 200)
        Meta-feature in dimension of 200
    """
    # outliers_fraction = np.count_nonzero(y) / len(y)
    # outliers_percentage = round(outliers_fraction * 100, ndigits=4)
    X = check_array(X)

    meta_vec = []
    meta_vec_names = []

    # on the sample level
    n_samples, n_features = X.shape[0], X.shape[1]

    meta_vec.append(n_samples)
    meta_vec.append(n_features)

    meta_vec_names.append('n_samples')
    meta_vec_names.append('n_features')

    sample_mean = np.mean(X)
    sample_median = np.median(X)
    sample_var = np.var(X)
    sample_min = np.min(X)
    sample_max = np.max(X)
    sample_std = np.std(X)

    # 1st, 25th, 75th, and 99th percentiles (np.percentile expects values
    # on a 0-100 scale)
    q1, q25, q75, q99 = np.percentile(X, [1, 25, 75, 99])
    iqr = q75 - q25

    normalized_mean = sample_mean / sample_max
    normalized_median = sample_median / sample_max
    sample_range = sample_max - sample_min
    sample_gini = gini(X)
    med_abs_dev = np.median(np.absolute(X - sample_median))
    avg_abs_dev = np.mean(np.absolute(X - sample_mean))
    quant_coeff_disp = (q75 - q25) / (q75 + q25)
    coeff_var = sample_var / sample_mean

    outliers_15iqr = np.logical_or(X < (q25 - 1.5 * iqr),
                                   X > (q75 + 1.5 * iqr))
    outliers_3iqr = np.logical_or(X < (q25 - 3 * iqr), X > (q75 + 3 * iqr))
    outliers_1_99 = np.logical_or(X < q1, X > q99)
    outliers_3std = np.logical_or(X < (sample_mean - 3 * sample_std),
                                  X > (sample_mean + 3 * sample_std))

    percent_outliers_15iqr = np.sum(outliers_15iqr) / len(X)
    percent_outliers_3iqr = np.sum(outliers_3iqr) / len(X)
    percent_outliers_1_99 = np.sum(outliers_1_99) / len(X)
    percent_outliers_3std = np.sum(outliers_3std) / len(X)

    has_outliers_15iqr = np.any(outliers_15iqr).astype(int)
    has_outliers_3iqr = np.any(outliers_3iqr).astype(int)
    has_outliers_1_99 = np.any(outliers_1_99).astype(int)
    has_outliers_3std = np.any(outliers_3std).astype(int)

    meta_vec.extend([
        sample_mean, sample_median, sample_var, sample_min, sample_max,
        sample_std, q1, q25, q75, q99, iqr, normalized_mean,
        normalized_median, sample_range, sample_gini, med_abs_dev,
        avg_abs_dev, quant_coeff_disp, coeff_var,
        # moment_5, moment_6, moment_7, moment_8, moment_9, moment_10,
        percent_outliers_15iqr, percent_outliers_3iqr, percent_outliers_1_99,
        percent_outliers_3std, has_outliers_15iqr, has_outliers_3iqr,
        has_outliers_1_99, has_outliers_3std
    ])

    meta_vec_names.extend([
        'sample_mean', 'sample_median', 'sample_var', 'sample_min',
        'sample_max', 'sample_std', 'q1', 'q25', 'q75', 'q99', 'iqr',
        'normalized_mean', 'normalized_median', 'sample_range',
        'sample_gini', 'med_abs_dev', 'avg_abs_dev', 'quant_coeff_disp',
        'coeff_var',
        # moment_5, moment_6, moment_7, moment_8, moment_9, moment_10,
        'percent_outliers_15iqr', 'percent_outliers_3iqr',
        'percent_outliers_1_99', 'percent_outliers_3std',
        'has_outliers_15iqr', 'has_outliers_3iqr', 'has_outliers_1_99',
        'has_outliers_3std'
    ])

    ###########################################################################

    normality_k2, normality_p = normaltest(X)
    is_normal_5 = (normality_p < 0.05).astype(int)
    is_normal_1 = (normality_p < 0.01).astype(int)
    meta_vec.extend(list_process(normality_p))
    meta_vec.extend(list_process(is_normal_5))
    meta_vec.extend(list_process(is_normal_1))
    meta_vec_names.extend(list_process_name('normality_p'))
    meta_vec_names.extend(list_process_name('is_normal_5'))
    meta_vec_names.extend(list_process_name('is_normal_1'))

    moment_5 = moment(X, moment=5)
    moment_6 = moment(X, moment=6)
    moment_7 = moment(X, moment=7)
    moment_8 = moment(X, moment=8)
    moment_9 = moment(X, moment=9)
    moment_10 = moment(X, moment=10)
    meta_vec.extend(list_process(moment_5))
    meta_vec.extend(list_process(moment_6))
    meta_vec.extend(list_process(moment_7))
    meta_vec.extend(list_process(moment_8))
    meta_vec.extend(list_process(moment_9))
    meta_vec.extend(list_process(moment_10))

    meta_vec_names.extend(list_process_name('moment_5'))
    meta_vec_names.extend(list_process_name('moment_6'))
    meta_vec_names.extend(list_process_name('moment_7'))
    meta_vec_names.extend(list_process_name('moment_8'))
    meta_vec_names.extend(list_process_name('moment_9'))
    meta_vec_names.extend(list_process_name('moment_10'))

    # note: this is for each dimension == the number of dimensions
    skewness_list = skew(X).reshape(-1, 1)
    skew_values = list_process(skewness_list)
    meta_vec.extend(skew_values)
    meta_vec_names.extend(list_process_name('skewness'))

    # note: this is for each dimension == the number of dimensions
    kurtosis_list = kurtosis(X)
    kurtosis_values = list_process(kurtosis_list)
    meta_vec.extend(kurtosis_values)
    meta_vec_names.extend(list_process_name('kurtosis'))

    correlation = np.nan_to_num(pd.DataFrame(X).corr(), nan=0)
    correlation_list = flatten_diagonally(correlation)[0:int(
        (n_features * n_features - n_features) / 2)]
    correlation_values = list_process(correlation_list)
    meta_vec.extend(correlation_values)
    meta_vec_names.extend(list_process_name('correlation'))

    covariance = np.cov(X.T)
    covariance_list = flatten_diagonally(covariance)[0:int(
        (n_features * n_features - n_features) / 2)]
    covariance_values = list_process(covariance_list)
    meta_vec.extend(covariance_values)
    meta_vec_names.extend(list_process_name('covariance'))

    # sparsity
    rep_counts = []
    for i in range(n_features):
        rep_counts.append(len(np.unique(X[:, i])))
    sparsity_list = np.asarray(rep_counts) / (n_samples)
    sparsity = list_process(sparsity_list)
    meta_vec.extend(sparsity)
    meta_vec_names.extend(list_process_name('sparsity'))

    # ANOVA p value
    p_values_list = []
    all_perm = list(itertools.combinations(list(range(n_features)), 2))
    for j in all_perm:
        p_values_list.append(f_oneway(X[:, j[0]], X[:, j[1]])[1])
    anova_p_value = list_process(np.asarray(p_values_list))
    # anova_p_value = np.mean(p_values_list)
    # anova_p_value_exceed_thresh = np.mean((np.asarray(p_values_list) < 0.05).astype(int))
    meta_vec.extend(anova_p_value)
    meta_vec_names.extend(list_process_name('anova_p_value'))

    # pca
    pca_transformer = sklearn_PCA(n_components=3)
    X_transform = pca_transformer.fit_transform(X)

    # first pc
    pca_fpc = list_process(X_transform[0, :], r_min=False, r_max=False,
                           r_mean=False, r_std=True, r_skew=True,
                           r_kurtosis=True)
    meta_vec.extend(pca_fpc)
    meta_vec_names.extend(
        ['first_pca_std', 'first_pca_skewness', 'first_pca_kurtosis'])

    # entropy
    entropy_list = []
    for i in range(n_features):
        counts = pd.Series(X[:, i]).value_counts()
        entropy_list.append(entropy(counts) / n_samples)
    entropy_values = list_process(entropy_list)
    meta_vec.extend(entropy_values)
    meta_vec_names.extend(list_process_name('entropy'))

    ############################## Landmarkers ###############################
    # HBOS
    clf = HBOS(n_bins=10)
    clf.fit(X)
    HBOS_hists = clf.hist_
    HBOS_mean = np.mean(HBOS_hists, axis=0)
    HBOS_max = np.max(HBOS_hists, axis=0)
    HBOS_min = np.min(HBOS_hists, axis=0)
    meta_vec.extend(list_process(HBOS_mean))
    meta_vec.extend(list_process(HBOS_max))
    meta_vec.extend(list_process(HBOS_min))
    meta_vec_names.extend(list_process_name('HBOS_mean'))
    meta_vec_names.extend(list_process_name('HBOS_max'))
    meta_vec_names.extend(list_process_name('HBOS_min'))
    # IForest
    n_estimators = 100
    clf = IForest(n_estimators=n_estimators)
    clf.fit(X)

    n_leaves = []
    n_depth = []
    fi_mean = []
    fi_max = []

    # doing this for each sub-tree
    for i in range(n_estimators):
        n_leaves.append(clf.estimators_[i].get_n_leaves())
        n_depth.append(clf.estimators_[i].get_depth())
        fi_mean.append(clf.estimators_[i].feature_importances_.mean())
        fi_max.append(clf.estimators_[i].feature_importances_.max())
        # print(clf.estimators_[i].tree_)

    meta_vec.extend(list_process(n_leaves))
    meta_vec.extend(list_process(n_depth))
    meta_vec.extend(list_process(fi_mean))
    meta_vec.extend(list_process(fi_max))

    meta_vec_names.extend(list_process_name('IForest_n_leaves'))
    meta_vec_names.extend(list_process_name('IForest_n_depth'))
    meta_vec_names.extend(list_process_name('IForest_fi_mean'))
    meta_vec_names.extend(list_process_name('IForest_fi_max'))

    # PCA
    clf = PCA(n_components=3)
    clf.fit(X)
    meta_vec.extend(clf.explained_variance_ratio_)
    meta_vec.extend(clf.singular_values_)
    meta_vec_names.extend(
        ['pca_expl_ratio_1', 'pca_expl_ratio_2', 'pca_expl_ratio_3'])
    meta_vec_names.extend(['pca_sv_1', 'pca_sv_2', 'pca_sv_3'])

    # LODA
    n_bins = 10
    n_random_cuts = 100

    n_hists_mean = []
    n_hists_max = []

    n_cuts_mean = []
    n_cuts_max = []

    clf = LODA(n_bins=n_bins, n_random_cuts=n_random_cuts)
    clf.fit(X)

    for i in range(n_bins):
        n_hists_mean.append(clf.histograms_[:, i].mean())
        n_hists_max.append(clf.histograms_[:, i].max())

    for i in range(n_random_cuts):
        n_cuts_mean.append(clf.histograms_[i, :].mean())
        n_cuts_max.append(clf.histograms_[i, :].max())

    meta_vec.extend(list_process(n_hists_mean))
    meta_vec.extend(list_process(n_hists_max))
    meta_vec.extend(list_process(n_cuts_mean))
    meta_vec.extend(list_process(n_cuts_max))

    meta_vec_names.extend(list_process_name('LODA_n_hists_mean'))
    meta_vec_names.extend(list_process_name('LODA_n_hists_max'))
    meta_vec_names.extend(list_process_name('LODA_n_cuts_mean'))
    meta_vec_names.extend(list_process_name('LODA_n_cuts_max'))

    return meta_vec, meta_vec_names
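# --- Hedged usage sketch (not part of the original snippet) ---
# generate_meta_features() only needs a 2-D numpy array; the landmarker
# detectors (HBOS, IForest, LODA) and helpers (gini, list_process,
# list_process_name, flatten_diagonally) are assumed to be importable in
# the same module. A minimal call could look like this:
import numpy as np

X = np.random.rand(500, 6)
meta_vec, meta_vec_names = generate_meta_features(X)
print(len(meta_vec), len(meta_vec_names))   # the two lists stay aligned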
def __init__(self):
    # self.classifier = sklearn_PCA(n_components='mle')
    self.classifier = sklearn_PCA(
        n_components=900)  # This is for the gabor filter only!
def __init__(self, n_components=4, whiten=True, name="PCA4Feature"):
    super(PCA4Feature, self).__init__(name)
    self.n_components = n_components
    self.whiten = whiten
    self.model = sklearn_PCA(n_components=self.n_components,
                             whiten=self.whiten)
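# --- Hedged usage sketch (not part of the original snippet) ---
# PCA4Feature only shows its constructor here; the wrapped sklearn PCA
# lives in self.model, so a direct fit/transform through that attribute
# could look like the following (the base class and any higher-level
# fit()/transform() wrappers are assumptions).
import numpy as np

feature = PCA4Feature(n_components=4, whiten=True)
X = np.random.rand(100, 16)
reduced = feature.model.fit_transform(X)    # shape (100, 4)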
def fit(self, X, y=None):
    """Fit detector. y is ignored in unsupervised methods.

    Parameters
    ----------
    X : numpy array of shape (n_samples, n_features)
        The input samples.

    y : Ignored
        Not used, present for API consistency by convention.

    Returns
    -------
    self : object
        Fitted estimator.
    """
    # validate inputs X and y (optional)
    X = check_array(X)
    self._set_n_classes(y)

    # PCA is recommended to be used on standardized data (zero mean and
    # unit variance).
    if self.standardization:
        X, self.scaler_ = standardizer(X, keep_scalar=True)

    self.detector_ = sklearn_PCA(n_components=self.n_components,
                                 copy=self.copy,
                                 whiten=self.whiten,
                                 svd_solver=self.svd_solver,
                                 tol=self.tol,
                                 iterated_power=self.iterated_power,
                                 random_state=self.random_state)
    self.detector_.fit(X=X, y=y)

    # copy the attributes from the sklearn PCA object
    self.n_components_ = self.detector_.n_components_
    self.components_ = self.detector_.components_

    # validate the number of components to be used for outlier detection
    if self.n_selected_components is None:
        self.n_selected_components_ = self.n_components_
    else:
        self.n_selected_components_ = self.n_selected_components
    check_parameter(self.n_selected_components_, 1, self.n_components_,
                    include_left=True, include_right=True,
                    param_name='n_selected_components_')

    # use eigenvalues as the weights of eigenvectors
    self.w_components_ = np.ones([self.n_components_, ])
    if self.weighted:
        self.w_components_ = self.detector_.explained_variance_ratio_

    # outlier scores are the sum of the weighted distances between each
    # sample and the eigenvectors. The eigenvectors with smaller
    # eigenvalues have more influence.
    # Not all eigenvectors are used; only the n_selected_components_
    # smallest are used since they better reflect the variance change.
    self.selected_components_ = self.components_[
                                -1 * self.n_selected_components_:, :]
    self.selected_w_components_ = self.w_components_[
                                  -1 * self.n_selected_components_:]

    self.decision_scores_ = np.sum(
        cdist(X, self.selected_components_) / self.selected_w_components_,
        axis=1).ravel()

    self._process_decision_scores()
    return self
from sklearn import datasets
from sklearn.decomposition import PCA as sklearn_PCA
import matplotlib.pyplot as plt
import numpy as np
# from mpl_toolkits.mplot3d import Axes3D

iris = datasets.load_iris()

X = iris.data
Y = iris.target
target_names = iris.target_names

np.random.seed(5)

skPCA = sklearn_PCA(n_components=2)
X_learn = skPCA.fit(X).transform(X)

plt.figure()
colours = ['navy', 'turquoise', 'darkorange']
lw = 2

for color, i, target_name in zip(colours, [0, 1, 2], target_names):
    plt.scatter(X_learn[Y == i, 0], X_learn[Y == i, 1], color=color,
                alpha=.8, lw=lw, label=target_name)

plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.title('PCA of IRIS dataset')
plt.show()
def fit(self, X, y=None):
    """Fit detector. y is optional for unsupervised methods.

    Parameters
    ----------
    X : numpy array of shape (n_samples, n_features)
        The input samples.

    y : numpy array of shape (n_samples,), optional (default=None)
        The ground truth of the input samples (labels).
    """
    # validate inputs X and y (optional)
    X = check_array(X)
    self._set_n_classes(y)

    # PCA is recommended to be used on standardized data (zero mean and
    # unit variance).
    if self.standardization:
        X, self.scaler_ = standardizer(X, keep_scalar=True)

    self.detector_ = sklearn_PCA(n_components=self.n_components,
                                 copy=self.copy,
                                 whiten=self.whiten,
                                 svd_solver=self.svd_solver,
                                 tol=self.tol,
                                 iterated_power=self.iterated_power,
                                 random_state=self.random_state)
    self.detector_.fit(X=X, y=y)

    # copy the attributes from the sklearn PCA object
    self.n_components_ = self.detector_.n_components_
    self.components_ = self.detector_.components_

    # validate the number of components to be used for outlier detection
    if self.n_selected_components is None:
        self.n_selected_components_ = self.n_components_
    else:
        self.n_selected_components_ = self.n_selected_components
    check_parameter(self.n_selected_components_, 1, self.n_components_,
                    include_left=True, include_right=True,
                    param_name='n_selected_components_')

    # use eigenvalues as the weights of eigenvectors
    self.w_components_ = np.ones([self.n_components_, ])
    if self.weighted:
        self.w_components_ = self.detector_.explained_variance_ratio_

    # outlier scores are the sum of the weighted distances between each
    # sample and the eigenvectors. The eigenvectors with smaller
    # eigenvalues have more influence.
    # Not all eigenvectors are used; only the n_selected_components_
    # smallest are used since they better reflect the variance change.
    self.selected_components_ = self.components_[
                                -1 * self.n_selected_components_:, :]
    self.selected_w_components_ = self.w_components_[
                                  -1 * self.n_selected_components_:]

    self.decision_scores_ = np.sum(
        cdist(X, self.selected_components_) / self.selected_w_components_,
        axis=1).ravel()

    self._process_decision_scores()
    return self