import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE


def show_clusters(data, y, name, params=None):
    """Embed ``data`` in 2-D with t-SNE, scatter-plot the clusters coloured
    by ``y``, save the figure, and return the t-SNE parameters used."""
    model = TSNE(n_components=2, random_state=0)
    np.set_printoptions(suppress=True)
    if params is not None:
        model.set_params(**params)
    X = model.fit_transform(data)
    print(X)
    p = model.get_params()
    print("X.shape = ", X.shape)
    print("y.shape = ", y.shape)
    print(y)
    plt.scatter(X[:, 0], X[:, 1], c=y)
    plt.gray()
    plt.axis('off')
    # Save before show(): on many backends show() clears the figure,
    # which would leave an empty .png.
    plt.savefig("ClustersUntrained{}.png".format(name), dpi=600)
    plt.show()
    plt.clf()
    return p
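# --- Usage sketch (not from the original source): a minimal, hedged example
# on sklearn's digits set; the dataset choice and the perplexity override are
# illustrative assumptions.
from sklearn.datasets import load_digits

digits = load_digits()
used_params = show_clusters(digits.data, digits.target, "Digits",
                            params={"perplexity": 25.0})
print(used_params)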
class TSNERepresentation(Representation):

    @staticmethod
    def default_config():
        default_config = Representation.default_config()

        # parameters
        default_config.parameters = Dict()
        default_config.parameters.perplexity = 30.0
        default_config.parameters.init = "random"
        default_config.parameters.random_state = None

        return default_config

    def __init__(self, n_features=28 * 28, n_latents=10, config={}, **kwargs):
        Representation.__init__(self, config=config, **kwargs)

        # input size (flattened)
        self.n_features = n_features
        # latent size
        self.n_latents = n_latents
        # feature range used for min-max normalization
        self.feature_range = (0.0, 1.0)

        # NOTE: scikit-learn's TSNE only supports n_components < 4 with the
        # default 'barnes_hut' method; n_latents=10 requires method='exact'.
        self.algorithm = TSNE(n_components=self.n_latents)
        self.update_algorithm_parameters()

    def fit(self, X_train, update_range=True):
        '''X_train: array-like (n_samples, n_features)'''
        X_train = np.nan_to_num(X_train)
        if update_range:
            # save (min, max) for normalization
            self.feature_range = (X_train.min(axis=0), X_train.max(axis=0))
        X_train = (X_train - self.feature_range[0]) \
            / (self.feature_range[1] - self.feature_range[0])
        self.algorithm.fit(X_train)

    def calc_embedding(self, x):
        # NOTE: scikit-learn's TSNE has no transform(); this call assumes a
        # t-SNE implementation with out-of-sample support (e.g. openTSNE),
        # otherwise fit_transform() must be used on the full data instead.
        x = (x - self.feature_range[0]) \
            / (self.feature_range[1] - self.feature_range[0])
        x = self.algorithm.transform(x)
        return x

    def update_algorithm_parameters(self):
        self.algorithm.set_params(**self.config.parameters, verbose=False)
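# --- Usage sketch (hedged: ``Representation`` and ``Dict`` come from the
# surrounding framework and are not shown here; shapes are illustrative, and
# calc_embedding assumes an out-of-sample-capable t-SNE backend, see the
# note in the class above).
import numpy as np

X_train = np.random.rand(500, 28 * 28)   # stand-in flattened images
config = TSNERepresentation.default_config()
config.parameters.perplexity = 40.0
rep = TSNERepresentation(n_features=28 * 28, n_latents=2, config=config)
rep.fit(X_train)
Z = rep.calc_embedding(X_train)          # (500, 2) embedding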
import os
import random
import shutil

import h5py
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 (enables 3-D projection)
from sklearn.decomposition import NMF, PCA, TruncatedSVD
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

# ``mf`` is the project's local helper module (get_categories,
# folders_creator, is_outlier, imagesfolder_to_gif); its import is assumed.


def dimensionality_reduction(TrainFeatures, TestFeatures, Method, params):
    """
    Performs dimensionality reduction of a training and a test feature
    matrix, each stored in an .h5 file. Five different dimensionality
    reduction methods are available.
    _____________________________________________________________________________________
    Parameters:

    - TrainFeatures: string
        Path of an .h5 file with the training features. It contains at least
        the following datasets:
            - 'feats': array-like, shape (n_samples, n_features)
            - 'labels': array-like, shape (n_samples,)
            - 'img_id': array-like, shape (n_samples,)

    - TestFeatures: string
        Path of an .h5 file with the test features. It contains at least the
        same datasets.

    - Method: string
        Possible values are:
            - 'PCA': Principal Component Analysis
            - 't-SNE': t-distributed Stochastic Neighbor Embedding
            - 'TruncatedSVD': Truncated SVD
            - 'NMF': Non-Negative Matrix Factorization
            - 'LDA': Linear Discriminant Analysis

    - params: dict
        A dictionary containing parameters for the selected estimator. Keys
        and possible values are listed on the following websites:
        http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
        http://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.LinearDiscriminantAnalysis.html
        http://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html
        http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html
        http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html

        For t-SNE an additional key is needed: params['reduce'], with
        possible values 'TruncatedSVD', 'PCA' and 'None'. If the number of
        features is very high, it is highly recommended to first apply
        another dimensionality reduction method (e.g. PCA for dense data or
        TruncatedSVD for sparse data) to reduce the number of dimensions to a
        reasonable amount (e.g. 50). This suppresses some noise and speeds up
        the computation of pairwise distances between samples.
            - params['reduce'] = 'TruncatedSVD': Truncated SVD --> t-SNE
            - params['reduce'] = 'PCA': PCA --> t-SNE
            - params['reduce'] = 'None': t-SNE directly

    Returns:

    - X_train: array-like, shape (n_samples, n_components)
    - X_test: array-like, shape (n_samples, n_components)
    - ax: matplotlib.axes._subplots.AxesSubplot object (if n_components <= 3)
      or None (if n_components > 3)

    Furthermore, two new .h5 files containing three datasets each (reduced
    features, labels and img_id) are automatically generated in the folder
    Results/ReducedFeatures; if n_components <= 3, a scatter plot is also
    saved in the folder Results/Plots.

    Example usage:

    import FeaturesReduction as fr
    import matplotlib.pyplot as plt
    params = {'n_components': 3}
    X_train, X_test, ax = fr.dimensionality_reduction(
        'TrainingFeatures.h5', 'TestFeatures.h5', 'PCA', params)
    plt.show()
    """
    s = os.sep
    # Load training features file
    train = h5py.File(TrainFeatures, 'r')
    train_features = train['feats']
    train_labels = train['labels']
    train_labels = np.squeeze(train_labels)
    train_img_ids = train['img_id']
    # Get categories of the training set from feature ids
    categories = mf.get_categories(train_img_ids)
    # Load test features file
    test = h5py.File(TestFeatures, 'r')
    test_features = test['feats']
    test_labels = test['labels']
    test_labels = np.squeeze(test_labels)
    test_img_ids = test['img_id']
    n_comp = params['n_components']
    if Method != 'NMF':
        # Standardize features by removing the mean and scaling to unit
        # variance (skipped for NMF, which requires non-negative input)
        scaler = StandardScaler().fit(train_features)
        train_features = scaler.transform(train_features)
        test_features = scaler.transform(test_features)
    if Method == 'PCA':
        # Get PCA model
        pca = PCA()
        # Set parameters
        pca.set_params(**params)
        # Fit the model with the training features and
        # apply dimensionality reduction to them
        X_train = pca.fit_transform(train_features)
        # Apply dimensionality reduction to test features
        X_test = pca.transform(test_features)
    elif Method == 'NMF':
        params['verbose'] = True
        # Get NMF model
        nmf = NMF()
        # Set parameters
        nmf.set_params(**params)
        # Fit the model with the training features and
        # apply dimensionality reduction to them
        X_train = nmf.fit_transform(train_features)
        # Apply dimensionality reduction to test features
        X_test = nmf.transform(test_features)
    elif Method == 'LDA':
        # Get LDA model
        lda = LDA()
        # Set parameters
        lda.set_params(**params)
        # Fit the model with the training features (LDA is supervised, so it
        # also needs the labels) and apply dimensionality reduction to them
        X_train = lda.fit_transform(train_features, train_labels)
        # Apply dimensionality reduction to test features
        X_test = lda.transform(test_features)
    elif Method == 't-SNE':
        red = params['reduce']
        del params['reduce']
        print(red)
        params['verbose'] = True
        # Use another dimensionality reduction method (PCA for dense
        # data or TruncatedSVD for sparse data) to reduce the number of
        # dimensions to a reasonable amount (e.g. 50) if the number of
        # features is very high. This will suppress some noise and speed
        # up the computation of pairwise distances between samples.
        if n_comp < 50:
            K = 50
        else:
            K = n_comp * 2
        if red == 'TruncatedSVD':
            # Get TruncatedSVD model
            svd = TruncatedSVD(n_components=K)
            # Fit the model with the training features and
            # apply dimensionality reduction to them
            train_features = svd.fit_transform(train_features)
            # Apply dimensionality reduction to test features
            test_features = svd.transform(test_features)
        elif red == 'PCA':
            # Get PCA model
            pca = PCA(n_components=K)
            # Fit the model with the training features and
            # apply dimensionality reduction to them
            train_features = pca.fit_transform(train_features)
            # Apply dimensionality reduction to test features
            test_features = pca.transform(test_features)
        else:
            pass
        # Get t-SNE model
        tsne = TSNE()
        # Set parameters
        tsne.set_params(**params)
        # Concatenate training and test set: t-SNE has no transform(),
        # so both sets must be embedded in a single fit_transform call
        n_train = train_features.shape[0]
        features = np.concatenate((train_features, test_features), axis=0)
        # Fit the model with the data and apply dimensionality reduction
        X = tsne.fit_transform(features)
        # Separate training and test set
        X_train = X[:n_train, :]
        X_test = X[n_train:, :]
    elif Method == 'TruncatedSVD':
        # Get TruncatedSVD model
        svd = TruncatedSVD()
        # Set parameters
        svd.set_params(**params)
        # Fit the model with the training features and
        # apply dimensionality reduction to them
        X_train = svd.fit_transform(train_features)
        # Apply dimensionality reduction to test features
        X_test = svd.transform(test_features)
    else:
        raise ValueError(
            "Invalid method: possible methods are 'PCA', 't-SNE', "
            "'TruncatedSVD', 'NMF' and 'LDA'")
    # Create folder in which to save the reduced features
    mf.folders_creator('Results', ['ReducedFeatures'])
    # Create an .h5 file and store the reduced training set in it
    name = ('Results' + s + 'ReducedFeatures' + s + Method + str(n_comp) +
            '_' + TrainFeatures.split(s)[-1].split('.')[0] + '.h5')
    f = h5py.File(name, "w")
    f.create_dataset('img_id', data=train_img_ids[:], dtype="S40")
    f.create_dataset('labels', data=train_labels.T, compression="gzip")
    if Method == 'PCA':
        f.create_dataset('pca', data=X_train.T, compression="gzip")
    elif Method == 't-SNE':
        f.create_dataset('tsne', data=X_train.T, compression="gzip")
    elif Method == 'TruncatedSVD':
        f.create_dataset('tsvd', data=X_train.T, compression="gzip")
    elif Method == 'LDA':
        f.create_dataset('lda', data=X_train.T, compression="gzip")
    elif Method == 'NMF':
        f.create_dataset('nmf', data=X_train.T, compression="gzip")
    f.close()
    # Create an .h5 file and store the reduced test set in it
    name = ('Results' + s + 'ReducedFeatures' + s + Method + str(n_comp) +
            '_' + TestFeatures.split(s)[-1].split('.')[0] + '.h5')
    f = h5py.File(name, "w")
    f.create_dataset('img_id', data=test_img_ids[:], dtype="S40")
    f.create_dataset('labels', data=test_labels.T, compression="gzip")
    if Method == 'PCA':
        f.create_dataset('pca', data=X_test.T, compression="gzip")
    elif Method == 't-SNE':
        f.create_dataset('tsne', data=X_test.T, compression="gzip")
    elif Method == 'TruncatedSVD':
        f.create_dataset('tsvd', data=X_test.T, compression="gzip")
    elif Method == 'LDA':
        f.create_dataset('lda', data=X_test.T, compression="gzip")
    elif Method == 'NMF':
        f.create_dataset('nmf', data=X_test.T, compression="gzip")
    f.close()
    if n_comp < 4:
        # Get folders list of the test set from feature ids
        test_folders = mf.get_categories(test_img_ids)
        # Get number of folders
        n_folders_test = len(test_folders)
        # Make some names for the plot legend
        tf = []
        for i in range(n_folders_test):
            tf.append('Test' + str(i))
        # Define a list of colors in hexadecimal format
        if len(categories) + n_folders_test < 9:
            colors = [
                '#FF0000', '#00FF00', '#0000FF', '#FFFF00', '#00FFFF',
                '#808080', '#FF00FF', '#000000'
            ]
        else:
            n = 250
            max_value = 255**3
            interval = int(max_value / n)
            colors = [
                '#' + hex(i)[2:].zfill(6)
                for i in range(0, max_value, interval)
            ]
            colors = colors[:int((n + 1) / 10 * 9)]
            random.shuffle(colors)
        # Create a folder to save images
        mf.folders_creator('Results', ['Plots'])
        # Create a name to save the image
        name = Method + str(n_comp) + '_' + \
            TrainFeatures.split(s)[-1].split('.')[0]
        name = name.split('_')
        name = '_'.join(name[:-1])
        print(X_train.shape)
        print(X_test.shape)
        if n_comp == 1:
            # Plot 1-D data with different colors
            fig, ax = plt.subplots()
            for i in range(len(categories)):
                ax.scatter(X_train[train_labels == i, 0],
                           np.ones(X_train[train_labels == i, 0].shape),
                           c=colors[i],
                           label=categories[i])
            k = len(categories)
            for i in range(n_folders_test):
                ax.scatter(X_test[test_labels == i, 0],
                           np.ones(X_test[test_labels == i, 0].shape),
                           c=colors[k],
                           label=tf[i])
                k += 1
            ax.legend()
            # Save image in .png format
            plt.savefig('Results' + s + 'Plots' + s + name + '.png')
        if n_comp == 2:
            # Plot 2-D data with different colors
            fig, ax = plt.subplots()
            for i in range(len(categories)):
                ax.scatter(X_train[train_labels == i, 0],
                           X_train[train_labels == i, 1],
                           c=colors[i],
                           label=categories[i])
            k = len(categories)
            for i in range(n_folders_test):
                ax.scatter(X_test[test_labels == i, 0],
                           X_test[test_labels == i, 1],
                           c=colors[k],
                           label=tf[i])
                k += 1
            ax.legend()
            # Save image in .png format
            plt.savefig('Results' + s + 'Plots' + s + name + '.png')
            # Remove outliers
            out_train = mf.is_outlier(X_train, thresh=3.5)
            out_test = mf.is_outlier(X_test, thresh=3.5)
            out_train = np.logical_not(out_train)
            out_test = np.logical_not(out_test)
            X_train2 = X_train[out_train, :]
            X_test2 = X_test[out_test, :]
            if (X_train2.shape[0] != X_train.shape[0]
                    or X_test2.shape[0] != X_test.shape[0]):
                train_labels2 = train_labels[out_train]
                test_labels2 = test_labels[out_test]
                # Plot 2-D data without outliers with different colors
                fig, ax = plt.subplots()
                for i in range(len(categories)):
                    ax.scatter(X_train2[train_labels2 == i, 0],
                               X_train2[train_labels2 == i, 1],
                               c=colors[i],
                               label=categories[i])
                k = len(categories)
                for i in range(n_folders_test):
                    ax.scatter(X_test2[test_labels2 == i, 0],
                               X_test2[test_labels2 == i, 1],
                               c=colors[k],
                               label=tf[i])
                    k += 1
                ax.legend()
                # Save image in .png format
                plt.savefig('Results' + s + 'Plots' + s + name +
                            '_noOutliers.png')
        if n_comp == 3:
            mf.folders_creator('Results' + s + 'Plots', ['tmp'])
            # Plot 3-D data with different colors
            ax = plt.subplot(111, projection='3d')
            for i in range(len(categories)):
                ax.scatter(X_train[train_labels == i, 0],
                           X_train[train_labels == i, 1],
                           X_train[train_labels == i, 2],
                           c=colors[i],
                           label=categories[i])
            k = len(categories)
            for i in range(n_folders_test):
                ax.scatter(X_test[test_labels == i, 0],
                           X_test[test_labels == i, 1],
                           X_test[test_labels == i, 2],
                           c=colors[k],
                           label=tf[i])
                k += 1
            ax.legend(loc='upper left',
                      numpoints=1,
                      ncol=3,
                      fontsize=8,
                      bbox_to_anchor=(0, 0))
            # Rotate through 360 degrees and save a frame every 10 degrees
            for angle in range(0, 360, 10):
                ax.view_init(30, angle)
                plt.savefig('Results' + s + 'Plots' + s + 'tmp' + s + name +
                            str(angle) + '.png')
            # Save as a .gif image
            mf.imagesfolder_to_gif(
                'Results' + s + 'Plots' + s + name + '.gif',
                'Results' + s + 'Plots' + s + 'tmp', 0.2)
            shutil.rmtree('Results' + s + 'Plots' + s + 'tmp')
    else:
        ax = None
    return X_train, X_test, ax
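# --- Sketch (not from the original source): the t-SNE pre-reduction chain
# implemented above, shown standalone. PCA first brings dense features down
# to ~50 dimensions, then t-SNE embeds train and test jointly (t-SNE has no
# transform(), so both sets go through a single fit_transform call).
import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE


def pca_then_tsne(train, test, n_components=2, k=50):
    pca = PCA(n_components=k)
    train_r = pca.fit_transform(train)
    test_r = pca.transform(test)
    X = TSNE(n_components=n_components).fit_transform(
        np.concatenate((train_r, test_r), axis=0))
    return X[:len(train_r)], X[len(train_r):]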
import pickle

import numpy as np

# ``model``, ``tr_X``, ``y``, ``te``, ``valid``, ``model_1``, ``model_2`` and
# ``pkl_filename`` are defined earlier in the script.

# Save to file
with open(pkl_filename, 'wb') as file:
    pickle.dump(model, file)

# Load from file
with open(pkl_filename, 'rb') as file:
    pickle_model = pickle.load(file)

#################################################
#############       Ensemble       #############

# bagging - changing seed & averaging
n_bags = 10
seed = 1
bagged_pred = np.zeros(te.shape[0])  # accumulator for the averaged test predictions

for i in range(n_bags):
    model.set_params(random_state=seed + i)
    model.fit(tr_X, y)
    preds = model.predict(te)
    bagged_pred += preds
bagged_pred /= n_bags

# stacking
# predictions on the validation set become level-two training features
valid_pred_1 = model_1.predict(valid)
valid_pred_2 = model_2.predict(valid)
stacked_valid = np.column_stack((valid_pred_1, valid_pred_2))

# predictions on the test set
te_pred_1 = model_1.predict(te)
te_pred_2 = model_2.predict(te)
stacked_te = np.column_stack((te_pred_1, te_pred_2))
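# --- Sketch (not from the original source): the stacking block above builds
# the level-one prediction matrices but never fits a meta-learner. A minimal,
# hedged completion, assuming ``y_valid`` holds the validation targets and a
# linear meta-model is acceptable:
from sklearn.linear_model import LinearRegression

meta_model = LinearRegression()
meta_model.fit(stacked_valid, y_valid)         # level-two model on validation predictions
stacked_pred = meta_model.predict(stacked_te)  # final test-set predictions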
import joblib
import numpy as np
from scipy.stats import mode
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import minmax_scale


class Kmeans_fine_grained():

    def __init__(self, axis, action_name, data_category):
        self.axis = axis
        self.action_name = action_name
        self.data_category = data_category
        fusion_features_path = f'src/fine_grained_features/conv1d_2d_features/{data_category}_features_{axis}_{action_name}.npy'
        self.fusion_features = np.load(fusion_features_path)
        self.Tsne = TSNE(n_components=3, init='pca', random_state=0)
        self.Kmeans = KMeans(n_clusters=7, random_state=0)

    def get_tsne_data(self):
        """Get the t-SNE-reduced node data."""
        print(
            f'======== getting t-SNE reduced data {self.data_category}_{self.axis}_{self.action_name} ============='
        )
        data = []
        targets = []
        for fusion_feature in self.fusion_features:
            # action data
            fusion_feature = fusion_feature.reshape(-1, int(self.axis[0]),
                                                    36)  # (7, 6, 36)
            for label, feature in enumerate(fusion_feature):
                feature = feature.flatten()
                targets.append(label)
                data.append(feature)
        data = np.array(data)
        targets = np.array(targets)
        print(data.shape)
        if self.data_category == 'train':
            tsne_fit = self.Tsne.fit(data)
            tsne_params = tsne_fit.get_params()
            np.save(
                f'src/fine_grained_features/tsne_model/tsne_params_{self.axis}.npy',
                tsne_params)
        else:
            tsne_params_value = np.load(
                f'src/fine_grained_features/tsne_model/tsne_params_{self.axis}.npy',
                allow_pickle=True).item()
            self.Tsne.set_params(**tsne_params_value)
        # NOTE: t-SNE has no transform() for new data; fit_transform below
        # re-embeds from scratch, so reloading the saved parameters only
        # reproduces the hyper-parameter settings, not a trained embedding.
        tsne_data = self.Tsne.fit_transform(data)
        # self.matplotlib(tsne_data)
        tsne_data_targets = []
        for index, tsne in enumerate(tsne_data):
            data_target = np.append(tsne, targets[index])
            tsne_data_targets.append(data_target.tolist())
        tsne_data_targets = np.array(tsne_data_targets)
        print(tsne_data_targets.shape)
        print(
            f'{self.data_category}_tsne_data_{self.axis}_{self.action_name} shape:{tsne_data_targets.shape}'
        )
        np.save(
            f'src/fine_grained_features/tsne_data/{self.data_category}_tsne_data_{self.axis}_{self.action_name}.npy',
            tsne_data_targets)

    def train_kmeans(self):
        data_targets_path = f'src/fine_grained_features/tsne_data/train_tsne_data_{self.axis}_{self.action_name}.npy'
        data_targets = np.load(data_targets_path)
        tsne_data = data_targets[:, :3]
        tsne_targets = data_targets[:, 3]
        tsne_data = minmax_scale(tsne_data)
        kmeans_model = self.Kmeans.fit(tsne_data)
        joblib.dump(
            kmeans_model,
            f'src/fine_grained_features/kmeans_model/kmeans_model_{self.axis}_{self.action_name}.pkl'
        )
        kmeans_cluster = kmeans_model.cluster_centers_  # cluster centroids
        predicted = kmeans_model.predict(tsne_data)
        kmeans_cluster_label_dict = {}  # centroids keyed by their majority label
        # clustering result
        kmeans_data = np.c_[tsne_data, predicted]
        # map each cluster id to the majority ground-truth label
        # (scipy < 1.11: mode() returns arrays, hence the [0] indexing)
        labels = np.zeros_like(predicted)
        for i in range(7):
            mask = (predicted == i)
            labels[mask] = mode(tsne_targets[mask])[0]
            kmeans_cluster_label = int(mode(tsne_targets[mask])[0][0])
            kmeans_cluster_label_dict[
                f'sensor-{kmeans_cluster_label}'] = kmeans_cluster[i]
        # print(kmeans_cluster_label_dict)
        np.save(
            f'src/fine_grained_features/cluster_label_dict/cluster_label_dict_{self.axis}_{self.action_name}.npy',
            kmeans_cluster_label_dict)
        np.save(
            f'src/fine_grained_features/kmeans_data/{self.data_category}_kmeans_data_{self.axis}_{self.action_name}.npy',
            kmeans_data)
        # compute accuracy
        accuracy = accuracy_score(tsne_targets, labels)
        print(f'train_{self.axis}_{self.action_name} accuracy:{accuracy}')

    def predict_kmeans(self):
        data_targets_path = f'src/fine_grained_features/tsne_data/test_tsne_data_{self.axis}_{self.action_name}.npy'
        data_targets = np.load(data_targets_path)
        tsne_data = data_targets[:, :3]
        tsne_targets = data_targets[:, 3]
        tsne_data = minmax_scale(tsne_data)
        kmeans_model = joblib.load(
            f'src/fine_grained_features/kmeans_model/kmeans_model_{self.axis}_{self.action_name}.pkl'
        )
        predicted = kmeans_model.predict(tsne_data)
        # clustering result
        kmeans_data = np.c_[tsne_data, predicted]
        # map each cluster id to the majority ground-truth label
        labels = np.zeros_like(predicted)
        for i in range(7):
            mask = (predicted == i)
            labels[mask] = mode(tsne_targets[mask])[0]
        # compute accuracy
        accuracy = accuracy_score(tsne_targets, labels)
        print(f'test_{self.axis}_{self.action_name} accuracy:{accuracy}')
        np.save(
            f'src/fine_grained_features/kmeans_data/{self.data_category}_kmeans_data_{self.axis}_{self.action_name}.npy',
            kmeans_data)
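# --- Usage sketch (not from the original source; the axis and action names
# are illustrative guesses at the expected format, e.g. ``axis='6axis'`` so
# that ``int(axis[0])`` yields the sensor count used in the reshape).
km = Kmeans_fine_grained(axis='6axis', action_name='walk',
                         data_category='train')
km.get_tsne_data()   # writes the t-SNE data .npy for the train split
km.train_kmeans()    # fits k-means on it and reports train accuracy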
from collections import defaultdict
from math import ceil

import numpy as np
from imblearn.over_sampling.base import BaseOverSampler
from imblearn.utils import check_neighbors_object
from sklearn.base import clone
from sklearn.manifold import TSNE
from sklearn.utils import check_random_state

# ``safe_random_state`` is a project-local helper (pyloras._common); its
# import is assumed here.


class LORAS(BaseOverSampler):
    """Localized Random Affine Shadowsampling (LoRAS).

    This class implements the LoRAS oversampling technique for imbalanced
    datasets. This technique generates Gaussian noise in small neighborhoods
    around the minority class samples; the final synthetic samples are then
    obtained by a convex combination of multiple noisy data points
    (shadowsamples).

    Parameters
    ----------
    {sampling_strategy}

    n_neighbors : int or estimator object, default=None
        If ``int``, number of nearest neighbours used to construct synthetic
        samples. If object, an estimator that inherits from
        :class:`~sklearn.neighbors.base.KNeighborsMixin` that will be used to
        find the k_neighbors.

    n_shadow : int, default=None
        The number of shadow samples to generate per minority class data
        point.

    std : float or sequence, default=0.005
        The standard deviation of the Normal distribution to add to each
        feature when generating shadow samples. If the input is a sequence,
        its size must be equal to the number of features of ``X`` when
        calling the ``fit_resample`` method. If ``float``, the same standard
        deviation will be used for all shadow samples generated.

    n_affine : int, default=None
        The number of shadow samples to use when generating the synthetic
        samples through random affine combinations. If given, the value must
        be between ``2`` and the number of features used in the fitting data.
        If not given, the value will be set to the total number of features
        in the fitting data.

    manifold_learner : object, default=None
        An instance of an object used to perform a 2-dimensional embedding of
        a dataset. It must implement the scikit-learn estimator interface;
        the ``fit_transform`` and ``set_params`` methods must be implemented.
        If not given, the :class:`~sklearn.manifold.TSNE` class is used to
        obtain the 2d manifold of the data. Defaults to None.

    manifold_learner_params : dict, default=None
        A dictionary of additional parameters to pass to the instance of the
        ``manifold_learner`` (or TSNE if ``manifold_learner`` is None) when
        creating a 2D manifold of the fitting data. The keys are the
        parameter names and the values are the values. If not given, the
        default values are used.

    {random_state}

    {n_jobs}

    References
    ----------
    .. [1] Bej, S., Davtyan, N., Wolfien, M. et al. LoRAS: an oversampling
       approach for imbalanced datasets. Mach Learn 110, 279-301 (2021).
       https://doi.org/10.1007/s10994-020-05913-4

    Examples
    --------
    >>> from pyloras import LORAS
    >>> from sklearn.datasets import make_classification
    >>> from collections import Counter
    >>> l = LORAS()
    >>> X, y = make_classification(n_classes=3, class_sep=3,
    ... weights=[0.7, 0.2, 0.1], n_informative=3, n_redundant=1, flip_y=0,
    ... n_features=20, n_clusters_per_class=1, n_samples=2000, random_state=10)
    >>> print('Original dataset shape %s' % Counter(y))
    Original dataset shape Counter({{1: 400, 2: 200, 0: 1400}})
    >>> X_res, y_res = l.fit_resample(X, y)
    >>> print('Resampled dataset shape %s' % Counter(y_res))
    Resampled dataset shape Counter({{1: 1400, 2: 1400, 0: 1400}})
    """

    def __init__(self, *, sampling_strategy="auto", n_neighbors=None,
                 n_shadow=None, std=0.005, n_affine=None,
                 manifold_learner=None, manifold_learner_params=None,
                 random_state=None, n_jobs=None):
        super().__init__(sampling_strategy=sampling_strategy)
        self.n_neighbors = n_neighbors
        self.n_shadow = n_shadow
        self.std = std
        self.n_affine = n_affine
        self.manifold_learner = manifold_learner
        self.manifold_learner_params = manifold_learner_params
        self.random_state = random_state
        self.n_jobs = n_jobs

    def _check_2d_manifold_learner(self):
        if (not hasattr(self.manifold_learner, "fit_transform")
                or not hasattr(self.manifold_learner, "set_params")):
            raise ValueError(
                "The 2d manifold learner must implement the "
                "``fit_transform`` and ``set_params`` methods")
        return clone(self.manifold_learner)

    def _initialize_params(self, X, y, rng):
        """Initialize the parameter values to their appropriate values."""
        f_size = X.shape[1]
        self.n_affine_ = f_size if self.n_affine is None else self.n_affine
        if self.manifold_learner:
            self.manifold_learner_ = self._check_2d_manifold_learner()
        else:
            self.manifold_learner_ = TSNE(n_components=2)
        if self.manifold_learner_params is not None:
            self.manifold_learner_.set_params(**self.manifold_learner_params)
        try:
            self.manifold_learner_.set_params(
                random_state=safe_random_state(rng))
        except ValueError:
            pass

        _, y_counts = np.unique(y, return_counts=True)
        if self.n_neighbors is None:
            n_neighbors = 30 if y_counts.min() >= 100 else 5
        else:
            n_neighbors = self.n_neighbors
        self.nn_ = check_neighbors_object("n_neighbors", n_neighbors)
        if self.n_jobs is not None:
            self.nn_.set_params(n_jobs=self.n_jobs)

        if self.n_shadow is None:
            self.n_shadow_ = max(ceil(2 * f_size / self.nn_.n_neighbors), 40)
        else:
            self.n_shadow_ = self.n_shadow

        if self.n_affine_ >= self.nn_.n_neighbors * self.n_shadow_:
            raise ValueError(
                "The number of shadow samples used to create an affine "
                "random combination must be less than "
                "`n_neighbors * n_shadow`.")

        try:
            iter(self.std)
            self.std_ = self.std
        except TypeError:
            self.std_ = [self.std] * f_size

    def _fit_resample(self, X, y):
        random_state = check_random_state(self.random_state)
        self._initialize_params(X, y, random_state)
        n_features = X.shape[1]
        X_res = [X.copy()]
        y_res = [y.copy()]
        dirichlet_param = [1] * self.n_affine_
        loras_samples = defaultdict(list)

        for minority_class, samples_to_make in self.sampling_strategy_.items():
            if samples_to_make == 0:
                continue
            X_minority = X[y == minority_class]
            X_embedded = self.manifold_learner_.fit_transform(X_minority)
            self.nn_.fit(X_embedded)
            neighborhoods = self.nn_.kneighbors(X_embedded,
                                                return_distance=False)
            num_loras = ceil(samples_to_make / X_embedded.shape[0])
            for neighbor_group in neighborhoods:
                # jitter each neighborhood point with Gaussian noise to
                # obtain the shadow samples
                shadow_sample_size = (self.n_shadow_, self.nn_.n_neighbors,
                                      n_features)
                total_shadow_samples = (
                    X_minority[neighbor_group] + random_state.normal(
                        scale=self.std_, size=shadow_sample_size)).reshape(
                            self.n_shadow_ * self.nn_.n_neighbors, n_features)
                # ``RandomState`` exposes ``randint``; ``integers`` only
                # exists on the newer ``Generator`` API
                random_index = random_state.randint(
                    0, total_shadow_samples.shape[0],
                    size=(num_loras, self.n_affine_))
                # convex (Dirichlet-weighted) combinations of the selected
                # shadow samples yield the synthetic points
                weights = random_state.dirichlet(dirichlet_param,
                                                 size=num_loras)
                loras_samples[minority_class].append(
                    (weights[:, None] @
                     total_shadow_samples[random_index]).reshape(
                         num_loras, n_features))
            # keep only ``samples_to_make`` synthetic samples from the
            # generated ones
            samples_to_drop = X_embedded.shape[0] * num_loras - samples_to_make
            random_state.shuffle(loras_samples[minority_class])
            X_res.append(
                np.concatenate(
                    loras_samples[minority_class])[samples_to_drop:])
            y_res.append([minority_class] * samples_to_make)

        return np.concatenate(X_res), np.concatenate(y_res)
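# --- Illustration (not part of pyloras): the core LoRAS step in isolation.
# Shadow samples are Gaussian-jittered copies of a neighborhood; one
# synthetic point is a convex (Dirichlet-weighted) combination of a few of
# them. All sizes below are illustrative.
import numpy as np

rng = np.random.RandomState(0)
neighborhood = rng.rand(5, 3)            # 5 minority points, 3 features
shadows = (neighborhood +
           rng.normal(scale=0.005, size=(40, 5, 3))).reshape(200, 3)
idx = rng.randint(0, 200, size=4)        # pick n_affine = 4 shadow samples
w = rng.dirichlet([1] * 4)               # convex weights summing to 1
synthetic = w @ shadows[idx]             # one LoRAS sample, shape (3,)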