def test_feature_agglomeration():
    n_clusters = 1
    X = np.array([0, 0, 1]).reshape(1, 3)  # (n_samples, n_features)
    agglo_mean = FeatureAgglomeration(n_clusters=n_clusters,
                                      pooling_func=np.mean)
    agglo_median = FeatureAgglomeration(n_clusters=n_clusters,
                                        pooling_func=np.median)
    agglo_mean.fit(X)
    agglo_median.fit(X)
    assert_true(np.size(np.unique(agglo_mean.labels_)) == n_clusters)
    assert_true(np.size(np.unique(agglo_median.labels_)) == n_clusters)
    assert_true(np.size(agglo_mean.labels_) == X.shape[1])
    assert_true(np.size(agglo_median.labels_) == X.shape[1])

    # Test transform
    Xt_mean = agglo_mean.transform(X)
    Xt_median = agglo_median.transform(X)
    assert_true(Xt_mean.shape[1] == n_clusters)
    assert_true(Xt_median.shape[1] == n_clusters)
    assert_true(Xt_mean == np.array([1 / 3.]))
    assert_true(Xt_median == np.array([0.]))

    # Test inverse transform
    X_full_mean = agglo_mean.inverse_transform(Xt_mean)
    X_full_median = agglo_median.inverse_transform(Xt_median)
    assert_true(np.unique(X_full_mean[0]).size == n_clusters)
    assert_true(np.unique(X_full_median[0]).size == n_clusters)
    assert_array_almost_equal(agglo_mean.transform(X_full_mean), Xt_mean)
    assert_array_almost_equal(agglo_median.transform(X_full_median), Xt_median)
def test_ward_agglomeration():
    """
    Check that we obtain the correct solution in a simplistic case
    """
    rnd = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=bool)  # np.bool was removed in NumPy 1.24
    X = rnd.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    assert_warns(DeprecationWarning, WardAgglomeration)

    with ignore_warnings():
        ward = WardAgglomeration(n_clusters=5, connectivity=connectivity)
        ward.fit(X)

    agglo = FeatureAgglomeration(n_clusters=5, connectivity=connectivity)
    agglo.fit(X)
    assert_array_equal(agglo.labels_, ward.labels_)
    assert_true(np.size(np.unique(agglo.labels_)) == 5)

    X_red = agglo.transform(X)
    assert_true(X_red.shape[1] == 5)
    X_full = agglo.inverse_transform(X_red)
    assert_true(np.unique(X_full[0]).size == 5)
    assert_array_almost_equal(agglo.transform(X_full), X_red)

    # Check that fitting with no samples raises a ValueError
    assert_raises(ValueError, agglo.fit, X[:0])
def FeatureSelection(df, numeric_cols, corrCoefThres=0.9):
    """Keep one representative feature per cluster of highly correlated
    numeric columns (|r| >= corrCoefThres)."""
    from sklearn.cluster import FeatureAgglomeration

    numdf = df[numeric_cols]
    r_in_x = numdf.corr().abs()
    distance_in_x = 1 / r_in_x
    for i in range(r_in_x.shape[0]):
        # Push the diagonal far away so a feature never clusters with itself.
        # Note: the original used `10 ^ 10`, which is XOR (== 0) in Python,
        # not exponentiation.
        distance_in_x.iloc[i, i] = 10 ** 10
    cpdist = distance_in_x.copy()
    cpdist = cpdist.fillna(cpdist.max().max())

    corrcoefmin = corrCoefThres
    # Cut the average-linkage tree at distance 1/corrcoefmin: feature pairs
    # with distance 1/|r| below that (i.e. |r| >= corrcoefmin) get merged.
    fa = FeatureAgglomeration(n_clusters=None, affinity="precomputed",
                              compute_full_tree=True, linkage="average",
                              distance_threshold=1 / corrcoefmin)
    fa.fit(cpdist)
    fadf = pd.DataFrame({"feature": numdf.columns.values, "label": fa.labels_})
    selectedFeatures = fadf.groupby("label").head(1)["feature"].values
    return selectedFeatures
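# A minimal usage sketch for FeatureSelection above (an illustration, not
# part of the original: assumes numpy as np and pandas as pd are imported,
# and a scikit-learn version that still accepts affinity="precomputed").
# "b" nearly duplicates "a", so only one of the pair should survive.
rng = np.random.RandomState(0)
base = rng.randn(200)
demo = pd.DataFrame({
    "a": base,
    "b": base + 0.01 * rng.randn(200),  # |r(a, b)| ~ 1 -> same cluster
    "c": rng.randn(200),                # uncorrelated -> its own cluster
})
print(FeatureSelection(demo, ["a", "b", "c"], corrCoefThres=0.9))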
def test_feature_agglomeration():
    n_clusters = 1
    X = np.array([0, 0, 1]).reshape(1, 3)  # (n_samples, n_features)
    agglo_mean = FeatureAgglomeration(n_clusters=n_clusters,
                                      pooling_func=np.mean)
    agglo_median = FeatureAgglomeration(n_clusters=n_clusters,
                                        pooling_func=np.median)
    with pytest.warns(None) as record:
        agglo_mean.fit(X)
    assert not len(record)
    with pytest.warns(None) as record:
        agglo_median.fit(X)
    assert not len(record)

    assert np.size(np.unique(agglo_mean.labels_)) == n_clusters
    assert np.size(np.unique(agglo_median.labels_)) == n_clusters
    assert np.size(agglo_mean.labels_) == X.shape[1]
    assert np.size(agglo_median.labels_) == X.shape[1]

    # Test transform
    Xt_mean = agglo_mean.transform(X)
    Xt_median = agglo_median.transform(X)
    assert Xt_mean.shape[1] == n_clusters
    assert Xt_median.shape[1] == n_clusters
    assert Xt_mean == np.array([1 / 3.0])
    assert Xt_median == np.array([0.0])

    # Test inverse transform
    X_full_mean = agglo_mean.inverse_transform(Xt_mean)
    X_full_median = agglo_median.inverse_transform(Xt_median)
    assert np.unique(X_full_mean[0]).size == n_clusters
    assert np.unique(X_full_median[0]).size == n_clusters
    assert_array_almost_equal(agglo_mean.transform(X_full_mean), Xt_mean)
    assert_array_almost_equal(agglo_median.transform(X_full_median),
                              Xt_median)
def test_ward_agglomeration():
    """
    Check that we obtain the correct solution in a simplistic case
    """
    rnd = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=bool)  # np.bool was removed in NumPy 1.24
    X = rnd.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    assert_warns(DeprecationWarning, WardAgglomeration)

    with warnings.catch_warnings(record=True) as warning_list:
        warnings.simplefilter("always", DeprecationWarning)
        if hasattr(np, 'VisibleDeprecationWarning'):
            # Let's not catch the numpy internal DeprecationWarnings
            warnings.simplefilter('ignore', np.VisibleDeprecationWarning)
        ward = WardAgglomeration(n_clusters=5, connectivity=connectivity)
        ward.fit(X)
    assert_equal(len(warning_list), 1)

    agglo = FeatureAgglomeration(n_clusters=5, connectivity=connectivity)
    agglo.fit(X)
    assert_array_equal(agglo.labels_, ward.labels_)
    assert_true(np.size(np.unique(agglo.labels_)) == 5)

    X_red = agglo.transform(X)
    assert_true(X_red.shape[1] == 5)
    X_full = agglo.inverse_transform(X_red)
    assert_true(np.unique(X_full[0]).size == 5)
    assert_array_almost_equal(agglo.transform(X_full), X_red)

    # Check that fitting with no samples raises a ValueError
    assert_raises(ValueError, agglo.fit, X[:0])
def _feature_agglomeration_fit_method(data, n_parcels, connectivity, linkage):
    """Feature Agglomeration algorithm to fit on the data.

    Parameters
    ----------
    data : array_like, shape=(n_samples, n_voxels)
        Masked subjects data.

    n_parcels : int
        Number of parcels to parcellate.

    connectivity : ndarray
        Connectivity matrix. Defines for each feature the neighbouring
        features following a given structure of the data.

    linkage : str
        Which linkage criterion to use: 'ward', 'complete' or 'average'.

    Returns
    -------
    labels : ndarray
        Labels to the data.
    """
    ward = FeatureAgglomeration(n_clusters=n_parcels,
                                connectivity=connectivity, linkage=linkage)
    ward.fit(data)
    return ward.labels_
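# A small usage sketch for _feature_agglomeration_fit_method (an
# illustration, not part of the original: toy data on a 10x10 grid stands
# in for masked voxels; numpy as np is assumed to be imported).
from sklearn.feature_extraction.image import grid_to_graph

connectivity = grid_to_graph(10, 10)            # 100 "voxels" on a grid
data = np.random.RandomState(0).randn(20, 100)  # 20 samples
labels = _feature_agglomeration_fit_method(data, n_parcels=5,
                                           connectivity=connectivity,
                                           linkage='ward')
print(labels.shape, np.unique(labels))          # (100,) [0 1 2 3 4]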
def feature_agglomeration(voters_data, n, rounding=False):
    featagg = FeatureAgglomeration(n_clusters=n)
    featagg.fit(voters_data)
    condensed = featagg.transform(voters_data)

    # Map each original feature to its cluster id.
    feature_groups_map = dict(zip(voters_data.columns, featagg.labels_))

    # Build a readable label for each cluster by joining the names of the
    # features it contains.
    group_labels = []
    for feature_group_no in set(featagg.labels_):
        members = [name for name, group_no in feature_groups_map.items()
                   if group_no == feature_group_no]
        group_labels.append(", ".join(members))

    voters_agglomerated = pd.DataFrame(condensed, columns=group_labels,
                                       index=voters_data.index)
    if rounding:
        voters_agglomerated = voters_agglomerated.applymap(round)
    print("🔹→💠←🔹 {} features agglomerated into {} hybrid features.".format(
        len(voters_data.columns), len(voters_agglomerated.columns)))
    return voters_agglomerated
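# Usage sketch for the voters helper above (an illustration with invented
# toy roll-call data; pandas as pd assumed). Two bills with near-identical
# voting patterns should collapse into one hybrid feature.
votes = pd.DataFrame({
    "bill_a": [1, 1, 0, 0],
    "bill_b": [1, 1, 0, 1],
    "bill_c": [0, 0, 1, 1],
}, index=["v1", "v2", "v3", "v4"])
agglomerated = feature_agglomeration(votes, n=2, rounding=True)
print(agglomerated.columns.tolist())  # e.g. ['bill_a, bill_b', 'bill_c']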
def feature_agglomeration(X, args=None):
    """Cluster the features with hierarchical clustering, then use the
    clustering for feature dimensionality reduction."""
    from sklearn.cluster import FeatureAgglomeration

    # `args=None` avoids the original mutable default argument `args={}`.
    fam = FeatureAgglomeration(**(args or {}))
    fam.fit(X)
    return fam
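# Usage sketch for the wrapper above (an illustration on random toy data;
# numpy as np assumed).
X = np.random.RandomState(0).randn(30, 8)
fam = feature_agglomeration(X, {"n_clusters": 3})
print(fam.transform(X).shape)  # (30, 3)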
class FeatureAgglomerationDecomposer(Transformer):
    type = 11

    def __init__(self, n_clusters=2, affinity='euclidean', linkage='ward',
                 pooling_func='mean', random_state=1):
        super().__init__("feature_agglomeration_decomposer")
        self.input_type = [NUMERICAL, DISCRETE, CATEGORICAL]
        self.compound_mode = 'only_new'
        self.output_type = NUMERICAL

        self.n_clusters = n_clusters
        self.affinity = affinity
        self.linkage = linkage
        self.pooling_func = pooling_func
        self.random_state = random_state

        self.pooling_func_mapping = dict(mean=np.mean, median=np.median,
                                         max=np.max)

    @ease_trans
    def operate(self, input_datanode, target_fields=None):
        from sklearn.cluster import FeatureAgglomeration

        X, y = input_datanode.data
        if self.model is None:
            self.n_clusters = int(self.n_clusters)
            n_clusters = min(self.n_clusters, X.shape[1])
            if not callable(self.pooling_func):
                self.pooling_func = self.pooling_func_mapping[self.pooling_func]
            self.model = FeatureAgglomeration(
                n_clusters=n_clusters, affinity=self.affinity,
                linkage=self.linkage, pooling_func=self.pooling_func)
            self.model.fit(X)
        X_new = self.model.transform(X)
        return X_new

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None,
                                        optimizer='smac'):
        cs = ConfigurationSpace()
        n_clusters = UniformIntegerHyperparameter("n_clusters", 2, 400,
                                                  default_value=25)
        affinity = CategoricalHyperparameter(
            "affinity", ["euclidean", "manhattan", "cosine"],
            default_value="euclidean")
        linkage = CategoricalHyperparameter(
            "linkage", ["ward", "complete", "average"], default_value="ward")
        pooling_func = CategoricalHyperparameter(
            "pooling_func", ["mean", "median", "max"], default_value="mean")
        cs.add_hyperparameters([n_clusters, affinity, linkage, pooling_func])
        # Ward linkage requires euclidean affinity; forbid the combination.
        affinity_and_linkage = ForbiddenAndConjunction(
            ForbiddenInClause(affinity, ["manhattan", "cosine"]),
            ForbiddenEqualsClause(linkage, "ward"))
        cs.add_forbidden_clause(affinity_and_linkage)
        return cs
def untangle(X: Iterable,
             y: Iterable,
             n_clusters: int = None,
             get_connectivity: bool = True,
             compute_distances: bool = True,
             kind: str = 'correlation',
             agglo_kws: Union[dict, Bunch] = None) -> FeatureAgglomeration:
    from nilearn.connectome import ConnectivityMeasure as CM
    from sklearn.cluster import FeatureAgglomeration
    from sklearn.covariance import LedoitWolf
    from sklearn.feature_selection import SelectKBest
    from sklearn.feature_selection import mutual_info_classif

    agglo_defs = dict(affinity='euclidean',
                      compute_full_tree='auto',
                      linkage='ward',
                      pooling_func=np.mean,
                      distance_threshold=None,
                      compute_distances=compute_distances)

    if get_connectivity is True:
        connect_mat = CM(LedoitWolf(), kind=kind).fit_transform([X.values])[0]
    else:
        connect_mat = None

    if n_clusters is None:
        n_clusters = divmod(X.shape[1], 2)[0] - 1
        if n_clusters == 0:
            n_clusters = 1

    if agglo_kws is None:
        agglo_kws = {}
    agglo_defs.update(agglo_kws)

    agglo = FeatureAgglomeration(n_clusters=n_clusters,
                                 connectivity=connect_mat,
                                 **agglo_defs)
    if not isinstance(y, pd.Series):
        y = pd.Series(y)
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)
    agglo.fit(X, y)
    setattr(
        agglo, 'cluster_indexes_',
        pd.DataFrame(zip(agglo.labels_, agglo.feature_names_in_),
                     columns=['cluster',
                              'feature']).groupby('cluster').feature)

    # Within each cluster, keep the single feature with the highest mutual
    # information with the target.
    skb = SelectKBest(k=1, score_func=mutual_info_classif)
    factor_leaders_ = [
        skb.fit(X[itm[1]], y).get_feature_names_out()[0]
        for itm in tuple(agglo.cluster_indexes_)
    ]
    setattr(agglo, 'factor_leaders_', factor_leaders_)
    return agglo
def get_encoder(metas, train_data, target_output_dim):
    tmpdir = metas['workspace']
    model_path = os.path.join(tmpdir, 'feature_agglomeration.model')
    model = FeatureAgglomeration(n_clusters=target_output_dim)
    model.fit(train_data)
    with open(model_path, 'wb') as f:
        pickle.dump(model, f)
    return FeatureAgglomerationEncoder(model_path=model_path)
def test_feature_agglomeration_feature_names_out():
    """Check `get_feature_names_out` for `FeatureAgglomeration`."""
    X, _ = make_blobs(n_features=6, random_state=0)
    agglo = FeatureAgglomeration(n_clusters=3)
    agglo.fit(X)
    n_clusters = agglo.n_clusters_
    names_out = agglo.get_feature_names_out()
    assert_array_equal(
        [f"featureagglomeration{i}" for i in range(n_clusters)], names_out)
def get_clusters(X: pd.DataFrame, n_clusters: int):
    clt = FeatureAgglomeration(n_clusters=n_clusters)
    clt.fit(X)
    clusters = []
    for i in range(n_clusters):
        clusters.append(X.columns[clt.labels_ == i].tolist())
    return clusters  # type: list[list[str]] -- one list of column names per cluster
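# Usage sketch for get_clusters (an illustration on a synthetic frame;
# numpy as np and pandas as pd assumed).
rng = np.random.RandomState(0)
df = pd.DataFrame(rng.randn(100, 4), columns=list("abcd"))
df["b"] = df["a"] + 0.01 * rng.randn(100)   # make "a" and "b" near-duplicates
print(get_clusters(df, n_clusters=3))       # e.g. [['a', 'b'], ['c'], ['d']]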
def main():
    # Parameters
    data_directory = '../../data/generated-data-r-10-n-6-4/'
    features_path = '../../data/features-generated-data-r-10-n-6-4'
    booking_file = '../../data/booking.csv'
    users_file = '../../data/user.csv'
    rating_thresholds = []
    true_objects_indexes = [0, 1, 2, 3, 4, 5]
    false_objects_indexes = [6, 7, 8, 9]

    file_names = os.listdir(data_directory)
    img_ids_vector = [int(name.split('-')[0]) for name in file_names]
    ratings_vector = [int(name.split('-')[-2]) for name in file_names]
    name_vector = [data_directory + name for name in file_names]
    images_indexes = [name.split('-')[3].split('.')[0] for name in file_names]

    ratings_matrix, images_indexes_for_id, ids_indexes, users_matrix = load_data(
        data_directory, booking_file, users_file, rating_thresholds)
    features = get_features(features_path, name_vector)

    fa = FeatureAgglomeration(n_clusters=50)
    fa.fit(features)
    features = fa.transform(features)

    scores_auc = []
    scores_rmse = []
    for i in range(10):
        cv_results_file = '../results/cv-generated-data-r-10-n-6-4-rf-fa-' + \
            str(i) + '.csv'
        selection = ObjectSelection(show_selection_results=False,
                                    selection_algorithm='rf')
        selection.transform(ids=img_ids_vector, features=features,
                            ratings=ratings_vector,
                            users_ratings=ratings_matrix, users=users_matrix,
                            cv_results_file=cv_results_file,
                            images_indexes=images_indexes,
                            true_objects_indexes=true_objects_indexes,
                            false_objects_indexes=false_objects_indexes,
                            paths=name_vector, z_score=False)
        selection.evaluate(evaluation_metric='auc')
        selection.evaluate(evaluation_metric='rmse')
        print('\n\n-----\n\n')
        score_auc, score_rmse = selection.evaluate(evaluation_metric='auc')
        scores_auc.append(score_auc)
        scores_rmse.append(score_rmse)

    results_file = '../scores/generated-data-r-10-n-6-4-rf-fa-auc.csv'
    save_scores(scores_auc, results_file)
    results_file = '../scores/generated-data-r-10-n-6-4-rf-fa-rmse.csv'
    save_scores(scores_rmse, results_file)
def variable_clustering(self, X_cat, woe_iv_df, n_clusters=15):
    X_transformed = mt.BinWoe().transform_x_all(X_cat, woe_iv_df)
    agglo = FeatureAgglomeration(n_clusters=n_clusters)
    # Cap the fitting sample at 20,000 rows to keep the clustering tractable.
    if len(X_transformed) > 20000:
        X_agglo = X_transformed.sample(20000)
    else:
        X_agglo = X_transformed.copy()
    agglo.fit(X_agglo)
    # Column '指标英文' holds the variable's English name.
    vars_clusters = pd.DataFrame(
        data={'指标英文': X_transformed.columns.tolist(),
              'cluster': list(agglo.labels_)}).sort_values('cluster')
    return vars_clusters, X_transformed
def cont_feature_clusters_sklearn(self, n_clusters=5):
    """
    This uses feature agglomeration from scikit-learn and only works for
    continuous variables. Eventually expand this to categorical variables
    using a Cramer's V covariance matrix, similar to the R iclust package.
    """
    from sklearn.cluster import FeatureAgglomeration

    Cluster = FeatureAgglomeration(n_clusters=n_clusters)
    Cluster.fit(self._dataset.iloc[:, self._cont_index_predictors])
    df = pd.DataFrame(
        {'Variable': self._dataset.columns[self._cont_index_predictors],
         'Cluster': Cluster.labels_})
    return df.sort_values(by='Cluster')
def _ward_fit_transform(all_subjects_data, fit_samples_indices,
                        connectivity, n_parcels, offset_labels):
    """Ward clustering algorithm on a subsample and apply to the whole dataset.

    Computes a brain parcellation using Ward's clustering algorithm on some
    images, then averages the signal within parcels in order to reduce the
    dimension of the images of the whole dataset.

    This function is used with Randomized Parcellation Based Inference, so
    we need to save the labels to further perform the inverse transformation
    operation. The function therefore needs an offset to be applied on the
    labels so that they are unique across parcellations.

    Parameters
    ----------
    all_subjects_data : array_like, shape=(n_samples, n_voxels)
        Masked subject images as an array.

    fit_samples_indices : array-like
        Indices of the samples used to compute the parcellation.

    connectivity : scipy.sparse.coo_matrix
        Graph representing the spatial structure of the images (i.e.
        connections between voxels).

    n_parcels : int
        Number of parcels for the parcellations.

    offset_labels : int
        Offset for labels numbering. The purpose is to have different labels
        in all the parcellations that can be built by multiple calls to the
        current function.

    Returns
    -------
    parcelled_data : numpy.ndarray, shape=(n_samples, n_parcels)
        Average signal within each parcel for each subject.

    labels : np.ndarray, shape=(n_voxels,)
        Labels giving the correspondence between voxels and parcels.
    """
    # fit part
    data_fit = all_subjects_data[fit_samples_indices]
    ward = FeatureAgglomeration(n_clusters=n_parcels,
                                connectivity=connectivity)
    ward.fit(data_fit)

    # transform part
    labels = ward.labels_ + offset_labels  # unique labels across parcellations
    parcelled_data = ward.transform(all_subjects_data)
    return parcelled_data, labels
def cluster_sentences(sentences, nb_of_clusters=5):
    tfidf_vectorizer = TfidfVectorizer(tokenizer=word_tokenizer,
                                       stop_words=stopwords.words('english'),
                                       max_df=0.9, min_df=0.05,
                                       lowercase=True)
    # Build a tf-idf matrix for the sentences.
    tfidf_matrix = tfidf_vectorizer.fit_transform(sentences).todense()
    agglo = FeatureAgglomeration(n_clusters=nb_of_clusters)
    agglo.fit(tfidf_matrix)
    # Caution: FeatureAgglomeration clusters the *columns* of the matrix, so
    # labels_ indexes tf-idf terms, not sentences. To group sentences,
    # transpose the matrix first (or use a sample-clustering estimator such
    # as AgglomerativeClustering).
    clusters = collections.defaultdict(list)
    for i, label in enumerate(agglo.labels_):
        clusters[label].append(i)
    return dict(clusters)
def data_compression(fmri_masked, mask_img, mask_np, output_size):
    """
    fmri_masked : array_like
        A matrix of shape (`V`, `N`) with `V` voxels and `N` timepoints;
        the functional dataset that needs to be reduced.
    mask_img : nibabel image object of the mask
    mask_np : numpy array of the mask
    output_size : int
        The number of elements that the data should be reduced to.
    """
    import time
    from sklearn.feature_extraction import image
    from sklearn.cluster import FeatureAgglomeration

    # Perform Ward clustering on the voxel grid.
    shape = mask_np.shape
    connectivity = image.grid_to_graph(n_x=shape[0], n_y=shape[1],
                                       n_z=shape[2], mask=mask_np)

    start = time.time()
    ward = FeatureAgglomeration(n_clusters=output_size,
                                connectivity=connectivity, linkage='ward')
    ward.fit(fmri_masked)
    # print("Ward agglomeration compressing voxels into clusters: %.2fs"
    #       % (time.time() - start))

    labels = ward.labels_
    data_reduced = ward.transform(fmri_masked)
    return {'data': data_reduced, 'labels': labels}
def data_compression(fmri_masked, mask_img, mask_np, compression_dim):
    # TODO @AKI update doc
    """
    Reduce the dimension of a masked functional dataset by Ward feature
    agglomeration.

    Parameters
    ----------
    fmri_masked : np.ndarray[ndim=2]
        A matrix of shape (`V`, `N`) with `V` voxels and `N` timepoints;
        the functional dataset that needs to be reduced.
    mask_img : an nibabel img object of the mask
    mask_np : a numpy array of the mask
    compression_dim : integer
        The number of elements that the data should be reduced to.

    Returns
    -------
    A dictionary with the fitted compressor, the compressed data and the
    voxel-to-parcel labels.
    """
    from sklearn.feature_extraction import image
    from sklearn.cluster import FeatureAgglomeration

    # Perform Ward clustering.
    shape = mask_np.shape
    connectivity = image.grid_to_graph(n_x=shape[0], n_y=shape[1],
                                       n_z=shape[2], mask=mask_np)
    ward = FeatureAgglomeration(n_clusters=compression_dim,
                                connectivity=connectivity, linkage='ward')
    ward.fit(fmri_masked)
    labels = ward.labels_
    data_reduced = ward.transform(fmri_masked)
    return {
        'compressor': ward,
        'compressed': data_reduced,
        'labels': labels,
    }
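# Usage sketch for data_compression (an illustration: a tiny synthetic 3-D
# mask stands in for a real brain mask; mask_img is unused by the function
# body, so None is passed here).
import numpy as np

mask = np.ones((4, 4, 3), dtype=bool)                   # 48 "voxels"
fmri = np.random.RandomState(0).randn(10, mask.sum())   # 10 timepoints
out = data_compression(fmri, mask_img=None, mask_np=mask, compression_dim=5)
print(out['compressed'].shape)  # (10, 5)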
def agglomeration(self, n_clusters):
    agglo = FeatureAgglomeration(n_clusters=n_clusters)
    agglo.fit(self.X_train)

    # Group column names and column indices by cluster label.
    grouped_names = []
    grouped_idx = []
    col_idx = np.arange(len(self.cols))
    for label in np.unique(agglo.labels_):
        group = [name for (name, group_id) in zip(self.cols, agglo.labels_)
                 if group_id == label]
        group_idx = [idx for (idx, group_id) in zip(col_idx, agglo.labels_)
                     if group_id == label]
        grouped_names.append(group)
        grouped_idx.append(group_idx)
    return grouped_names, grouped_idx
def test_ward_agglomeration():
    # Check that we obtain the correct solution in a simplistic case
    rng = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=bool)  # np.bool was removed in NumPy 1.24
    X = rng.randn(50, 100)
    connectivity = grid_to_graph(*mask.shape)
    agglo = FeatureAgglomeration(n_clusters=5, connectivity=connectivity)
    agglo.fit(X)
    assert np.size(np.unique(agglo.labels_)) == 5

    X_red = agglo.transform(X)
    assert X_red.shape[1] == 5
    X_full = agglo.inverse_transform(X_red)
    assert np.unique(X_full[0]).size == 5
    assert_array_almost_equal(agglo.transform(X_full), X_red)

    # Check that fitting with no samples raises a ValueError
    with pytest.raises(ValueError):
        agglo.fit(X[:0])
class Hierarchi(object):
    """All hierarchical algorithms are implemented here."""

    def __init__(self, method, data, n_clusters=2, random_state=0):
        """
        Initialize all the parameters.

        method: name of the algorithm (lower case, joined by underscores)
        data: data (2-D matrix)
        n_clusters: number of clusters
        random_state: random initial state
        """
        self.method = method
        self.data = data
        self.n_clusters = n_clusters
        np.random.seed(random_state)
        self.affinity = "euclidean"
        self.linkage = "ward"
        self.distance_threshold = None

    def setup(self, keywords={}):
        """Set up the algorithms."""
        for p in keywords.keys():
            setattr(self, p, keywords[p])
        if self.method == "agglomerative":
            self.obj = AgglomerativeClustering(n_clusters=self.n_clusters,
                                               linkage=self.linkage,
                                               affinity=self.affinity)
        if self.method == "feature":
            self.obj = FeatureAgglomeration(
                n_clusters=self.n_clusters, linkage=self.linkage,
                affinity=self.affinity,
                distance_threshold=self.distance_threshold)

    def run(self):
        """Run the models."""
        if self.method in ("agglomerative", "feature"):
            self.obj.fit(self.data)
def run_evaluation():
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', help="Image folder.", default="faces")
    parser.add_argument('--output', help="Statistics output folder.",
                        default="stats")
    args = parser.parse_args()

    # load embeddings
    emb_1 = load_embeddings('embeddings_matthias.pkl')
    emb_2 = load_embeddings('embeddings_laia.pkl')
    emb_3 = load_embeddings('embeddings_elias.pkl')
    emb_lfw = load_embeddings('embeddings_lfw.pkl')

    if emb_1 is None or emb_2 is None:
        print("--- embeddings could not be loaded. Aborting...")
        return

    # ------------------- EVALUATION ON ORIGINAL VECTORS
    ph = PlotHandler()

    # ==== 1. PCA DIMENSION REDUCTION
    # ph.PlotVarianceContribution(emb_lfw)
    # # reduce dimensionality
    # basis, mean = ExtractSubspace(emb_lfw, 0.999)
    # # dump_to_hd("lfw_99.9_subspace.pkl", (basis, mean))
    # reduced_data = ProjectOntoSubspace(emb_lfw, mean, basis)
    # ph.SetTitle("Component Variance Contribution on Subspace")
    # ph.PlotVarianceContribution(reduced_data)
    # ph.Show()

    # ==== 2. FEATURE AGGLOMERATION
    agglo = FeatureAgglomeration(n_clusters=20)
    agglo.fit(emb_lfw)
    X_reduced = agglo.transform(emb_1)
    print(np.shape(X_reduced))
def perform_feature_agglomeration(train_X, train_Y, test_X, test_Y):
    n_clusters = [32]
    fagg_model_accuracies = pd.DataFrame()
    for n_cluster in n_clusters:
        agglo = FeatureAgglomeration(connectivity=None, n_clusters=n_cluster)
        agglo.fit(train_X)
        train_X_reduced = agglo.transform(train_X)
        test_X_reduced = agglo.transform(test_X)

        svc_acc_val = perform_svc(train_X_reduced, train_Y,
                                  test_X_reduced, test_Y)
        rfc_acc_val = perform_rfc(train_X_reduced, train_Y,
                                  test_X_reduced, test_Y)
        knn_acc_val = perform_knn(train_X_reduced, train_Y,
                                  test_X_reduced, test_Y)
        lr_acc_val = perform_linear_regression(train_X_reduced, train_Y,
                                               test_X_reduced, test_Y)
        lc_acc_val = perform_linear_lasso(train_X_reduced, train_Y,
                                          test_X_reduced, test_Y)
        rr_acc_val = perform_ridge_regression(train_X_reduced, train_Y,
                                              test_X_reduced, test_Y)
        enet_acc_val = perform_elastinet_regression(train_X_reduced, train_Y,
                                                    test_X_reduced, test_Y)
        fagg_model_accuracies = fagg_model_accuracies.append([
            svc_acc_val, rfc_acc_val, knn_acc_val, lr_acc_val,
            lc_acc_val, rr_acc_val, enet_acc_val
        ])

    cols = list(fagg_model_accuracies.columns.values)
    cols = cols[-1:] + cols[:-1]
    fagg_model_accuracies = fagg_model_accuracies[cols]
    fagg_model_accuracies = fagg_model_accuracies.sort_values(by='r2_score')
    return fagg_model_accuracies
def construction_pathway_clusters_groups(
        data_path=data_tn_new_label_unbalanced_cpg_rna_rna_iso_mirna,
        model_loaded=False,
        return_views='all',
        output_file_name='pathway_file_clusters_genes',
        model_agglomeration_file_name='feature_agglomeration_model.pck'):
    """
    Construct groups based on clustering.

    Args:
        data_path: str, data path
        model_loaded: bool, if True, load model_agglomeration_file_name;
            if False, re-do the fit
        return_views: str, correct view for the group
        output_file_name: str, output file name
        model_agglomeration_file_name: path to a .pck file if the feature
            agglomeration fit has already been run
    Returns:
        output_file_name
    """
    from sklearn.cluster import FeatureAgglomeration
    agglo = FeatureAgglomeration(n_clusters=1000)
    x, y, features_names, _ = load_data(data=data_path,
                                        return_views=return_views)
    output_file_name = output_file_name + '_{}.tsv'.format(return_views)
    if model_loaded:
        assert model_agglomeration_file_name != '', \
            'You should give the model agglomeration file name'
        with open(model_agglomeration_file_name, 'rb') as f:
            agglo = pickle.load(f)
    else:
        agglo.fit(x)
        with open(model_agglomeration_file_name, 'wb') as f:
            pickle.dump(agglo, f)
    groups_and_features = list(zip(features_names, agglo.labels_))
    with open(output_file_name, 'w') as f:
        f.write('G\tIDS\n')
        for zip_el in groups_and_features:
            f.write('{}\t{}\n'.format(zip_el[1], zip_el[0]))
def FeatureClusters(p_data, p_predictors, p_numeric_cat_index=np.array([]),
                    p_n_clusters=5):
    """
    This uses feature agglomeration from scikit-learn and only works for
    continuous variables. Eventually expand this to categorical variables
    using a Cramer's V covariance matrix, similar to the R iclust package.
    """
    from sklearn.cluster import FeatureAgglomeration

    # Find clusters of correlated (continuous) variables.
    cont_index = np.intersect1d(p_predictors,
                                ContCatSplit(p_data, p_numeric_cat_index)[0])
    Cluster = FeatureAgglomeration(n_clusters=p_n_clusters)
    Cluster.fit(p_data.iloc[:, cont_index])
    df = pd.DataFrame({
        'Variable': p_data.columns[cont_index],
        'Cluster': Cluster.labels_
    })
    return df.sort_values(by='Cluster')
def test_random_feature_agglomeration_encoder_load():
    from sklearn.cluster import FeatureAgglomeration

    train_data = np.random.rand(2000, input_dim)
    model = FeatureAgglomeration(n_clusters=target_output_dim)
    filename = 'feature_agglomeration_model.model'
    with open(filename, 'wb') as f:
        pickle.dump(model.fit(train_data), f)
    encoder = TransformEncoder(model_path=filename)

    test_data = np.random.rand(10, input_dim)
    encoded_data = encoder.encode(test_data)
    transformed_data = model.transform(test_data)
    assert encoded_data.shape == (test_data.shape[0], target_output_dim)
    assert type(encoded_data) == np.ndarray
    np.testing.assert_almost_equal(transformed_data, encoded_data)

    save_and_load(encoder, False)
    save_and_load_config(encoder, False, train_data)
    rm_files([encoder.save_abspath, encoder.config_abspath, filename])
def projection(X, k, connectivity, ward=True):
    """
    Take the data and return a matrix that reduces the dimension.

    Returns invP, (P.(X.T)).T and the underlying labels.
    """
    n, p = X.shape
    if ward:
        clustering = FeatureAgglomeration(linkage='ward', n_clusters=k,
                                          connectivity=connectivity)
        labels = clustering.fit(X).labels_
    else:
        from fast_cluster import ReNN, recursive_nn
        _, labels = recursive_nn(connectivity, X, n_clusters=k)

    # This line was commented out in the original even though P and P_inv
    # are used below; it has to run for the function to work.
    P, P_inv = pp_inv(labels)
    X_proj = P.dot(X.T).T
    # should be done through clustering.transform, but there is an issue
    # with the normalization
    # X_proj = clustering.transform(X)
    return P_inv, X_proj, labels
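# `pp_inv` is not shown in this snippet. A plausible minimal implementation
# (an assumption, not the original helper): P averages the features within
# each cluster, and P_inv is its Moore-Penrose pseudo-inverse.
import numpy as np

def pp_inv(labels):
    k, p = labels.max() + 1, labels.shape[0]
    P = np.zeros((k, p))
    P[labels, np.arange(p)] = 1.0       # cluster-assignment matrix
    P /= P.sum(axis=1, keepdims=True)   # row-normalize -> within-cluster mean
    return P, np.linalg.pinv(P)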
# Transpose and scale parameters
featListOriginal[[0, 1, 5, 7, 9, 10], :] = featListOriginal[[0, 1, 5, 7, 9, 10], :] * 0.8
featListOriginal[[2, 3, 4], :] = featListOriginal[[2, 3, 4], :] * 0.8 * 0.2
featListOriginal[6, :] = featListOriginal[6, :] * 0.8 * 0.8
featListOriginal = NP.transpose(featListOriginal)

## STANDARDIZE FEATURES ###################################################
# Don't standardize the centroids
for k in range(2, numFeat):
    featList[k] = (featList[k] - NP.mean(featList[k])) / NP.sqrt(NP.var(featList[k]))

# Transpose the feature list to use in clustering
featList = NP.transpose(featList)

feat_aggl = FeatureAgglomeration(2)
feat_aggl.fit(featList[:, 2:])

## AGGLOMERATIVE CLUSTERING ###############################################
aggl_all = AgglomerativeClustering(2)
X_All = featList[:, 2:]
y2 = aggl_all.fit_predict(X_All)

## PCA ####################################################################
pca_model = PCA(2)
X_PCA = pca_model.fit_transform(X_All)
print(pca_model.explained_variance_ratio_)

## SPLIT INTO numBINS #####################################################
percentiles = NP.floor(NP.linspace(0, 100, numBins + 1))
def FeatureAgglomeration(self, clist, numClusters=2):
    # Note: this method shadows sklearn's FeatureAgglomeration class name in
    # the class namespace; inside the body, the bare name still resolves to
    # the class imported at module level.
    FEATAGGL = FeatureAgglomeration(numClusters)
    FEATAGGL.fit(self.featList[:, clist])
    self.featureTree = FEATAGGL.children_
    self.featureLabels = FEATAGGL.labels_
    self.featureCList = clist
def find_cluster(childrens, start, height, sample_size):
    res = start
    for i in range(height - 1):
        res = find_feature(childrens, res + sample_size)
    cluster = rec_cluster(childrens, childrens[res], sample_size)
    cluster.sort()
    return cluster


def find_feature_cluster(children, feature, height, sample_size):
    return find_cluster(children, find_feature(children, feature),
                        height, sample_size)


BENCH_DATA = genfromtxt('Sequential_Application_SATUNSAT_track_wo_names.csv',
                        delimiter=',')
# BENCH_DATA = BENCH_DATA.transpose()
print(BENCH_DATA.shape)
# print(np.isnan(BENCH_DATA).any())

ward = FeatureAgglomeration(linkage='average')
# print(ward.fit_predict(BENCH_DATA))
ward.fit(BENCH_DATA)
# print(ward.children_)
# print(find_feature_cluster(ward.children_, 0, 2, 300))

plt.title('SAT Feature_Agglomeration')
plot_dendrogram(ward, leaf_font_size=12)
# plt.savefig('SAT_Feature_Agglomeration.png')
plt.show()
out_dir = '/Users/aki.nikolaidis/PyBASC_outputs/Self_Sim_WWS/dim_500_correlation_2_clusters_100_IndBS_1_blockcorrelation'
ismdir = out_dir + '/workflow_output/basc_workflow_runner/basc/individual_stability_matrices/mapflow/'
os.chdir(ismdir)
subdirs_all = [x[1] for x in os.walk(ismdir)]
subdirs = subdirs_all[0]
n_clusters = 2

# roi_mask_nparray = nb.load(roi_mask_file).get_data().astype('float32').astype('bool')
ward = FeatureAgglomeration(n_clusters=n_clusters, affinity='euclidean',
                            linkage='ward')

all_ind_clusterlabels = []
for subdir in subdirs:
    os.chdir(ismdir + subdir)
    temp_ism_file = os.path.join(os.getcwd(),
                                 'individual_stability_matrix.npy')
    temp_ism = np.load(temp_ism_file)
    print("Calculating Hierarchical Clustering")
    ward.fit(temp_ism)
    y_pred = ward.labels_.astype(int)  # apply clustering (np.int is removed
                                       # in recent NumPy)
    all_ind_clusterlabels.append(y_pred)

# for ism in ismlist:
#     temp = np.load(ism)
#     all_ind_clusterlabels.append(temp)

score_similarities = np.zeros((len(all_ind_clusterlabels),
                               len(all_ind_clusterlabels)))
for i in range(len(all_ind_clusterlabels)):
    for j in range(len(all_ind_clusterlabels)):
        pass  # (loop body truncated in the source snippet)
x_columns = X_copy.columns
X_copy = pd.DataFrame(poly.fit_transform(X_copy))
X_copy.columns = poly.get_feature_names(x_columns)

# add the term-document matrix to X
X_copy = pd.concat([X_copy, matrix], axis=1)

# drop any constant columns in X
X_copy = X_copy.loc[:, (X_copy != X_copy.iloc[0]).any()]

# scale the data to take on values between 0 and 1
X = ((X_copy - X_copy.min()) / (X_copy.max() - X_copy.min())).copy()

# build the feature selection model
num = 222
hclust = FeatureAgglomeration(n_clusters=num, linkage="ward",
                              distance_threshold=None)
hclust.fit(X)

# collect the features to keep: the first member of each cluster
clusters = hclust.labels_
keep = []
for i in range(num):
    keep.append(np.where(clusters == i)[0][0])
X = X_copy.iloc[:, keep]

# export the data
X.to_csv("X titanic.csv", index=False)
Y.to_csv("Y titanic.csv", index=False)
def fit(self, data, n_clusters, method):
    data = np.array(data)
    data = preprocessing.MinMaxScaler().fit_transform(data)
    model = FeatureAgglomeration(n_clusters=n_clusters, linkage=method)
    clustering = model.fit(data)
    return clustering
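# Equivalent standalone usage of the fit() above (an illustration on toy
# data; assumes `from sklearn import preprocessing` and numpy as np, as in
# the method body).
data = np.random.RandomState(0).rand(50, 8)
scaled = preprocessing.MinMaxScaler().fit_transform(data)
clustering = FeatureAgglomeration(n_clusters=3, linkage="average").fit(scaled)
print(clustering.labels_)  # cluster id for each of the 8 features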
##################################################################
# Then we use FeatureAgglomeration from scikit-learn. Indeed, the voxels
# are the features of the data matrix.
#
# In addition, we use caching. As a result, the clustering doesn't have
# to be recomputed later.

# Computing the ward for the first time, this is long...
from sklearn.cluster import FeatureAgglomeration
# If you have scikit-learn older than 0.14, you need to import
# WardAgglomeration instead of FeatureAgglomeration
import time
start = time.time()
ward = FeatureAgglomeration(n_clusters=1000, connectivity=connectivity,
                            linkage='ward', memory='nilearn_cache')
ward.fit(fmri_masked)
print("Ward agglomeration 1000 clusters: %.2fs" % (time.time() - start))

# Compute the ward with more clusters, should be faster as we are using
# the caching mechanism
start = time.time()
ward = FeatureAgglomeration(n_clusters=2000, connectivity=connectivity,
                            linkage='ward', memory='nilearn_cache')
ward.fit(fmri_masked)
print("Ward agglomeration 2000 clusters: %.2fs" % (time.time() - start))

##################################################################
# Visualize results
# ------------------
#
# First we display the labels of the clustering in the brain.