def fa_dim_red(x_train_scaled, dataset_name, features_num=2):
    # Sweep the number of clusters and record the reconstruction loss for each.
    z = 0
    losses = []
    for k in range(1, x_train_scaled.shape[1] + 1):
        fa = FeatureAgglomeration(n_clusters=k)
        fa_result = fa.fit_transform(x_train_scaled)
        x_projected_fa = fa.inverse_transform(fa_result)
        loss = ((x_train_scaled - x_projected_fa) ** 2).mean()
        losses.append(loss)

    np_feature_losses_percent = 100 * np.array(losses) / np.sum(losses)
    print('num of clusters < 10% loss')
    for i in range(len(np_feature_losses_percent)):
        z = z + np_feature_losses_percent[i]
        if z > 90:
            print(i + 1)
            break
    print(np_feature_losses_percent)

    plt.bar(list(range(1, len(np_feature_losses_percent) + 1)), np_feature_losses_percent)
    plt.title("FeatureAgglomeration Projection Losses % (" + str(dataset_name) + ")")
    plt.ylabel("Mean Squared Error (% of Total)")
    plt.xlabel("Features")
    plt.savefig(str(dataset_name) + ' fa analysis.png')
    plt.show()

    # Fit the final model with the requested number of clusters.
    fa = FeatureAgglomeration(n_clusters=features_num)
    fa_result = fa.fit_transform(x_train_scaled)
    print(fa_result.shape)
    x_projected_fa = fa.inverse_transform(fa_result)
    print(x_projected_fa.shape)
    print(x_train_scaled.shape)
    loss = ((x_train_scaled - x_projected_fa) ** 2).mean()
    print('loss')
    print(loss)
    return fa_result, x_projected_fa
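# A minimal usage sketch for fa_dim_red above, assuming numpy, matplotlib and
# scikit-learn are imported as in the function body. The wine dataset and the
# "wine" label are hypothetical stand-ins for the caller's own scaled data.
from sklearn.datasets import load_wine
from sklearn.preprocessing import StandardScaler

X_scaled = StandardScaler().fit_transform(load_wine().data)
fa_result, x_projected = fa_dim_red(X_scaled, "wine", features_num=2)
print(fa_result.shape)      # (n_samples, 2) agglomerated features
print(x_projected.shape)    # back-projection to the original feature space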
def cluster(self, cluster_images, cluster_db_path, diffnet_path='model_checkpoints/', save_csv=False):
    compressions = []

    # Finding features
    diffnet = DiffNet(self.db, db_path=self.db_path)
    diffnet.restore(diffnet_path)
    print("Calculating features for", len(cluster_images), "images")
    for img in cluster_images:
        print("Finding features for:", img)
        one_hot = diffnet.feedforward(img, cluster_db_path)
        output = self.sess.run(self.compressed, feed_dict={self.Q: [one_hot], self.keep_prob: 1.})
        compressions.append(output[0])

    # Clustering
    print("Performing clustering...")
    compressions = np.array(compressions)
    fa = FeatureAgglomeration(n_clusters=30)
    X_clusters = fa.fit_transform(compressions)

    print("Collecting data...")
    csv_dict_arr = []
    for i, img in enumerate(cluster_images):
        csv_dict_arr.append({'1.img': img,
                             '2.class': np.argmax(X_clusters[i]),
                             '3.features': compressions[i]})

    # Saving
    if save_csv:
        print("Saving data to csv...")
        keys = load_label_list(csv_dict_arr[0])
        with open('cluster_result.csv', 'w') as output_file:
            dict_writer = csv.DictWriter(output_file, keys, delimiter=';')
            dict_writer.writeheader()
            dict_writer.writerows(csv_dict_arr)

    return csv_dict_arr
class Regressor(BaseEstimator):
    def __init__(self):
        self.clf = Pipeline([
            ("RF", RandomForestRegressor(n_estimators=200, max_depth=15,
                                         n_jobs=N_JOBS))])
        self.scaler = StandardScaler()
        self.agglo = FeatureAgglomeration(n_clusters=500)

    def fit(self, X, y):
        y = y.ravel()
        n_samples, n_lags, n_lats, n_lons = X.shape
        self.scaler.fit(X[:, -1].reshape(n_samples, -1))
        X = X.reshape(n_lags * n_samples, -1)
        connectivity = grid_to_graph(n_lats, n_lons)
        self.agglo.connectivity = connectivity
        X = self.scaler.transform(X)
        X = self.agglo.fit_transform(X)
        X = X.reshape(n_samples, -1)
        self.clf.fit(X, y)

    def predict(self, X):
        n_samples, n_lags, n_lats, n_lons = X.shape
        X = X.reshape(n_lags * n_samples, -1)
        X = self.scaler.transform(X)
        X = self.agglo.transform(X)
        X = X.reshape(n_samples, -1)
        return self.clf.predict(X)
class AgglomerativeFeatureTransformer:
    def __init__(self, n_clusters=2):
        self.model = FeatureAgglomeration(n_clusters=n_clusters)

    def __call__(self, data):
        if data is None or data.shape[0] == 0:
            return None
        return self.model.fit_transform(data)
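# A short usage sketch for AgglomerativeFeatureTransformer; the random input
# below is hypothetical and assumes only numpy alongside the scikit-learn
# import used by the class.
import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 8))
transformer = AgglomerativeFeatureTransformer(n_clusters=3)
X_reduced = transformer(X)
print(X_reduced.shape)  # (100, 3): each column is the mean of one feature cluster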
def do_feature_agglomeration(self, data):
    print("Using feature agglomeration to reduce the matrix' dimensionality...")
    if self.k:
        n = self.k
    else:
        n = 20
    agglo = FeatureAgglomeration(n_clusters=n, affinity="cosine", linkage="complete")
    return agglo.fit_transform(data)
def token_cluster(self, n_clusters=300):
    from scipy import sparse
    from sklearn.cluster import FeatureAgglomeration

    FA = FeatureAgglomeration(n_clusters=n_clusters)
    self.bow_corpus = FA.fit_transform(self.bow_corpus)
    self.bow_corpus = sparse.csr_matrix(self.bow_corpus)
def apply_feature_agglomeration(table, features, label, n_components):
    from sklearn.cluster import FeatureAgglomeration
    from paje import feature_file_processor

    x, y = feature_file_processor.split_features_target(table, features, label)
    fa = FeatureAgglomeration(n_clusters=n_components, linkage='ward')
    pc = fa.fit_transform(x)
    return feature_file_processor.generate_data_frame(pc, table[[label]])
def dim_reduction_FA(data, distance_threshold=0.45):
    """
    Params:
        data: ndarray of shape (n_samples, n_features)
        distance_threshold: optimal threshold value for the similarity measure

    Returns:
        (reducedDimData, nReducedComponents)
    """
    agglo = FeatureAgglomeration(n_clusters=None, distance_threshold=distance_threshold)
    reducedDimData = agglo.fit_transform(data)
    return reducedDimData, agglo.n_clusters_
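# A minimal usage sketch for dim_reduction_FA; the synthetic data is a
# hypothetical placeholder, assuming numpy plus the scikit-learn import above.
import numpy as np

rng = np.random.default_rng(42)
X = rng.normal(size=(200, 25))
X_reduced, n_components = dim_reduction_FA(X, distance_threshold=0.45)
print(X_reduced.shape, n_components)  # second value comes from agglo.n_clusters_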
def getDR(dt_all, n_comp=12):
    # cols
    cols_encode_label = dt_all.filter(regex="Encode_Label").columns.values.tolist()
    cols_cat = dt_all.drop("ID", axis=1).select_dtypes(include=["object"]).columns.tolist()

    # standardize
    dt_all_norm = MinMaxScaler().fit_transform(
        dt_all.drop(["y", "Fold"] + cols_cat + cols_encode_label, axis=1))

    # tSVD
    tsvd = TruncatedSVD(n_components=n_comp, random_state=420)
    tsvd_results = tsvd.fit_transform(dt_all_norm)

    # PCA
    pca = PCA(n_components=n_comp, random_state=420)
    pca_results = pca.fit_transform(dt_all_norm)

    # ICA
    ica = FastICA(n_components=n_comp, max_iter=5000, random_state=420)
    ica_results = ica.fit_transform(dt_all_norm)

    # GRP
    grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
    grp_results = grp.fit_transform(dt_all_norm)

    # SRP
    srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420)
    srp_results = srp.fit_transform(dt_all_norm)

    # NMF
    nmf = NMF(n_components=n_comp, init='nndsvdar', random_state=420)
    nmf_results = nmf.fit_transform(dt_all_norm)

    # FeatureAgglomeration
    fag = FeatureAgglomeration(n_clusters=n_comp, linkage='ward')
    fag_results = fag.fit_transform(dt_all_norm)

    # Append decomposition components to the dataset
    for i in range(1, n_comp + 1):
        dt_all['DR_TSVD_' + str(i)] = tsvd_results[:, i - 1]
        dt_all['DR_PCA_' + str(i)] = pca_results[:, i - 1]
        dt_all['DR_ICA_' + str(i)] = ica_results[:, i - 1]
        dt_all['DR_GRP_' + str(i)] = grp_results[:, i - 1]
        dt_all['DR_SRP_' + str(i)] = srp_results[:, i - 1]
        dt_all['DR_NMF_' + str(i)] = nmf_results[:, i - 1]
        dt_all['DR_FAG_' + str(i)] = fag_results[:, i - 1]

    return dt_all
def feat_agglom(self, n_clusters, standardize=True):
    """Method for running feature agglomeration.

    Args:
        n_clusters (Integer): Number of clusters to return
        standardize (Boolean): Whether to standardize the features before clustering

    Attributes:
        df (Dataframe): Pandas dataframe containing the target and feature variables
        target_var (String): The target variable

    Returns:
        Pandas dataframe containing the feature name and the cluster number
    """
    df = self.df
    cat_features = df.loc[:, df.dtypes == object]
    if not cat_features.empty:
        df = self.prep_cat_vars(df)

    X = df.drop([self.target_var], axis=1)
    X_df = X
    X = X_df.values
    if standardize:
        scaler = StandardScaler()
        X = scaler.fit_transform(X)

    agglo = FeatureAgglomeration(n_clusters=n_clusters)
    clusters = agglo.fit_transform(X)
    cluster_numbers = pd.DataFrame(agglo.labels_)
    feat_labels = pd.DataFrame(X_df.columns)
    var_clust = feat_labels.merge(cluster_numbers, left_index=True, right_index=True)
    var_clust.columns = ['Feature_Label', 'Cluster_Number']
    var_clust.sort_values('Cluster_Number', inplace=True)
    return var_clust
def feat_agglom(self, n_clusters, standardize=True):
    """Function for running feature agglomeration.

    Parameters
    ----------
    n_clusters : int
        Number of clusters to create from the variables
    standardize : bool
        Boolean identifying if features should be standardized before clustering

    Returns
    -------
    var_clust : pandas.DataFrame
        Pandas DataFrame containing the feature name and the cluster number
    """
    df = self.df
    cat_features = df.loc[:, df.dtypes == object]
    if not cat_features.empty:
        df = self.prep_cat_vars(df)

    X = df.drop([self.target_var], axis=1)
    X_df = X
    X = X_df.values
    if standardize:
        scaler = StandardScaler()
        X = scaler.fit_transform(X)

    agglo = FeatureAgglomeration(n_clusters=n_clusters)
    clusters = agglo.fit_transform(X)
    cluster_numbers = pd.DataFrame(agglo.labels_)
    feat_labels = pd.DataFrame(X_df.columns)
    var_clust = feat_labels.merge(
        cluster_numbers, left_index=True, right_index=True
    )
    var_clust.columns = ["features", "Cluster_Number"]
    var_clust.sort_values("Cluster_Number", inplace=True)
    return var_clust
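# The two feat_agglom variants above map each original feature to a cluster id
# via agglo.labels_. A standalone sketch of that same mapping, using a
# hypothetical DataFrame in place of the class's self.df / self.target_var:
import numpy as np
import pandas as pd
from sklearn.cluster import FeatureAgglomeration
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(1)
df = pd.DataFrame(rng.normal(size=(50, 6)), columns=[f"f{i}" for i in range(6)])

agglo = FeatureAgglomeration(n_clusters=3)
agglo.fit(StandardScaler().fit_transform(df.values))
var_clust = pd.DataFrame({"features": df.columns, "Cluster_Number": agglo.labels_})
print(var_clust.sort_values("Cluster_Number"))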
                  1.4],
                 [7.7, 3., 6.1, 2.3],
                 [6.3, 3.4, 5.6, 2.4],
                 [6.4, 3.1, 5.5, 1.8],
                 [6., 3., 4.8, 1.8],
                 [6.9, 3.1, 5.4, 2.1],
                 [6.7, 3.1, 5.6, 2.4],
                 [6.9, 3.1, 5.1, 2.3],
                 [5.8, 2.7, 5.1, 1.9],
                 [6.8, 3.2, 5.9, 2.3],
                 [6.7, 3.3, 5.7, 2.5],
                 [6.7, 3., 5.2, 2.3],
                 [6.3, 2.5, 5., 1.9],
                 [6.5, 3., 5.2, 2.],
                 [6.2, 3.4, 5.4, 2.3],
                 [5.9, 3., 5.1, 1.8]])

print('Original shape: {}\n'.format(data.shape))
print('First 10:\n{}\n'.format(repr(data[:10])))

from sklearn.cluster import FeatureAgglomeration

agg = FeatureAgglomeration(n_clusters=2)
new_data = agg.fit_transform(data)
print('New shape: {}\n'.format(new_data.shape))
print('First 10:\n{}\n'.format(repr(new_data[:10])))

'''
Original shape: (150, 4)

First 10:
array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
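# FeatureAgglomeration pools merged features with np.mean by default, so each of
# the two new columns above is the mean of one group of original columns. A small
# standalone check of that behaviour, re-fitting on scikit-learn's built-in iris
# data (which appears to be the same array as above):
import numpy as np
from sklearn.datasets import load_iris
from sklearn.cluster import FeatureAgglomeration

iris = load_iris().data                       # (150, 4)
agg_check = FeatureAgglomeration(n_clusters=2)
reduced = agg_check.fit_transform(iris)
print(agg_check.labels_)                      # which cluster each original column joined
print(np.allclose(reduced[:, 0], iris[:, agg_check.labels_ == 0].mean(axis=1)))  # expected True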
def elbowHeuristic_FA(data, markX=None, markY=None, annotX=None, annotY=None, figPath=None):
    """
    Given the data on which feature agglomeration needs to be performed, this
    function plots the curve for the elbow heuristic. Pass the annotation
    parameters after manual inspection for the final graph.

    Params:
        data: ndarray of shape (n_samples, n_features) - data on which FA needs to be performed
        markX: float - x-coordinate of the elbow point
        markY: float - y-coordinate of the elbow point
        annotX: float - x-coordinate where annotation text needs to be placed
        annotY: float - y-coordinate where annotation text needs to be placed
        figPath: string - path where the figure needs to be stored. If None, the
            figure is plotted in the console itself.

    Returns:
        None
    """
    distThr = []
    nClusters = []
    for i in np.arange(0, 1, 0.05):
        distThr.append(i)
        agglo = FeatureAgglomeration(n_clusters=None, distance_threshold=i)
        clustering = agglo.fit_transform(data)
        nClusters.append(clustering.shape[1])

    plt.rc('font', size=15)          # controls default text sizes
    plt.rc('axes', titlesize=17)     # fontsize of the axes title
    plt.rc('axes', labelsize=17)     # fontsize of the x and y labels
    plt.rc('xtick', labelsize=15)    # fontsize of the tick labels
    plt.rc('ytick', labelsize=15)    # fontsize of the tick labels
    plt.rc('legend', fontsize=15)    # legend fontsize
    plt.rc('figure', titlesize=17)   # fontsize of the figure title

    plt.figure(figsize=(10, 6))
    plt.plot(distThr, nClusters)
    if markX is not None and markY is not None:
        plt.hlines(markY, 0, markX, linestyles='dashed')
        plt.vlines(markX, 0, markY, linestyles='dashed')
        plt.scatter([markX], [markY], c='r')
        '''
        plt.annotate('Elbow point\n'+str((np.around(markX,3), 7)),
                     xy=(markX, markY+0.2), xytext=(0.65, 15),
                     arrowprops=dict(arrowstyle="->",
                                     connectionstyle="angle3,angleA=0,angleB=-90"));
        '''
        if annotX is not None and annotY is not None:
            if annotY > markY:
                plt.annotate('Elbow point\n' + str((np.around(markX, 3), markY)),
                             xy=(markX, markY + 0.2), xytext=(annotX, annotY),
                             arrowprops=dict(arrowstyle="->",
                                             connectionstyle="angle3,angleA=0,angleB=-90"))
            else:
                plt.annotate('Elbow point\n' + str((np.around(markX, 3), markY)),
                             xy=(markX, markY - 0.2), xytext=(annotX, annotY),
                             arrowprops=dict(arrowstyle="->",
                                             connectionstyle="angle3,angleA=0,angleB=-90"))
    plt.xlim(0, 1)
    plt.ylim(0, 27)
    plt.xlabel("Distance Threshold")
    plt.ylabel("Number of Clusters - Reduced Dimensions")
    if figPath is not None:
        plt.savefig(figPath, bbox_inches='tight', pad_inches=0.05)
    plt.close()
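# A hedged usage sketch for elbowHeuristic_FA: run it once without annotation
# arguments to inspect the curve, then re-run with markX/markY/annotX/annotY
# chosen manually. The data below is a hypothetical placeholder, and numpy,
# matplotlib and FeatureAgglomeration are assumed imported as in the function.
import numpy as np

rng = np.random.default_rng(7)
demoData = rng.normal(size=(100, 20))
elbowHeuristic_FA(demoData)                       # inspect the curve first
# elbowHeuristic_FA(demoData, markX=0.4, markY=7,
#                   annotX=0.6, annotY=15, figPath='elbow_fa.png')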
X_test_pen = pen_scaler.transform(X_test_pen)
sat_scaler.fit(X_test_sat)
X_test_sat = sat_scaler.transform(X_test_sat)

y_train_pen = [float(num) for num in y_train_pen]

sat_params = [2, 10, 18]
pen_params = [2, 8, 14]

from sklearn.cluster import FeatureAgglomeration

for i in range(3):
    sat_fa = FeatureAgglomeration(n_clusters=sat_params[i])
    pen_fa = FeatureAgglomeration(n_clusters=pen_params[i])
    X_train_sat = sat_fa.fit_transform(X_train_sat_og)
    X_train_pen = pen_fa.fit_transform(X_train_pen_og)

    plt.figure(figsize=(12, 6))
    plt.scatter(X_train_sat[:, 0], X_train_sat[:, 1], c=y_train_sat)
    plt.title('Plot of Top Two Features - Feature Agglomeration of ' + str(sat_params[i]) + ' Features (Satellite)')
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.show()

    plt.figure(figsize=(12, 6))
    plt.scatter(X_train_pen[:, 0], X_train_pen[:, 1], c=y_train_pen)
    plt.title('Plot of Top Two Features - Feature Agglomeration of ' + str(pen_params[i]) + ' Features (Digits)')
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.show()
do_tsne = True

if [do_agglomeration, do_pca, do_tsne].count(True) != 1:
    logger.warn('Can do exactly one of FeatureAgglomeration, PCA, t-SNE.')
    quit()
# todo add a test to make sure we do one of these

if False:
    pass
elif do_agglomeration:
    model = FeatureAgglomeration()
    if False:
        pass
    elif do_lda:
        try:
            scatter_points = model.fit_transform(lda_results)
        except AssertionError as assertionError:
            logger.warn('%s : %s', input_file, assertionError)
    elif do_lsa:
        scatter_points = model.fit_transform(lsa_results)
    elif do_nmf:
        scatter_points = model.fit_transform(nmf_results)
elif do_pca:
    model = PCA(n_components=2, random_state=random_state)
    if False:
        pass
    elif do_lda:
        try:
            scatter_points = model.fit_transform(lda_results)
        except AssertionError as assertionError:
grp_train = grp.fit_transform(x_train)
grp_test = grp.transform(x_test)

# SRP
srp = SparseRandomProjection(n_components=remaining_comp, dense_output=True, random_state=420)
srp_train = srp.fit_transform(x_train)
srp_test = srp.transform(x_test)

# NMF
nmf = NMF(n_components=remaining_comp, init='nndsvdar', random_state=420)
nmf_train = nmf.fit_transform(x_train)
nmf_test = nmf.transform(x_test)

# FeatureAgglomeration
fag = FeatureAgglomeration(n_clusters=remaining_comp, linkage='ward')
fag_train = fag.fit_transform(x_train)
fag_test = fag.transform(x_test)

# for i in range(1, remaining_comp + 1):
#     x_train['pca_' + str(i)] = pca_train[:, i - 1]
#     x_test['pca_' + str(i)] = pca_test[:, i - 1]
#
#     x_train['ica_' + str(i)] = ica_train[:, i - 1]
#     x_test['ica_' + str(i)] = ica_test[:, i - 1]
#
#     x_train['tsvd_' + str(i)] = tsvd_train[:, i - 1]
#     x_test['tsvd_' + str(i)] = tsvd_test[:, i - 1]
#
#     x_train['grp_' + str(i)] = grp_train[:, i - 1]
#     x_test['grp_' + str(i)] = grp_test[:, i - 1]
#
data.append(str)
labels.append(arr[0])

vectorizer = TfidfVectorizer(max_df=0.15, min_df=1, stop_words='english',
                             use_idf=True, smooth_idf=True, norm='l2')
X = vectorizer.fit_transform(data)

ftrCluster = FeatureAgglomeration(n_clusters=20, affinity='euclidean',
                                  compute_full_tree='auto', linkage='ward')
fittans = ftrCluster.fit_transform(X.toarray())

np.savetxt('/users/grad/rakib/dr.norbert/dataset/shorttext/biomedical/semisupervised/hacftrtfidf_20',
           fittans, delimiter=' ', fmt='%.10f')

############

from sklearn.cluster import KMeans
import numpy as np
import sys
import math
from sklearn import svm
import collections
ica1 = FastICA(n_components=20)
data1_X_ica = ica1.fit_transform(data1_X_train)
data1_X_ica_test = ica1.transform(data1_X_test)
ica2 = FastICA(n_components=90)
data2_X_ica = ica2.fit_transform(data2_X_train)
data2_X_ica_test = ica2.transform(data2_X_test)

grp1 = GaussianRandomProjection(n_components=20)
data1_X_grp = grp1.fit_transform(data1_X_train)
data1_X_grp_test = grp1.transform(data1_X_test)
grp2 = GaussianRandomProjection(n_components=90)
data2_X_grp = grp2.fit_transform(data2_X_train)
data2_X_grp_test = grp2.transform(data2_X_test)

fa1 = FeatureAgglomeration(n_clusters=20)
data1_X_fa = fa1.fit_transform(data1_X_train)
data1_X_fa_test = fa1.transform(data1_X_test)
fa2 = FeatureAgglomeration(n_clusters=90)
data2_X_fa = fa2.fit_transform(data2_X_train)
data2_X_fa_test = fa2.transform(data2_X_test)

'''
clustering
'''
clusters = np.logspace(0.5, 2, num=10, endpoint=True, base=10.0, dtype=None)
for i in range(0, len(clusters)):
    clusters[i] = int(clusters[i])
print(clusters)
# dfs = df[(df[:, 0] == 'A') | (df[:, 0] == 'B') | (df[:, 0] == 'C') | (df[:, 0] == 'D') |
#          (df[:, 0] == 'E') | (df[:, 0] == 'F') | (df[:, 0] == 'G') | (df[:, 0] == 'H') |
#          (df[:, 0] == 'I') | (df[:, 0] == 'J') | (df[:, 0] == 'K') | (df[:, 0] == 'L') |
#          (df[:, 0] == 'M') | (df[:, 0] == 'N') | (df[:, 0] == 'O')]

data = df[:, 1:]
data = scale(data)
labels = df[:, 0]

n_digits = len(np.unique(labels))
print(data.shape)
print(data.std())

std_transf = np.zeros(15)
x_axis = range(2, 17)
for i in x_axis:
    transformer = FeatureAgglomeration(n_clusters=i)
    rp = transformer.fit_transform(data)
    print(rp.shape)
    std_transf[i - 2] = rp.std()

plt.plot(x_axis, std_transf)
plt.ylabel('STD of data')
plt.xlabel('n_components')
plt.title("STD Vs Dimensions For Feature Agglomeration")
plt.show()

# ###############################################################################
# # Visualize the results on PCA-reduced data
#
# reduced_data = PCA(n_components=2).fit_transform(data)
# kmeans = KMeans(init='k-means++', n_clusters=n_digits, n_init=10)
# kmeans.fit(reduced_data)
#
# SRP
srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420)
srp_results_train = srp.fit_transform(train)
srp_results_test = srp.transform(test)

# NMF
nmf = NMF(n_components=n_comp, init='nndsvdar', random_state=420)
nmf_results_train = nmf.fit_transform(train)
nmf_results_test = nmf.transform(test)

# FeatureAgglomeration
fag = FeatureAgglomeration(n_clusters=n_comp, linkage='ward')
fag_results_train = fag.fit_transform(train)
fag_results_test = fag.transform(test)

# ### Filtering the most significant components and inserting in a Dataframe ###

# In[ ]:

dim_reds = list()

train_pca = pd.DataFrame()
test_pca = pd.DataFrame()
train_ica = pd.DataFrame()
test_ica = pd.DataFrame()
train_tsvd = pd.DataFrame()
test_tsvd = pd.DataFrame()
for component in range(1, len(X_train[0]) + 1):
    grp = GaussianRandomProjection(n_components=component, random_state=1)
    X_train_reduced = grp.fit_transform(X_train)
    X_test_reduced = grp.transform(X_test)
    knn = KNeighborsClassifier(n_neighbors=3)
    knn.fit(X_train_reduced, y_train)
    train_scores.append(knn.score(X_train_reduced, y_train))
    test_scores.append(knn.score(X_test_reduced, y_test))

if dataset_name == 'spam':
    drawMultiple([train_scores, test_scores],
                 'KNN Accuracy over Randomized Projected Components (Spam dataset)',
                 'Number of Projected Components', 'Accuracy',
                 ['projected train score', 'projected test score'],
                 range(1, len(X_train[0]) + 1))
elif dataset_name == 'letter':
    drawMultiple([train_scores, test_scores],
                 'KNN Accuracy over Randomized Projected Components (Letter Recognition dataset)',
                 'Number of Projected Components', 'Accuracy',
                 ['projected train score', 'projected test score'],
                 range(1, len(X_train[0]) + 1))

# FA
train_scores = []
test_scores = []
for component in range(1, len(X_train[0]) + 1):
    fa = FeatureAgglomeration(n_clusters=component)
    X_train_reduced = fa.fit_transform(X_train)
    X_test_reduced = fa.transform(X_test)
    knn = KNeighborsClassifier(n_neighbors=3)
    knn.fit(X_train_reduced, y_train)
    train_scores.append(knn.score(X_train_reduced, y_train))
    test_scores.append(knn.score(X_test_reduced, y_test))

if dataset_name == 'spam':
    drawMultiple([train_scores, test_scores],
                 'KNN Accuracy over Feature Agglomeration Components (Spam dataset)',
                 'Number of Agglomerated Components', 'Accuracy',
                 ['Agglomerated train score', 'Agglomerated test score'],
                 range(1, len(X_train[0]) + 1))
elif dataset_name == 'letter':
    drawMultiple([train_scores, test_scores],
                 'KNN Accuracy over Feature Agglomeration Components (Letter Recognition dataset)',
                 'Number of Agglomerated Components', 'Accuracy',
                 ['Agglomerated train score', 'Agglomerated test score'],
                 range(1, len(X_train[0]) + 1))
from sklearn.cross_validation import KFold
from sklearn import metrics

###############################################################################
# Data IO and generation

# import some data to play with
# file = "/home/kbhalla/Desktop/Data/day_samp_new.npy"
file = "/home/rmendoza/Documents/Data/day_samp_new_0604.npy"
with open(file, "r") as file_in:
    matrix = smio.load_sparse_csr(file_in)

X = matrix[:, :-1]
FA = FeatureAgglomeration(n_clusters=250)
print(np.shape(X))
y = matrix[:, -1]
X = FA.fit_transform(X, y)

n_samples, n_features = X.shape
k = int(0.8 * n_samples)
# random_state = np.random.RandomState(0)
# X = np.c_[X, random_state.randn(n_samples, 2*n_features)]
X_test, y_test = X[k:, :], y[k:]
X, y = X[:k, :], y[:k]
sm = SMOTE(ratio=0.95)
X, y = sm.fit_sample(X, y)
print(np.shape(X))
start = time.time()

###############################################################################
lecs = data.iloc[:, :-6]

from sklearn.cluster import FeatureAgglomeration

agglo = FeatureAgglomeration(n_clusters=15)

# In[ ]:

agglo.fit(lecs)

# In[ ]:

lecs_reduced = agglo.fit_transform(lecs)

# In[ ]:

lecs_reduced.shape

# In[ ]:

from sklearn.decomposition import PCA, KernelPCA

# In[ ]:
plt.ylabel('Y')
plt.title('AgglomerativeClustering', fontdict=dict(size=20, color='r'))

model4 = Birch(threshold=0.5, branching_factor=50, n_clusters=4)
model4.fit(x)
print('\nBirch:')
print(model4.subcluster_centers_.shape)
ypred4 = model4.predict(x)
plt.figure(figsize=(12, 8))
plt.scatter(x[:, 0], x[:, 1], c=ypred4, cmap='Spectral')
plt.xlabel('X')
plt.ylabel('Y')
plt.title('Hierarchical_Birch', fontdict=dict(size=20, color='r'))

# dimensionality reduction
model5 = FeatureAgglomeration(n_clusters=2, affinity='euclidean', linkage='complete')
x_new = model5.fit_transform(x1)
model6 = KMeans(n_clusters=4, max_iter=300, tol=0.0001, verbose=0, random_state=1, n_jobs=4)
ypred6 = model6.fit_predict(x_new)
plt.figure(figsize=(12, 8))
plt.scatter(x_new[:, 0], x_new[:, 1], c=ypred6, cmap='coolwarm')
plt.xlabel('X')
plt.ylabel('Y')
plt.title('FeatureAgglomeration_KMeans', fontdict=dict(size=20, color='r'))

model7 = DBSCAN(eps=0.8, min_samples=5, metric='euclidean', leaf_size=30, n_jobs=4)
ypred7 = model7.fit_predict(x)
plt.figure(figsize=(12, 8))
plt.scatter(x[:, 0], x[:, 1], c=ypred7, cmap='seismic')
plt.xlabel('X')
plt.ylabel('Y')
plt.title('DBSCAN', fontdict=dict(size=20, color='r'))
def ft_red_select(x, y, choice, no_normalize, dis_kept_features, num_features=30):
    """
    :param x: feature matrix on which we wish to do dimensionality reduction
    :param y: label array (used by the 'rf' option)
    :param choice: which feature selection/reduction method to apply
    :param no_normalize: if True, skip normalizing the data first
    :param dis_kept_features: if True, display the names of the kept features
    :param num_features: number of features/components to keep
    :return: the new reduced 'x' and 'y' components to be later written to a new file
    """
    # Normalize the data
    if not no_normalize:
        x = normalize(x)

    # Given the argument choice of feature selection/reduction, create the relevant object,
    # fit the 'x' data to it, and reduce/transform it to a lower dimensionality
    new_x = []
    print("Original 'x' shape:", np.shape(x))
    if choice == "pca":
        pca = PCA(n_components=num_features)
        new_x = pca.fit_transform(x)
        print("Explained variance = " + str(round(sum(pca.explained_variance_) * 100, 2)) + "%")
    elif choice == "grp":
        grp = GaussianRandomProjection(n_components=num_features)
        new_x = grp.fit_transform(x)
    elif choice == "agglom":
        agg = FeatureAgglomeration(n_clusters=num_features)
        new_x = agg.fit_transform(x)
    elif choice == "thresh":
        # Below threshold gives ~26 components upon application
        vt = VarianceThreshold(threshold=0.00015)
        new_x = vt.fit_transform(x)
        print("Explained variance = " + str(round(sum(vt.variances_) * 100, 2)) + "%")
        kept_features = list(vt.get_support(indices=True))
        if dis_kept_features:
            print("Kept features: ")
            for i in kept_features:
                print(col_names[i])
    elif choice == "rf":
        y_labels = [1 if s == "D" else 0 for s in y[:, 1]]
        clf = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=-1)
        print("Fitting RF model....")
        clf.fit(x, y_labels)
        sfm = SelectFromModel(clf, threshold=-np.inf, max_features=num_features)
        print("Selecting best features from model...")
        sfm.fit(x, y_labels)
        kept_features = list(sfm.get_support(indices=True))
        if dis_kept_features:
            print("Kept features: ")
            for i in kept_features:
                print(col_names[i])
        new_x = x[:, kept_features]

    print("Reduced 'x' shape:", np.shape(new_x))
    return new_x, y
error_rate_test_1 = np.zeros(np.shape(data1_X_train)[1])

DT1 = tree.DecisionTreeClassifier(criterion='gini', min_samples_leaf=5, max_depth=None)
# Note: these "error_rate" variables actually hold the fraction of correct
# predictions (accuracy), since they count matching labels.
error_rate_train_DT_1 = sum(
    DT1.fit(data1_X_train, data1_y_train).predict(data1_X_train) == data1_y_train) * 1.0 / data1_y_train.shape[0]
print("error_rate_train_DT_1", error_rate_train_DT_1)
error_rate_test_DT_1 = sum(
    DT1.fit(data1_X_train, data1_y_train).predict(data1_X_test) == data1_y_test) * 1.0 / data1_y_test.shape[0]
print("error_rate_test_DT_1", error_rate_test_DT_1)

for i in range(0, np.shape(data1_X_train)[1]):
    print(i)
    start_time = time.time()
    fa.set_params(n_clusters=i + 1)
    data1_X_train_fa = fa.fit_transform(data1_X_train)
    data1_X_test_fa = fa.transform(data1_X_test)
    error_rate_train_1[i] = sum(
        DT1.fit(data1_X_train_fa, data1_y_train).predict(data1_X_train_fa) == data1_y_train) * 1.0 / \
        data1_y_train.shape[0]
    print("error_rate_train_1[%f]" % i, error_rate_train_1[i])
    error_rate_test_1[i] = sum(
        DT1.fit(data1_X_train_fa, data1_y_train).predict(data1_X_test_fa) == data1_y_test) * 1.0 / \
        data1_y_test.shape[0]
    print("error_rate_test_1[%f]" % i, error_rate_test_1[i])
    print("time consumed:", time.time() - start_time)

file_2.write("FA_error_rate_train_1")
for i in range(0, len(error_rate_train_1)):
    file_2.write(";")
def FA_reduced(X_train):
    fa = FeatureAgglomeration(n_clusters=10)
    X_train_reduced = fa.fit_transform(X_train)
    return X_train_reduced
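# A brief usage sketch for FA_reduced; the digits data is a hypothetical example
# input, assuming the FeatureAgglomeration import used by the function above.
from sklearn.datasets import load_digits

X_digits = load_digits().data           # (1797, 64)
X_small = FA_reduced(X_digits)
print(X_small.shape)                    # (1797, 10) after agglomerating to 10 clusters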
def train_and_test(alpha, predictors, predictor_params, x_filename, y_filename, n_users, percTest,
                   featureset_to_use, diff_weighting, phi, force_balanced_classes, do_scaling,
                   optimise_predictors, report, conf_report=None):
    # all_X = numpy.loadtxt(x_filename, delimiter=",")
    all_X = numpy.load(x_filename + ".npy")
    all_y = numpy.loadtxt(y_filename, delimiter=",")
    print("loaded X and y files", x_filename, y_filename)

    if numpy.isnan(all_X).any():
        print("nan in", x_filename)
        exit()
    if numpy.isnan(all_y).any():
        print("nan in", y_filename)
        exit()

    # print("selecting balanced subsample")
    print("t t split")
    X_train, X_test, y_train, y_test = train_test_split(all_X, all_y, test_size=percTest, random_state=666)

    # feature extraction
    # test = SelectKBest(score_func=chi2, k=100)
    # kb = test.fit(X_train, y_train)
    # # summarize scores
    # numpy.set_printoptions(precision=3)
    # print(kb.scores_)
    # features = kb.transform(X_train)
    # mask = kb.get_support()
    # # summarize selected features
    # print(features.shape)
    # X_train = X_train[:, mask]
    # X_test = X_test[:, mask]

    scaler = StandardScaler()
    rdim = FeatureAgglomeration(n_clusters=100)
    if do_scaling:
        # input(X_train.shape)
        X_train = rdim.fit_transform(X_train)
        X_test = rdim.transform(X_test)
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        with open('../../../isaac_data_files/qutor_scaler.pkl', 'wb') as output:
            pickle.dump(scaler, output, pickle.HIGHEST_PROTOCOL)
        with open('../../../isaac_data_files/qutor_rdim.pkl', 'wb') as output:
            pickle.dump(rdim, output, pickle.HIGHEST_PROTOCOL)

    # print("feature reduction...")
    # pc = PCA(n_components=100)
    # X_train = pc.fit_transform(X_train)
    # X_test = pc.transform(X_test)

    classes = numpy.unique(y_train)
    sample_weights = None
    if force_balanced_classes:
        X_train, y_train = balanced_subsample(X_train, y_train, 1.0)  # 0.118

    print("X_train shape:", X_train.shape)
    print("X_test shape:", X_test.shape)

    print("tuning classifier ...")
    for ix, p in enumerate(predictors):
        print(type(p))
        print(p.get_params().keys())
        if optimise_predictors == True and len(predictor_params[ix]) > 1:
            pbest = run_random_search(p, X_train, y_train, predictor_params[ix])
        else:
            pbest = p.fit(X_train, y_train)
        predictors[ix] = pbest

    print("pickling classifier ...")
    for ix, p in enumerate(predictors):
        p_name = predictor_params[ix]['name']
        with open('../../../isaac_data_files/p_{}_{}_{}.pkl'.format(p_name, alpha, phi), 'wb') as output:
            pickle.dump(p, output, pickle.HIGHEST_PROTOCOL)
    print("done!")

    # report.write("* ** *** |\| \` | | |) /; `|` / |_| *** ** *\n")
    # report.write("* ** *** | | /_ |^| |) || | \ | | *** ** *\n")
    # report.write("RUNS,P,FB,WGT,ALPHA,PHI,SCL,0p,0r,0F,0supp,1p,1r,1F,1supp,avg_p,avg_r,avg_F,#samples\n")
    for ix, p in enumerate(predictors):
        report.write(",".join(map(str, (all_X.shape[0],
                                        str(p).replace(",", ";").replace("\n", ""),
                                        force_balanced_classes, diff_weighting, alpha, phi, do_scaling))))

        y_pred_tr = p.predict(X_train)
        y_pred = p.predict(X_test)

        # for x, y, yp in zip(X_train, y_test, y_pred):
        if conf_report:
            conf_report.write(str(p).replace(",", ";").replace("\n", "") + "\n")
            conf_report.write(str(alpha) + "," + str(phi) + "\n")
            conf_report.write(str(confusion_matrix(y_test, y_pred)) + "\n")
            conf_report.write("\n")

        # p = precision_score(y_test, y_pred, average=None, labels=classes)
        # r = recall_score(y_test, y_pred, average=None, labels=classes)
        # F = f1_score(y_test, y_pred, average=None, labels=classes)
        p, r, F, s = precision_recall_fscore_support(y_test, y_pred, labels=classes, average=None,
                                                     warn_for=('precision', 'recall', 'f-score'))
        avp, avr, avF, _ = precision_recall_fscore_support(y_test, y_pred, labels=classes, average='weighted',
                                                           warn_for=('precision', 'recall', 'f-score'))
        for ix, c in enumerate(classes):
            report.write(",{},{},{},{},{},".format(c, p[ix], r[ix], F[ix], s[ix]))
        report.write("{},{},{},{}\n".format(avp, avr, avF, numpy.sum(s)))

        # report.write(classification_report(y_test, y_pred) + "\n")
        # report.write("------END OF CLASSIFIER------\n")
        report.flush()
    return X_train, X_test, y_pred_tr, y_pred, y_test, scaler
def validate_spectral_clusters(clusterCenters, labels, originalData, nEigenVectors,
                               partitions=4, dimRedMethod=None, trials=100):
    """
    Computes the cluster centroids from the given dataset and clustering labels.

    Params:
        clusterCenters: ndarray of shape (n_clusters, n_features)
            - cluster centers as assigned by the algorithm which needs to be validated
        labels: ndarray of shape (n_samples,)
            - labels assigned by the clustering algorithm to each household
        originalData: ndarray (shape determined by the problem)
            - original data which was used for pre-processing followed by dimensionality
              reduction before being passed on for final clustering
            - this will be passed straight to the methods:
                - preProcessing_clustering:: to get an ndarray of shape (n_samples, n_features)
                - shuffle_partition:: to get a list of arrays similar to originalData
        nEigenVectors: int - number of eigenvectors used during spectral clustering
        partitions: int (>1) - number of partitions (of the original data) to be studied
        dimRedMethod: 'FA' or 'PCA' or None
            - dimensionality reduction method which was used post pre-processing
        trials: int (>=1) - number of times partitioning is done before averaging out the results

    Returns:
        totalCases: int - total number of cases for which match/mis-match is calculated
        nMatchAvg: float - average number of matches across trials
        nMisMatchAvg: float - average number of mis-matches across trials
        percentMatch: float - match % obtained across trials
        percentMisMatch: float - mis-match % obtained across trials
        sampleMisMatchFreq: ndarray of shape (n_samples,)
            - average number of mis-matches obtained for each sample after all trials
            - Note: in each trial, the number of times a match/mis-match is calculated for a
              particular sample equals the number of partitions studied during the validation
    """
    nClusters = clusterCenters.shape[0]
    nComponents = clusterCenters.shape[1]
    sampleMisMatchFreq = np.zeros((len(labels)))
    nMatchAvg = 0
    nMisMatchAvg = 0
    for trial in range(trials):
        # Shuffle and partition the data
        partitionedData = shuffle_partition(originalData, partitions)
        nMatch = 0
        nMisMatch = 0
        for i in range(len(partitionedData)):
            # Perform pre-processing routines on each partition.
            # This variable's shape must be (n_samples, n_features).
            processedData = preProcessing_clustering(partitionedData[i])

            # Perform dimensionality reduction on the pre-processed partition
            if dimRedMethod is None:
                processedData_reduced = processedData
            elif dimRedMethod == 'PCA':
                if nComponents is None:
                    raise ValueError("nComponents cannot be None when dimRedMethod is not None.")
                pca = PCA(n_components=nComponents)
                processedData_reduced = pca.fit_transform(processedData)
            elif dimRedMethod == 'FA':
                if nComponents is None:
                    raise ValueError("nComponents cannot be None when dimRedMethod is not None.")
                agglo = FeatureAgglomeration(n_clusters=nComponents)
                processedData_reduced = agglo.fit_transform(processedData)
            else:
                raise ValueError("dimRedMethod should either be 'PCA' or 'FA' or None - found something else.")

            # Perform spectral clustering's internal processing steps before the
            # internal implementation of the K-Means algorithm
            _, U, _ = spectralClustering_KM_KNN_Euc(processedData_reduced, nClusters,
                                                    nEigenVectors=nEigenVectors)

            # Check which cluster is nearest to the newly obtained vector representations
            # of the same sample. Note: corresponding to each sample, each partition
            # specifies a new representation of the sample. In other words, the original
            # sample is divided into n (n = number of partitions) representations of itself.
            for sample in range(len(processedData_reduced)):
                temp = np.argmin(np.linalg.norm(clusterCenters - U[sample].reshape(1, -1), axis=1))
                if temp == labels[sample]:
                    nMatch += 1
                else:
                    nMisMatch += 1
                    sampleMisMatchFreq[sample] += 1

        nMatchAvg = ((nMatchAvg * trial) + nMatch) / (trial + 1)
        nMisMatchAvg = ((nMisMatchAvg * trial) + nMisMatch) / (trial + 1)

    totalCases = int(np.round(nMatchAvg + nMisMatchAvg))
    percentMatch = (nMatchAvg * 100) / totalCases
    percentMisMatch = (nMisMatchAvg * 100) / totalCases

    return totalCases, nMatchAvg, nMisMatchAvg, percentMatch, percentMisMatch, sampleMisMatchFreq