def IrisMatchingRed(train_features, train_classes, test_features,
                    test_classes, n):
    """Reduce the features to ``n`` dimensions and score 1-NN matching.

    The reducer depends on ``n``: below 108 a ``LinearDiscriminantAnalysis``
    fitted on the labelled training set is used; from 108 up to (but not
    including) 323 a ``LocallyLinearEmbedding`` with ``n + 1`` neighbours is
    used; for any other ``n`` the features are used unreduced (as copies).

    :param train_features: training feature matrix
    :param train_classes: labels of the training rows
    :param test_features: test feature matrix
    :param test_classes: labels of the test rows
    :param n: target dimensionality (also selects the reducer)
    :return: ``(l1crr, l2crr, coscrr)`` - the fraction of test rows whose
        nearest training neighbour has the right label under the L1, L2
        and cosine metrics respectively
    """
    reduced_train = train_features.copy()
    reduced_test = test_features.copy()
    n_test = float(len(test_classes))

    # Pick and fit the reducer; ``None`` means "leave the features alone".
    reducer = None
    if n < 108:
        reducer = LinearDiscriminantAnalysis(n_components=n)
        reducer.fit(train_features, train_classes)
    elif n < 323:
        reducer = LocallyLinearEmbedding(n_neighbors=n + 1, n_components=n)
        reducer.fit(train_features)

    if reducer is not None:
        reduced_train = reducer.transform(train_features)
        reduced_test = reducer.transform(test_features)

    # One nearest-neighbour classifier per distance metric, same protocol.
    rates = []
    for metric in ('l1', 'l2', 'cosine'):
        knn = KNeighborsClassifier(n_neighbors=1, metric=metric)
        knn.fit(reduced_train, train_classes)
        predicted = knn.predict(reduced_test)
        rates.append(float(np.sum(predicted == test_classes)) / n_test)

    l1crr, l2crr, coscrr = rates
    return l1crr, l2crr, coscrr
def IrisMatchingBootstrap(train_features, train_classes, test_features,
                          test_classes, times, thresholds):
    """Bootstrap the matcher to estimate ROC data and a CRR interval.

    The features are first embedded with a ``LocallyLinearEmbedding``
    (201 neighbours, 200 components) fitted on the training set. Then, for
    each of ``times`` rounds, a test sample is drawn with
    ``selectTestSample``, matched with ``IrisMatching`` and turned into
    false-match / false-non-match rates with ``calcROC``.

    :param train_features: training feature matrix
    :param train_classes: labels of the training rows
    :param test_features: test feature matrix
    :param test_classes: labels of the test rows
    :param times: number of bootstrap rounds
    :param thresholds: thresholds forwarded to ``calcROC``
    :return: ``(total_fmrs, total_fnmrs, crr_mean, crr_u, crr_l)`` where the
        first two are arrays with one row per round and the last three are
        the mean CRR and its upper / lower bound (mean +/- 1.96 standard
        deviations, kept inside ``[0, 1]``)
    """
    lle = LocallyLinearEmbedding(n_neighbors=201, n_components=200)
    lle.fit(train_features)
    train_redfeatures = lle.transform(train_features)
    test_redfeatures = lle.transform(test_features)

    total_fmrs = []
    total_fnmrs = []
    total_crr = np.zeros(times)
    for t in range(times):
        tests_features, tests_classes = selectTestSample(
            test_redfeatures, test_classes)
        crr, distm, distn = IrisMatching(train_redfeatures, train_classes,
                                         tests_features, tests_classes, 3)
        fmrs, fnmrs = calcROC(distm, distn, thresholds)
        total_fmrs.append(fmrs)
        total_fnmrs.append(fnmrs)
        total_crr[t] = crr

    total_fmrs = np.array(total_fmrs)
    total_fnmrs = np.array(total_fnmrs)

    crr_mean = np.mean(total_crr)
    crr_std = np.std(total_crr)
    # A recognition rate cannot leave [0, 1]; the upper bound was already
    # clamped, the lower bound is now clamped the same way.
    crr_u = min(crr_mean + crr_std * 1.96, 1)
    crr_l = max(crr_mean - crr_std * 1.96, 0)
    return total_fmrs, total_fnmrs, crr_mean, crr_u, crr_l
class LLEClassifier(BaseEstimator):
    """Classifier that clusters an LLE embedding and maps clusters to labels.

    ``fit`` learns a ``LocallyLinearEmbedding`` of the training data, runs
    ``KMeans`` on the embedded points and then pairs every cluster with a
    target class by solving an assignment problem (``Munkres``) on the
    confusion matrix between true labels and cluster ids. ``predict``
    embeds new points, assigns them to the nearest centroid and translates
    the cluster id through that pairing.

    All constructor arguments are stored unchanged on the instance; apart
    from ``n_clusters`` (used by ``KMeans``) they are forwarded to
    ``LocallyLinearEmbedding``. ``random_state`` is used by both models.
    """

    def __init__(self,
                 n_neighbors=5,
                 n_components=2,
                 n_clusters=2,
                 reg=0.001,
                 method='standard',
                 eigen_solver='auto',
                 random_state=3319):
        self.n_neighbors = n_neighbors
        self.n_components = n_components
        self.n_clusters = n_clusters
        self.reg = reg
        self.method = method
        self.eigen_solver = eigen_solver
        self.random_state = random_state

    def fit(self, X, y):
        """Fit the embedding, the centroids and the cluster-to-label map.

        Sets ``self.model`` (fitted embedding), ``self.centroids`` (fitted
        ``KMeans``) and ``self.mapping`` (``{cluster id: target}``).

        :param X: training samples
        :param y: training labels
        :return: ``self``, so calls can be chained
        """
        # Build the manifold on the training data.
        self.model = LocallyLinearEmbedding(
            method=self.method,
            n_neighbors=self.n_neighbors,
            n_components=self.n_components,
            reg=self.reg,
            eigen_solver=self.eigen_solver,
            random_state=self.random_state).fit(X, y)

        # Project the training set once and reuse it for both the
        # clustering and the label assignment below.
        embedded = self.model.transform(X)
        self.centroids = KMeans(n_clusters=self.n_clusters,
                                random_state=self.random_state).fit(embedded)
        labels = self.centroids.predict(embedded)

        # Pair each cluster with a target via the assignment problem on the
        # confusion matrix; ``compute`` yields (target, cluster) pairs.
        confusion_m = confusion_matrix(y, labels)
        cost_m = make_cost_matrix(confusion_m)
        target_cluster = Munkres().compute(cost_m)

        # Stored inverted so that ``predict`` can look up by cluster id.
        self.mapping = {cluster: target for target, cluster in target_cluster}
        return self

    def predict(self, X_test):
        """Return the predicted label of every row of ``X_test`` as a list.

        Cluster ids without an entry in ``self.mapping`` yield ``None``.
        """
        # Embed the test points, then snap each to its closest centroid.
        X_trans = self.model.transform(X_test)
        labels = self.centroids.predict(X_trans)
        return list(map(self.mapping.get, labels))
def classify_concat_lle_data(self, vis_data, sem_data, labels):
    """Cross-validate a linear SVM on LLE-embedded visual+semantic features.

    For every stratified fold the visual and semantic rows are
    L2-normalised; the semantic test rows are additionally passed through
    ``SemanticDegradation.kill_semantic_attributes`` (with
    ``self.degradation_rate``) and re-normalised. Visual and semantic parts
    are concatenated, embedded with an LLE fitted on the training part, and
    classified by a ``StandardScaler`` + linear ``SVC`` pipeline.

    :param vis_data: visual feature matrix
    :param sem_data: semantic feature matrix; its column count is the
        embedding dimensionality
    :param labels: 2-D label array; column 0 is used as the target
    :return: list with the balanced accuracy of each fold
    """
    def _unit_rows(block):
        return normalize(block, norm='l2', axis=1, copy=True)

    embedder = LocallyLinearEmbedding(n_components=sem_data.shape[1],
                                      n_neighbors=20)
    splitter = StratifiedKFold(n_splits=self.n_folds, random_state=None,
                               shuffle=True)

    scores = []
    folds = splitter.split(vis_data, labels)
    for fold, (train_index, test_index) in enumerate(folds):
        logging.info('Running LLE classification for fold %d' % fold)

        train_vis = _unit_rows(vis_data[train_index])
        test_vis = _unit_rows(vis_data[test_index])
        train_sem = _unit_rows(sem_data[train_index])

        # Test semantics are degraded after normalisation, then
        # normalised again.
        test_sem = SemanticDegradation.kill_semantic_attributes(
            _unit_rows(sem_data[test_index]), self.degradation_rate)
        test_sem = _unit_rows(test_sem)

        train_data = np.hstack((train_vis, train_sem))
        test_data = np.hstack((test_vis, test_sem))
        train_labels = labels[train_index][:, 0]
        test_labels = labels[test_index][:, 0]

        pipeline = make_pipeline(StandardScaler(),
                                 SVC(gamma='auto', C=1.0, kernel='linear'))
        embedder.fit(train_data)
        pipeline.fit(embedder.transform(train_data), train_labels)
        predicted = pipeline.predict(embedder.transform(test_data))

        scores.append(balanced_accuracy_score(test_labels, predicted))
    return scores
def data_transform(train, test):
    """Embed all but the last column with LLE and re-attach the last column.

    A ``LocallyLinearEmbedding`` (80 components, 60 neighbours) is fitted on
    the training rows; the test rows are projected with the fitted model.

    :param train: 2-D array whose last column is kept as-is
    :param test: 2-D array with the same column layout as ``train``
    :return: ``(train_cat, test_cat)`` - embedded features followed by the
        original last column
    """
    embedder = LocallyLinearEmbedding(n_components=80, n_neighbors=60)
    train_embedded = embedder.fit_transform(train[:, :-1])
    test_embedded = embedder.transform(test[:, :-1])

    # ``[:, -1:]`` keeps the last column two-dimensional for the join.
    train_cat = np.concatenate((train_embedded, train[:, -1:]), axis=1)
    test_cat = np.concatenate((test_embedded, test[:, -1:]), axis=1)
    return train_cat, test_cat
def embed_lle(train, test, nn=10, method='standard'):
    """Embed train and test jointly into 2-D with LLE, scaled to [0, 1].

    The embedding is fitted on the concatenation of both sets, projected
    with ``transform`` and min-max scaled over all rows together.

    :param train: training samples
    :param test: test samples
    :param nn: number of neighbours for the embedding
    :param method: LLE variant forwarded to ``LocallyLinearEmbedding``
    :return: ``(train_2d, test_2d)`` split back at ``train.shape[0]``
    """
    stacked = np.concatenate((train, test))
    from sklearn.manifold import LocallyLinearEmbedding

    embedder = LocallyLinearEmbedding(n_neighbors=nn, n_components=2,
                                      method=method)
    # fit followed by transform is kept on purpose: it is not the same
    # as ``fit_transform``, which returns the stored embedding instead.
    embedder.fit(stacked)
    scaled = MinMaxScaler().fit_transform(embedder.transform(stacked))

    n_train = train.shape[0]
    return scaled[:n_train], scaled[n_train:]
def evaluate_fold(model, X_te, Y_te, X_tr, Y_tr):
    """Fit ``model`` on an LLE-reduced, standardised fold and score it.

    The training features are reduced to a tenth of their column count
    with a modified LLE (``n_components + 1`` neighbours), standardised,
    and used to fit ``model``. The test features go through the same
    fitted embedding and scaler before prediction.

    :param model: estimator exposing ``fit`` and ``predict``
    :param X_te: test features
    :param Y_te: test targets
    :param X_tr: training features
    :param Y_tr: training targets
    :return: whatever ``compute_metrics(Y_pred, Y_te)`` returns
    """
    from sklearn.preprocessing import StandardScaler
    from sklearn.decomposition import PCA  # unused here; kept from the original
    from sklearn.manifold import LocallyLinearEmbedding

    target_dim = int(X_tr.shape[1] / 10)
    embedder = LocallyLinearEmbedding(n_components=target_dim,
                                      n_neighbors=(target_dim + 1),
                                      method='modified')
    embedder.fit(X_tr)

    train_embedded = embedder.transform(X_tr)
    scaler = StandardScaler().fit(train_embedded)
    model.fit(scaler.transform(train_embedded), Y_tr)

    test_scaled = scaler.transform(embedder.transform(X_te))
    return compute_metrics(model.predict(test_scaled), Y_te)
def IrisMatchingRed1(train_features, train_classes, test_features,
                     test_classes, n):
    """Reduce the features to ``n`` dimensions and score an RBF SVM.

    Below 108 the reducer is a ``LinearDiscriminantAnalysis`` fitted on the
    labelled training set; from 108 up to (but not including) 323 it is a
    ``LocallyLinearEmbedding`` with ``n + 1`` neighbours; for any other
    ``n`` the features are used unreduced (as copies).

    :param train_features: training feature matrix
    :param train_classes: labels of the training rows
    :param test_features: test feature matrix
    :param test_classes: labels of the test rows
    :param n: target dimensionality (also selects the reducer)
    :return: fraction of test rows classified correctly
    """
    reduced_train = train_features.copy()
    reduced_test = test_features.copy()
    n_test = float(len(test_classes))

    reducer = None
    if n < 108:
        reducer = LinearDiscriminantAnalysis(n_components=n)
        reducer.fit(train_features, train_classes)
    elif n < 323:
        reducer = LocallyLinearEmbedding(n_neighbors=n + 1, n_components=n)
        reducer.fit(train_features)

    if reducer is not None:
        reduced_train = reducer.transform(train_features)
        reduced_test = reducer.transform(test_features)

    svm = SVC(kernel='rbf')
    svm.fit(reduced_train, train_classes)
    hits = np.sum(svm.predict(reduced_test) == test_classes)
    return float(hits) / n_test
def LLE10FoldClf(X, y, nclf):
    """10-fold cross-validate ``nclf`` on a 2-D standard LLE embedding.

    For each fold an LLE with 30 neighbours is fitted on the training rows;
    both training and test rows are projected with it, ``nclf`` is fitted
    on the projected training rows and scored on the projected test rows.

    NOTE(review): ``KFold`` is called with the ``(n, n_folds=...)``
    signature and iterated directly, so this relies on whichever ``KFold``
    the file imports supporting that form - confirm.

    :param X: sample matrix, indexable by an index array
    :param y: label array, indexable by an index array
    :param nclf: classifier exposing ``fit`` and ``predict``
    :return: ``(mean accuracy, standard deviation of accuracy)``
    """
    n_neighbors = 30
    fold_scores = []
    for train_index, test_index in KFold(X.shape[0], n_folds=10,
                                         shuffle=True):
        embedder = LocallyLinearEmbedding(n_neighbors=n_neighbors,
                                          n_components=2, method='standard')
        train_rows = X[train_index]
        embedder.fit(train_rows)
        train_rep = embedder.transform(train_rows)
        test_rep = embedder.transform(X[test_index])

        nclf.fit(train_rep, y[train_index])
        predicted = nclf.predict(test_rep)

        expected = y[test_index]
        fold_scores.append(
            np.sum(predicted == expected) * 1.0 / expected.shape[0])
    return np.mean(fold_scores), np.std(fold_scores)
def runLLE(X_train, X_test, y_train, y_test, comp_range, n_neigh):
    """Sweep LLE dimensionalities and score RBF and linear SVMs on each.

    For every ``n_comp`` in ``comp_range`` an LLE is fitted on the training
    set, both sets are projected, and ``SVMmodel.runSVM`` is run once per
    kernel with ``SVMmodel.getBestParam(kernel)``. The 2-D projections are
    additionally saved with ``np.save``. Afterwards one report file per
    kernel (``res_LLE_<kernel>_<n_neigh>.txt``) lists every score and the
    best one.

    :param X_train: training samples
    :param X_test: test samples
    :param y_train: training labels
    :param y_test: test labels
    :param comp_range: indexable sequence of component counts to try
    :param n_neigh: number of LLE neighbours (also used in file names)
    :return: ``(rbf_scores, linear_scores)`` - mean score per ``n_comp``
    """
    rbf_scores = []
    linear_scores = []
    per_kernel = (('rbf', rbf_scores), ('linear', linear_scores))

    for n_comp in comp_range:
        print("\nn_comp=%d\n" % (n_comp))
        embedder = LocallyLinearEmbedding(n_neighbors=n_neigh,
                                          n_components=n_comp, n_jobs=8)
        embedder.fit(X_train)
        train_proj = embedder.transform(X_train)
        test_proj = embedder.transform(X_test)

        if n_comp == 2:
            # Keep the 2-D projections around for later plotting.
            np.save('X_train_proj_2d_LLE_' + str(n_neigh), train_proj)
            np.save('X_test_proj_2d_LLE_' + str(n_neigh), test_proj)

        for kernel, collected in per_kernel:
            score = SVMmodel.runSVM(train_proj, test_proj, y_train, y_test,
                                    SVMmodel.getBestParam(kernel), kernel)
            collected.append(score.mean())

    for kernel, scores in per_kernel:
        best_idx = np.argmax(scores)
        best_n_comp = comp_range[best_idx]
        best_acc = scores[best_idx]
        report_path = 'res_LLE_' + kernel + '_' + str(n_neigh) + '.txt'
        with open(report_path, 'w') as report:
            for n_comp, acc in zip(comp_range, scores):
                report.write(kernel + ": n_comp = %f, acc = %f\n" %
                             (n_comp, acc))
            report.write(kernel + ": Best n_comp = %f\n" % (best_n_comp))
            report.write(kernel + ": acc = %f\n" % (best_acc))

    return rbf_scores, linear_scores
class _LocallyLinearEmbeddingImpl:
    """Thin adapter that owns an ``Op`` instance and forwards to it."""

    def __init__(self, **hyperparams):
        """Store the hyperparameters and build the wrapped ``Op`` from them."""
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model; ``y`` is forwarded only when given.

        Returns ``self``.
        """
        fit_args = (X,) if y is None else (X, y)
        self._wrapped_model.fit(*fit_args)
        return self

    def transform(self, X):
        """Return the wrapped model's ``transform`` of ``X``."""
        return self._wrapped_model.transform(X)
def main(args=None):
    """Embed the dataset with LLE and save/show a per-class scatter plot.

    Seeds ``random`` and NumPy with ``SEED``, loads ``(x, y)`` from
    ``DATAPATH``, converts the letter labels to 0-based integers
    (``ord(label) - 65``), fits an LLE with ``K`` components on ``x`` and
    plots the embedding in 2-D (``K == 2``) or 3-D (``K == 3``), one
    scatter call per class so that the legend shows the class letters.
    The figure is written to ``./plot/lle_<K>D.png`` and then shown.

    :param args: unused
    :raises NotImplementedError: when ``K`` is neither 2 nor 3
    """
    phase = "LLE"
    random.seed(SEED)
    np.random.seed(SEED)

    x, y = load_data(DATAPATH)
    y = np.asarray([ord(l) - 65 for l in y])

    # NOTE(review): the split below is never used - the embedding is fitted
    # on, and the plot shows, the whole of ``x``. Kept so behaviour does not
    # change; confirm whether fitting on ``x_train`` was intended.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.5,
                                                        random_state=SEED)

    PLOTPATH = "./plot/lle_" + str(K) + "D.png"

    lle = LocallyLinearEmbedding(n_components=K)
    lle.fit(x)
    x_transformed = lle.transform(x)

    c = np.asarray(COLORS)[y]  # colour of every sample, looked up by class
    s = np.full(N_SAMPLE, 2)   # marker size of every sample

    if K == 2:  # number of components = 2 (plot 2D)
        for i in range(N_CLASS):
            # Integer index array even when the class has no samples.
            indices = np.flatnonzero(y == i)
            # ``c`` is per-sample, so it must be indexed with the sample
            # indices of the class (``c[i]`` is the colour of sample i).
            plt.scatter(x_transformed[indices, 0], x_transformed[indices, 1],
                        label=(chr(i + 65)), s=s[indices], c=c[indices])
    elif K == 3:  # number of components = 3 (plot 3D)
        fig = plt.figure()
        ax = fig.add_subplot(111, projection='3d')
        for i in range(N_CLASS):
            indices = np.flatnonzero(y == i)
            ax.scatter(x_transformed[indices, 0], x_transformed[indices, 1],
                       x_transformed[indices, 2], label=(chr(i + 65)),
                       s=s[indices], c=c[indices], marker='.')
    else:
        raise NotImplementedError

    plt.legend(title="Classes", scatterpoints=1, loc='best', ncol=4,
               fontsize=8, markerscale=3)
    plt.title(phase)
    plt.savefig(PLOTPATH)
    plt.show()
def LLE(train_img, train_label, img, n_components, n_neighbors=201):
    """
    It transforms the feature vector to one in a low-dimensional feature space.

    The embedding is fitted on the training images and then applied to
    ``img``.

    :param train_img: feature vector of training images
    :param train_label: labels of training images (forwarded to ``fit``)
    :param img: feature vector of images to be transformed
    :param n_components: dimension of the new transformed feature vector
    :param n_neighbors: number of neighbours used to build the embedding;
        defaults to 201, the value that used to be hard-coded
    :return: transformed feature vector
    """
    embedding = LocallyLinearEmbedding(n_neighbors=n_neighbors,
                                       n_components=n_components)
    embedding.fit(train_img, train_label)
    return embedding.transform(img)
def preprocess(x_train: np.ndarray, y_train: np.ndarray, x_test: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """
    Preprocesses data: min-max scaling followed by an LLE embedding.

    Both steps are fitted on the training data and then applied to the
    test data. Unless ``PLOTTING_MODE`` is ``'none'``, the embedded
    training data is also drawn through the module-level ``plotter``.

    :param x_train: the training data.
    :param y_train: the training labels (only used for plotting).
    :param x_test: the test data.
    :return: Preprocessed x_train and x_test.
    """
    logger.log('Prepocessing...')

    # Scale data.
    logger.log('\tScaling data with params:')
    scaler = MinMaxScaler()
    logger.log('\t{}'.format(scaler.get_params()))
    scaled_train = scaler.fit_transform(x_train)
    scaled_test = scaler.transform(x_test)

    # Apply LLE.
    logger.log('\tApplying LLE with params:')
    embedding = LocallyLinearEmbedding(n_neighbors=100, n_jobs=-1,
                                       random_state=0)
    logger.log('\t' + str(embedding.get_params()))
    embedded_train = embedding.fit_transform(scaled_train)
    embedded_test = embedding.transform(scaled_test)

    if PLOTTING_MODE == 'none':
        return embedded_train, embedded_test

    # Plot the graph embedding result.
    plotter.subfolder = 'graphs/LLE'
    plotter.filename = 'embedding'
    plotter.xlabel = 'first feature'
    plotter.ylabel = 'second feature'
    plotter.title = 'LLE'
    plotter.scatter(embedded_train, y_train,
                    class_labels=helpers.datasets.get_gene_name)

    return embedded_train, embedded_test
X = scaler.fit_transform(X) X = pd.DataFrame(X, columns=x_columns) # separate the data into training and testing np.random.seed(1) test_idx = np.random.choice(a=X.index.values, size=int(X.shape[0] / 5), replace=False) train_idx = np.array(list(set(X.index.values) - set(test_idx))) # train a LocallyLinearEmbedding model n_comp = 1 # number of components component = LocallyLinearEmbedding(n_components=n_comp, n_neighbors=5, n_jobs=1, random_state=42) component.fit(X.iloc[train_idx, :]) # compute components for all the data, add cluster labels and train/test labels components = pd.DataFrame(component.transform(X), columns=["LC" + str(i + 1) for i in range(n_comp)]) components["Data"] = "Train" for j in test_idx: components.loc[j, "Data"] = "Test" # components.to_csv("lle.csv", index=False) # combine the data and components data = pd.concat([X, components], axis=1) # plot correlations corr_plot(data.drop(columns="Data")) # train a random forest to learn the clusters model = RandomForestRegressor(n_estimators=50, max_depth=10, min_samples_leaf=5, max_features="sqrt",
# Synthetic 3-D data: a sine curve in (x, z) spread out by uniform noise in y.
xs = np.linspace(0, 10, 1000)
zs = np.sin(xs)
ys = np.random.random(1000)

# The figure has to exist before the axes are requested; otherwise
# ``plt.axes`` creates a default-size figure of its own and the later
# ``plt.figure`` call only opens a second, empty window.
plt.figure(figsize=(20, 10))
ax = plt.axes(projection='3d')
# Three consecutive segments, one scatter call each so they get different colours.
ax.scatter(xs=xs[:300], ys=ys[:300], zs=zs[:300])
ax.scatter(xs=xs[300:600], ys=ys[300:600], zs=zs[300:600])
ax.scatter(xs=xs[600:], ys=ys[600:], zs=zs[600:])
plt.show()

x = np.vstack((xs, ys, zs)).T

# scikit-learn usage
n = 50  # number of neighbours
lle = LocallyLinearEmbedding(n_neighbors=n, n_components=2, method='standard')
lle.fit(x)
tranx = lle.transform(x)

# Plot the embedding, using the same three segments as above.
print(n)
plt.scatter(tranx[:300, 0], tranx[:300, 1])
plt.scatter(tranx[300:600, 0], tranx[300:600, 1])
plt.scatter(tranx[600:, 0], tranx[600:, 1])
plt.show()

# Hand-written implementation
m, n = np.shape(x)  # note: ``n`` now holds the column count, not the neighbour count
# 1. Compute W
k = 50  # number of neighbours
W = np.zeros((m, m))
for i in range(m):
    n_distance = np.zeros((m))
    xi = x[i, :]
from sklearn.manifold import LocallyLinearEmbedding
from astroML.datasets import fetch_sdss_specgals
from astroML.datasets import fetch_sdss_spectrum

# Load the galaxy catalogue and show which columns it provides.
data = fetch_sdss_specgals()
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(data.dtype.names)

ngals = 326
nwavel = 3855

# Identifiers and properties of the first ``ngals`` galaxies.
plates = data['plate'][:ngals]
mjds = data['mjd'][:ngals]
fiberIDs = data['fiberID'][:ngals]
h_alpha = data['h_alpha_flux'][:ngals]
bptclass = data['bptclass'][:ngals]

# One row per galaxy: its spectrum divided by its own mean.
specdata = np.zeros((ngals, nwavel))
for i, (plate, mjd, fiberID) in enumerate(zip(plates, mjds, fiberIDs)):
    tempdata = fetch_sdss_spectrum(plate, mjd, fiberID)
    specdata[i, :] = tempdata.spectrum/tempdata.spectrum.mean()

# Apply LLE
k = 7
for fignum, n in enumerate([2, 3]):
    # Keyword arguments: recent scikit-learn releases reject positional
    # hyperparameters, and the keywords are accepted by older ones too.
    lle = LocallyLinearEmbedding(n_neighbors=k, n_components=n)
    lle.fit(specdata)
    proj = lle.transform(specdata)

    # First two embedding coordinates, coloured by BPT class.
    pl.subplot(2, 1, fignum+1)
    pl.scatter(proj[:,0], proj[:,1], c=bptclass, s=50)
    pl.colorbar()

pl.show()
# Locally Linear Embedding: fit on a slice of the training set, then project
# the training and validation sets and plot the training projection.
from sklearn.manifold import LocallyLinearEmbedding

n_neighbors = 10
n_components = 2
method = 'modified'
n_jobs = 4
random_state = 2018

lle = LocallyLinearEmbedding(n_neighbors=n_neighbors,
                             n_components=n_components,
                             method=method,
                             random_state=random_state,
                             n_jobs=n_jobs)

# Only the rows selected by ``.loc[0:5000, :]`` are used for fitting.
# NOTE(review): ``.loc`` slices by label and includes the end label -
# confirm that the index of X_train makes this the intended subset.
lle.fit(X_train.loc[0:5000, :])

# The full training set and the validation set are projected with the
# model fitted on that subset; results are re-wrapped with the original indices.
X_train_lle = lle.transform(X_train)
X_train_lle = pd.DataFrame(data=X_train_lle, index=train_index)

X_validation_lle = lle.transform(X_validation)
X_validation_lle = pd.DataFrame(data=X_validation_lle, index=validation_index)

scatterPlot(X_train_lle, y_train, "Locally Linear Embedding")

# In[ ]:

# t-SNE
from sklearn.manifold import TSNE

# NOTE: ``n_components`` is reassigned here and shadows the LLE value above.
n_components = 2
learning_rate = 300
perplexity = 30
# Target dimensionalities to try: 20, 40, ..., 180.
nLocally_Linear = np.arange(20, 200, 20)
# Classifier name -> list; one list per classifier is created in the loop below.
data = {}
for k in nLocally_Linear:
    # The data is reloaded and re-split on every pass because the feature
    # variables are overwritten with their embedded versions further down.
    features, labels, vectorizer, selector, le, features_data = preprocess("pkl/article_2_people.pkl", "pkl/lable_2_people.pkl")
    features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(features, labels, test_size=0.1, random_state=42)

    # Fit the embedding on the training part only and time the fit.
    t0 = time()
    ll = LocallyLinearEmbedding(n_neighbors=15, n_components=k, eigen_solver='auto')
    ll.fit(features_train)
    print ("Dimension Reduction time:", round(time()-t0, 3), "s")
    features_train = ll.transform(features_train)
    features_test = ll.transform(features_test)

    for name, clf in [
        ('AdaBoostClassifier', AdaBoostClassifier(algorithm='SAMME.R')),
        ('BernoulliNB', BernoulliNB(alpha=1)),
        ('GaussianNB', GaussianNB()),
        ('DecisionTreeClassifier', DecisionTreeClassifier(min_samples_split=100)),
        ('KNeighborsClassifier', KNeighborsClassifier(n_neighbors=50, algorithm='ball_tree')),
        ('RandomForestClassifier', RandomForestClassifier(min_samples_split=100)),
        ('SVC', SVC(kernel='linear', C=1))
    ]:
        # ``in`` works on every Python version; ``dict.has_key`` exists
        # only on Python 2.
        if name not in data:
            data[name] = []
def train_NN_LLE(filename, X_train, X_test, y_train, y_test, debug=False,
                 numFolds=10, njobs=-1, scalar=1, make_graphs=False,
                 pNN=None, nolegend=False, random_seed=1, num_dim=4):
    """Embed the data with LLE, grid-search an MLP and report ROC-AUC scores.

    An LLE (10 neighbours, ``num_dim`` components) is fitted on the training
    set and applied to both sets. A ``GridSearchCV`` over ``MLPClassifier``
    hyperparameters is run and its results are written with
    ``util.save_gridsearch_to_csv``. A default ``MLPClassifier`` is then
    fitted on the embedded training set and scored (weighted one-vs-rest
    ROC-AUC) on both sets.

    NOTE(review): the classifier that is scored is the default-constructed
    ``nn_classifier``, not the grid search's best estimator - confirm this
    is intended.
    NOTE(review): ``start`` is reset before every timed step, so the
    returned duration covers only the work after the last reset.

    :param filename: name whose last four characters are stripped to build
        output names
    :param X_train: training samples
    :param X_test: test samples
    :param y_train: training labels
    :param y_test: test labels
    :param debug: forwarded as ``verbose`` / ``debug``
    :param numFolds: number of cross-validation folds for the grid search
    :param njobs: parallelism of the grid search and the learning curve
    :param scalar: forwarded to ``util.save_gridsearch_to_csv``
    :param make_graphs: when true, plot a learning curve
    :param pNN: optional ``MLPClassifier`` parameters; they are applied to
        a separate classifier instance that is not used for scoring
    :param nolegend: unused
    :param random_seed: seed for NumPy and the embedding
    :param num_dim: embedding dimensionality
    :return: ``(elapsed seconds, train score, test score)`` with both
        scores rounded to four decimals
    """
    # ``None`` replaces the former ``{}`` default, which was a single dict
    # object shared between all calls.
    pNN = {} if pNN is None else pNN

    np.random.seed(random_seed)
    algo = 'LLE' + str(num_dim)

    start = time.time()
    lle = LocallyLinearEmbedding(n_neighbors=10, n_components=num_dim,
                                 random_state=random_seed, n_jobs=-1)
    lle.fit(X_train)
    X_train = lle.transform(X_train)
    X_test = lle.transform(X_test)

    param_grid = [{
        'hidden_layer_sizes': [(512, 512, 512, 512)],
        'activation': ['relu'],  # 'identity',
        'solver': ['adam'],
        'alpha': [0.0001, 0.001, 0.01, 0.1],
        'batch_size': ['auto'],
        'learning_rate_init': [0.001, 0.01],
        'max_iter': [10000],
        'warm_start': [True],
        'early_stopping': [True],
        'random_state': [1]
    }]

    nn_classifier = MLPClassifier()
    grid_search = GridSearchCV(nn_classifier, param_grid, cv=numFolds,
                               scoring='roc_auc_ovr_weighted',
                               return_train_score=True, n_jobs=njobs,
                               verbose=debug)
    grid_search.fit(X_train, y_train)
    cvres = grid_search.cv_results_

    util.save_gridsearch_to_csv(cvres, algo,
                                filename[:-4] + '-' + str(num_dim), scalar, '')

    start = time.time()
    nn_classifier.fit(X_train, y_train)
    print('NN Fit Time: ', time.time() - start)

    start = time.time()
    y_prob = nn_classifier.predict_proba(X_train)
    train_score = roc_auc_score(y_train, y_prob, multi_class="ovr",
                                average="weighted")
    print('NN Train Score Time: ', train_score, time.time() - start)

    start = time.time()
    y_prob = nn_classifier.predict_proba(X_test)
    test_score = roc_auc_score(y_test, y_prob, multi_class="ovr",
                               average="weighted")
    print('NN Test Score Time: ', test_score, time.time() - start)

    # Not used afterwards, but ``set_params`` still rejects unknown keys.
    test_class = MLPClassifier()
    test_class.set_params(**pNN)

    if make_graphs:
        # compute Model Complexity/Validation curves
        util.plot_learning_curve(nn_classifier, algo, filename[:-4], X_train,
                                 y_train, ylim=(0.0, 1.05), cv=10,
                                 n_jobs=njobs, debug=debug)

    return time.time() - start, round(train_score, 4), round(test_score, 4)
def localLinearEmbedding(X, y):
    """Project ``X`` onto a single LLE component using the dense solver.

    The embedding is fitted on ``X`` and ``X`` is then passed through
    ``transform``.

    :param X: samples to embed
    :param y: unused
    :return: the transformed samples
    """
    embedder = LocallyLinearEmbedding(n_components=1, eigen_solver="dense")
    embedder.fit(X)
    return embedder.transform(X)
# 3-D view of the first three dimensions of the original data, coloured by
# class label, with the centres marked by red crosses.
fig = plt.figure(figsize=(6, 4))
axes3D = Axes3D(fig)
axes3D.scatter3D(gm_X[:, 0], gm_X[:, 1], gm_X[:, 2], marker='o', c=gm_colors[gm_y])
# The centres must be drawn through the 3-D axes: ``plt.scatter`` takes the
# marker size as its third positional argument, so the z coordinate was
# previously interpreted as a size.
axes3D.scatter3D(gm_centers[:, 0], gm_centers[:, 1], gm_centers[:, 2], marker='x', c='r')
plt.title("Orignal Axis Dist with Class Label.(First 3 dims)")
plt.show()

############# perform algorithm #############
# Fit a 2-D standard LLE on the data and project both data and centres.
gm_lle = LocallyLinearEmbedding(n_neighbors=30, n_components=2, method='standard', n_jobs=2, random_state=9)
gm_lle.fit(gm_X)
gm_S = gm_lle.transform(gm_X)
gm_Scenters = gm_lle.transform(gm_centers)

plt.scatter(gm_S[:, 0], gm_S[:, 1], marker='o', c=gm_colors[gm_y])
plt.scatter(gm_Scenters[:, 0], gm_Scenters[:, 1], marker='x', c='r')
# The projection shown is the LLE embedding computed above.
plt.title("LLE Axis Dist.( 2 dims)")
plt.show()
def main():
    """Command-line entry point: run one dimensionality reduction on a catalog.

    Parses ``--alg`` (default ``'MLLE'``) and a positional ``catalog`` path,
    reads the catalog with ``Table.read`` and, for each neighbourhood size in
    ``[10, 12, 15, 20]``, runs the branch selected by ``--alg``:

    * ``MLLE`` / ``LLE`` / ``LTSA`` / ``HLLE``: whiten the data, fit ``LLE``
      with the matching ``method``, save the result with ``save_dimreduce``
      and plot it in 3-D;
    * ``ISO``: fit an ``Isomap`` and plot the result;
    * ``LDA``: project with ``LDA`` and compare several classifiers, with
      interactive ``pdb.set_trace()`` stops in between.

    NOTE(review): this function uses Python 2 ``print`` statements.
    NOTE(review): ``sample``, ``train``, ``traincols``, ``targets``, ``Y2``
    and (in the LDA branch) ``A`` are not assigned anywhere in this
    function - presumably module globals or leftovers of commented-out
    code; confirm before relying on the ISO and LDA branches.
    """
    parser = argparse.ArgumentParser(description=
                                     'Perform Dimensionality Reduction')
    parser.add_argument('--alg', type=str, default='MLLE',
                        help='Algorithm to reduce dimensionality.')
    parser.add_argument('catalog', type=str,
                        help='Specify the catalog on which to perform DimReduce.')
    args = parser.parse_args()

    #dat = Table.read('catalogs/ZEST_catalog_colors.fits')
    #training_sample = dat[0:10000]
    #testing_sample = dat[10001:20000]
    #zkeys = ['cc', 'aa', 'm20', 'gg']

    # Output names are derived from the catalog file name without extension.
    base = os.path.basename(args.catalog)
    filename = os.path.splitext(base)[0]

    dat = Table.read(args.catalog)
    # Catalog columns used as input features.
    mkeys = ['elipt', 'C', 'A_1a', 'G', 'M20']#

    #dat.remove_column('color')
    # Add a colour column when the catalog lacks one, then write the
    # catalog back to the same path (overwriting it).
    if 'color' not in dat.colnames:
        if 'kaggle' in sample:
            dat = prep_catalog.color_data2(dat, 'gz2class')
        if 'direct' in sample:
            dat = prep_catalog.color_data(dat, 'zclass')
        dat.write(args.catalog, overwrite=True)

    #dat = prep_catalog.adjust_asym(dat, mkeys[2])
    #train, traincols, targets = prep_catalog.whiten_data(dat, mkeys)

    n_neighbors = [10,12,15,20]
    #n_neighbors = [7]
    n_components = 3

    for i, n_neigh in enumerate(n_neighbors):
        if args.alg in ['MLLE', 'LLE', 'LTSA', 'HLLE']:
            # Translate the command-line name into the ``method`` keyword.
            if args.alg == 'MLLE':
                method = 'modified'
            elif args.alg == 'LLE':
                method = 'standard'
            elif args.alg == 'LTSA':
                method = 'ltsa'
            elif args.alg == 'HLLE':
                method = 'hessian'

            #replace_panoptes(dat)
            #pdb.set_trace()
            #sample = 'directbig_panoptes'
            X, y = prep_catalog.whiten_data(dat, mkeys)

            (dat1, dat2),(thing1,thing2) = split_samples(dat, dat,[0.75, 0.35],
                                                         random_state=0)

            (X_train, X_test), (y_train, y_test) = split_samples(X, y,
                                                                 [0.75, 0.35], random_state=0)

            y_train = simplify_classlabels(y_train)
            y_test = simplify_classlabels(y_test)

            #filename = 'modified_7_directbig_new'

            # The split above is overridden: training uses the full sample.
            X_train = X
            y_train = simplify_classlabels(y)

            #'''
            #sample ='direct_zcut'

            #Y_train, Y_test = open_previous_LLE(filename)

            #cut = np.where(X1['REDSHIFT'] <= 0.05)
            #X1_cut = X1[cut]
            #QC_plots(X1_cut)
            #Y_train = np.array(Y_train)[cut]
            #col_train = np.array(col_train)[cut]
            #X = Table(X)
            #cut_out_mixedup_region(X, np.array(Y_train))

            #'''
            print "performing "+method+" LLE with",n_neigh,\
                "nearest neighbors"
            print "on training sample of",len(X_train),"objects"

            t0 = time()
            A = LLE(n_neigh, n_components, eigen_solver='auto', method=method)
            # NOTE(review): the model is fitted twice (``fit`` here and
            # ``fit_transform`` on the next line).
            error = A.fit(X_train).reconstruction_error_

            Y_train = A.fit_transform(X_train)
            # NOTE(review): ``Y_test`` is computed from ``X_train``, not
            # ``X_test`` - confirm whether that is intended.
            Y_test = A.transform(X_train)
            t1 = time()
            #'''

            metadata = {'method':method, 'N':n_neigh, 'd':n_components,
                        'error':error, 'time':t1-t0, 'sample':filename+'_total'}
            save_dimreduce(dat, Y_train, y_train, metadata, filename+'_total')

            #metadata = {'method':method, 'N':n_neigh, 'd':n_components,
            # 'error':error, 'time':t1-t0, 'sample':filename+'_test'}
            #save_dimreduce(X2, Y_test, y_test, metadata, filename+'_test')

            # plot in 3D
            plot_dimreduce_3D(Y_train, y_train[:,1], Y_test, y_test[:,1],
                              method, n_neigh, error, t1-t0, filename, two=False)

        #====================================================================#
        elif args.alg == 'ISO':
            method='IsoMap'

            print "performing IsoMap with",n_neigh,"nearest neighbors"
            print "on training sample of",len(dat),"objects"

            t0 = time()
            A = Isomap(n_neigh, n_components, eigen_solver='dense')
            error = A.fit(train).reconstruction_error()

            Y = A.fit_transform(train)
            #Y2 = A.transform(test)

            t1 = time()
            print "%s: %.2g sec" %(args.alg, t1-t0)
            print "reconstruction error: ", error

            print "begin plotting"
            plot_dimreduce(Y, traincols, method, n_neigh, sample, axis=0)
            plot_dimreduce(Y, traincols, method, n_neigh, sample, axis=1)
            plot_dimreduce(Y, traincols, method, n_neigh, sample, axis=2)
            # NOTE(review): ``(t1-t0)`` and ``error`` are passed in the
            # opposite order to the LLE branch's call - confirm against
            # the signature of ``plot_dimreduce_3D``.
            plot_dimreduce_3D(Y, traincols, Y, traincols, method, n_neigh,
                              (t1-t0), error, sample)

        elif args.alg == 'LDA':

            print "performing LDA"

            # NOTE(review): three values are unpacked here but two in the
            # LLE branch - confirm what ``whiten_data`` returns.
            X, Xc, y = prep_catalog.whiten_data(dat, mkeys)

            (X_train, X_test), (y_train, y_test) = split_samples(X, y,
                                                                 [0.75, 0.25], random_state=0)

            DRclf = LDA(3, priors=None)
            #DRclf.fit(X_train, y_train)
            # The projector is fitted on the training split for both calls.
            DRtrain = DRclf.fit(X_train, y_train).transform(X_train)
            DRtest = DRclf.fit(X_train, y_train).transform(X_test)

            classes = np.unique(y_train)
            colors = np.array(['darkred', 'red', 'lightsalmon',
                               'darkgreen', 'lightgreen', 'lightseagreen',
                               'indigo', 'darkviolet', 'plum'])
            plot_LDA_3D(DRtrain, y_train, classes, colors, sample)

            pdb.set_trace()

            #classifiers = []
            #predictions = []
            #Nparams = np.arange(1, X.shape[1]+1)
            #for nc in Nparams:
            # Classifier 1: LDA on the projected data; prints the number
            # of correctly predicted test objects.
            clf = LDA()
            clf.fit(DRtrain, y_train)
            y_pred = clf.predict(DRtest)

            matchesLDA = (y_pred == y_test)
            print np.sum(matchesLDA)

            pdb.set_trace()

            #------------------------------------------
            # Classifier 2: 5-nearest-neighbours.
            from sklearn.neighbors import KNeighborsClassifier
            knc = KNeighborsClassifier(5)
            knc.fit(DRtrain, y_train)
            y_pred = knc.predict(DRtest)

            matchesKNN = (y_pred == y_test)
            print np.sum(matchesKNN)

            pdb.set_trace()
            #------------------------------------------
            # Classifier 3: astroML's GMMBayes, constructed with argument 9.
            from astroML.classification import GMMBayes
            gmmb = GMMBayes(9)
            gmmb.fit(DRtrain, y_train)
            y_pred = gmmb.predict(DRtest)

            matchesGMMB = (y_pred == y_test)
            print np.sum(matchesGMMB)

            pdb.set_trace()

            #------------------------------------------
            # plot the results
            fig = plt.figure(figsize=(5, 2.5))
            fig.subplots_adjust(bottom=0.15, top=0.95, hspace=0.0,
                                left=0.1, right=0.95, wspace=0.2)

            # left plot: data and decision boundary
            ax = fig.add_subplot(121)
            pdb.set_trace()
            im = ax.scatter(X[:, 3], X[:, 4], color=Xc, cmap=plt.cm.Spectral,
                            s=4, lw=0) #cmap=plt.cm.binary,, zorder=2
            im.set_clim(-0.5, 1)

            #im = ax.imshow(Z, origin='lower', aspect='auto',
            # cmap=plt.cm.binary, zorder=1,
            # extent=xlim + ylim)
            #im.set_clim(0, 1.5)

            #ax.contour(xx, yy, Z, [0.5], colors='k')

            #ax.set_xlim(xlim)
            #ax.set_ylim(ylim)

            ax.set_xlabel('$G$')
            ax.set_ylabel('$M20$')

            #pred, true = classification_loss(predictions, y_test)
            #completeness, contamination = completeness_contamination(pred, true)

            pdb.set_trace()

            #'''
            #t0 = time()
            #A = LDA(n_components, priors=None)
            #Y = A.fit_transform(train, targets)
            #Y2 = A.fit(train, targets).transform(train)

            #t1 = time()
            #print "%s: %.2g sec" %(args.alg, t1-t0)

            # NOTE(review): ``A`` and ``train`` have no assignment in this
            # branch (the lines creating them are commented out above).
            predict = A.predict(train)
            #print "Predicted classes:", predict
            #pdb.set_trace()

            #pdb.set_trace()
            #'''

            # NOTE(review): ``Y2`` and ``targets`` are likewise only
            # mentioned in commented-out code.
            plot_LDA_3D(Y2, targets, classes, colors, sample)
            plot_LDA(Y2, targets, classes, colors, sample, axis=0)
            plot_LDA(Y2, targets, classes, colors, sample, axis=1)
            plot_LDA(Y2, targets, classes, colors, sample, axis=2)

            pdb.set_trace()
class Model(nn.Module):
    """``resnet12`` backbone with a 1x1-conv classifier and a similarity scorer.

    In training mode ``forward`` returns temperature-scaled classifier
    logits for the test images. In eval mode it returns similarity scores
    between test features and per-class aggregated train features; for the
    ``'CBM'`` and ``'CBM_LLE'`` methods that score is blended (weight
    ``args.alpha``) with a second score computed from each feature's
    similarities to a set of stored prototypes (``base_proto``).
    """

    def __init__(self, args):
        """Build the backbone and classifier and, if needed, load prototypes.

        Reads ``temperature``, ``num_classes`` and ``method`` from ``args``;
        for the CBM methods also ``save_dir`` and ``similarityOnBase``, and
        for ``'CBM_LLE'`` additionally ``k``, ``dim`` and ``L2``.
        """
        super(Model, self).__init__()
        self.temperature = args.temperature
        self.base = resnet12()
        self.nFeat = self.base.nFeat
        self.clasifier = nn.Conv2d(self.nFeat, args.num_classes, kernel_size=1)
        self.args = args
        if (args.method in {'CBM', 'CBM_LLE'}):
            # NOTE(review): ``pickle.load`` can execute arbitrary code; the
            # file must come from a trusted source.
            with open(osp.join(args.save_dir, 'base_proto.pickle'), 'rb') as fo:
                self.base_proto = pickle.load(fo)  # [64 512]
            if (args.method == 'CBM_LLE'):
                # Fit the LLE on the prototypes and replace them with their
                # embedding; the fitted LLE is reused in ``calVarPhi``.
                self.LLE = LocallyLinearEmbedding(n_neighbors=args.k,
                                                  n_components=args.dim)
                if (args.L2):
                    self.base_proto = F.normalize(self.base_proto, p=2, dim=-1)
                self.base_proto = torch.from_numpy(
                    self.LLE.fit_transform(
                        self.base_proto.cpu().numpy())).cuda()
            # Leading axis added so the prototypes broadcast against
            # features that get an extra axis at position 1 in ``calVarPhi``.
            self.base_proto = self.base_proto.unsqueeze(0)
            if (self.args.similarityOnBase == 'cosine'):
                self.base_proto = F.normalize(self.base_proto, p=2, dim=-1)

    def test(self, ftrain, ftest, batch_size, num_way, num_test):
        """Score test features against train features (eval-mode path).

        The last two axes of both inputs are averaged first. Returns
        ``calPhi`` alone, or ``alpha * phi + (1 - alpha) * varPhi`` for the
        CBM methods.
        """
        ftrain = ftrain.mean((-1, -2))
        ftest = ftest.mean((-1, -2))
        phi = self.calPhi(ftrain, ftest, batch_size, num_way, num_test)
        if (self.args.method in {'CBM', 'CBM_LLE'}):
            varPhi = self.calVarPhi(ftrain, ftest, batch_size, num_way,
                                    num_test)
            return self.args.alpha * phi + (
                1 - self.args.alpha) * varPhi  # [4 30 5]
        else:
            return phi

    def calPhi(self, ftrain, ftest, batch_size, num_way, num_test):
        """Cosine similarity between every test and every train feature.

        Both inputs are reshaped to broadcast against each other
        (``[batch, 1, num_way, -1]`` vs ``[batch, num_test, 1, -1]``),
        L2-normalised on the last axis and multiplied-and-summed over it.
        """
        ftrain = ftrain.view(batch_size, 1, num_way, -1)
        ftest = ftest.view(batch_size, num_test, 1, -1)
        ftrain = F.normalize(ftrain, p=2, dim=-1)
        ftest = F.normalize(ftest, p=2, dim=-1)
        scores = torch.sum(ftest * ftrain, dim=-1)  # [4 30 5]
        return scores

    def calVarPhi(self, ftrain, ftest, batch_size, num_way, num_test):
        """Compare features through their similarities to the prototypes.

        Each feature is first turned into a vector of similarities to
        ``self.base_proto`` (cosine or negative Euclidean distance,
        optionally soft-maxed); train and test vectors are then compared
        with the measure chosen by ``args.similarityOfDistribution``
        (``'cosine'``, ``'Euclidean'``, anything else: negative KL
        divergence of the soft-maxed vectors).
        """
        if (self.args.method == 'CBM_LLE'):
            # Project the features with the LLE fitted on the prototypes
            # (round trip through NumPy on the CPU).
            if (self.args.L2):
                ftrain = F.normalize(ftrain, p=2, dim=-1)
                ftest = F.normalize(ftest, p=2, dim=-1)
            ftrain = torch.from_numpy(self.LLE.transform(
                ftrain.cpu().numpy())).cuda()
            ftest = torch.from_numpy(self.LLE.transform(
                ftest.cpu().numpy())).cuda()

        # Extra axis so each feature broadcasts against all prototypes.
        ftrain = ftrain.unsqueeze(1)
        ftest = ftest.unsqueeze(1)

        if (self.args.similarityOnBase == 'cosine'):
            ftrain = F.normalize(ftrain, p=2, dim=-1)
            ftrain = (ftrain * self.base_proto).sum(-1)
            ftest = F.normalize(ftest, p=2, dim=-1)
            ftest = (ftest * self.base_proto).sum(-1)
        else:  # Euclidean
            ftrain = -(ftrain - self.base_proto).norm(dim=-1)
            ftest = -(ftest - self.base_proto).norm(dim=-1)

        if (self.args.softmax):
            ftrain = F.softmax(ftrain, dim=-1)
            ftest = F.softmax(ftest, dim=-1)

        if (self.args.similarityOfDistribution == 'cosine'):
            ftrain = F.normalize(ftrain, p=2,
                                 dim=-1).view(batch_size, 1, num_way, -1)
            ftest = F.normalize(ftest, p=2,
                                dim=-1).view(batch_size, num_test, 1, -1)
            scores = (ftrain * ftest).sum(-1)
        elif (self.args.similarityOfDistribution == 'Euclidean'):
            ftrain = F.normalize(ftrain, p=2,
                                 dim=-1).view(batch_size, 1, num_way, -1)
            ftest = F.normalize(ftest, p=2,
                                dim=-1).view(batch_size, num_test, 1, -1)
            scores = -(ftrain - ftest).norm(dim=-1)
        else:  # KL
            # ``ftest`` holds log-probabilities here, so the expression is
            # the negated sum of p_train * (log p_train - log p_test).
            ftrain = F.softmax(ftrain, dim=-1).view(batch_size, 1, num_way, -1)
            ftest = F.softmax(ftest, dim=-1).view(batch_size, num_test, 1,
                                                  -1).log()
            scores = -(ftrain * (ftrain.log() - ftest)).sum(dim=-1)
        return scores

    def forward(self, xtrain, xtest, ytrain, ytest):
        """Run the backbone on train and test images and score or classify.

        ``ytrain`` is used as a weight matrix to average the train features
        per class (presumably one-hot labels - confirm). The incoming
        ``ytest`` is not read; in training mode the name is reused for the
        classifier output.

        Returns the similarity scores from ``test`` in eval mode, or the
        temperature-scaled classifier output for the test features in
        training mode.
        """
        batch_size, num_train = xtrain.size(0), xtrain.size(1)
        num_test = xtest.size(1)
        num_way = ytrain.size(2)
        ytrain = ytrain.transpose(1, 2)

        # Fold the batch and sample axes together and run train and test
        # images through the backbone in one pass.
        xtrain = xtrain.view(-1, xtrain.size(2), xtrain.size(3),
                             xtrain.size(4))
        xtest = xtest.view(-1, xtest.size(2), xtest.size(3), xtest.size(4))
        x = torch.cat((xtrain, xtest), 0)
        f = self.base(x)

        # Weighted average of the train features per class: multiply by
        # ``ytrain`` and divide by its row sums.
        ftrain = f[:batch_size * num_train]
        ftrain = ftrain.view(batch_size, num_train, -1)
        ftrain = torch.bmm(ytrain, ftrain)
        ftrain = ftrain.div(ytrain.sum(dim=2, keepdim=True).expand_as(ftrain))
        ftrain = ftrain.view(-1, *f.size()[1:])  # [4*5 512 6 6]

        ftest = f[batch_size * num_train:]
        ftest = ftest.view(-1, *f.size()[1:])  # [4*30 512 6 6]

        if not self.training:
            score = self.test(ftrain, ftest, batch_size, num_way, num_test)
            # score = score.view(batch_size*num_test, num_way)
            return score
        else:
            ytest = self.clasifier(ftest) * self.temperature  # [4*30 64 6 6]
            return ytest
# Baseline: score the classifier in the original feature space. The timer
# start (``origin_time_start``) is set before this fragment.
prediction = clf.predict(X_test)
origin_time_end = time.time()
acc_origin_space = metrics.accuracy_score(Y_test, prediction)
time_elapse = (origin_time_end - origin_time_start) * 1000  # seconds -> ms
print('原始空间的准确率:%.4f, 原始空间数据维度:%d, 耗时:%d ms。' % (acc_origin_space, n_features, time_elapse))

# TODO: reduce the dimensionality of the data using lda
# NOTE(review): the code below uses LocallyLinearEmbedding, not LDA.
subspace_dim = 56
lle_model = LocallyLinearEmbedding(n_components=subspace_dim, n_neighbors=5, random_state=4399)
lle_model.fit(X_train)
X_train_new = lle_model.transform(X_train)
X_test_new = lle_model.transform(X_test)

# TODO: classification performance in the subspace
# Only classifier fitting and prediction are timed, not the embedding above.
subspace_time_start = time.time()
clf_new = KNeighborsClassifier(n_neighbors=5, weights='distance')
clf_new.fit(X_train_new, Y_train)
prediction_subspace = clf_new.predict(X_test_new)
subspace_time_end = time.time()
acc_subspace_score = metrics.accuracy_score(Y_test, prediction_subspace)
time_elapse = (subspace_time_end - subspace_time_start) * 1000  # seconds -> ms
print('子空间的准确率:%.4f, 子空间数据维度:%d, 耗时:%d ms。' % (acc_subspace_score, subspace_dim, time_elapse))
class Cluster:
    """Locally Linear Embedding followed by a hand-written spectral clustering.

    The data is first embedded in 2-D with ``LocallyLinearEmbedding``; a
    similarity graph is built on the embedding, its Laplacian is
    eigen-decomposed and the resulting spectral coordinates are clustered
    with k-means.
    """

    def __init__(self):
        """Initialise the state shared by the pipeline steps."""
        self.lle = None  # LocallyLinearEmbedding, created and fitted lazily by LLE()
        self.n_clusters = None  # set by train()
        self.size = None  # number of training samples, set by train()
        self.iterations = None  # 1..size, x-axis of the eigenvalue plot
        self.affinity = ['rbf', 'nearest_neighbors']

    def train(self, x_train, y_train, x_test, y_test, n_clusters=2):
        """Embed the data with LLE, plot it and run the spectral clustering.

        ``y_test`` is accepted for interface compatibility but is not used.
        ``y_train`` is modified in place by ``SpectralClustering`` (labels
        equal to -1 become 0).
        """
        self.n_clusters = n_clusters
        self.size = len(x_train)
        # 1, 2, ..., size as floats (same values the original element loop built).
        self.iterations = np.arange(1, self.size + 1, dtype=np.float64)

        # The first call fits the embedding; the second only transforms.
        x_train = self.LLE(x_train)
        # The embedded test data is not used further in this method.
        x_test = self.LLE(x_test)

        self.visualize2D(x_train[:, 0], x_train[:, 1], c=y_train,
                         title='Training data')
        self.SpectralClustering(x_train, y_train)

    def SpectralClustering(self, x_train, y_train,
                           affinity='nearest_neighbors'):
        """Cluster ``x_train`` spectrally and print macro precision/recall/F1.

        ``affinity`` selects the similarity graph: ``'nearest_neighbors'``
        (the default) uses ``NNGraph``, anything else uses the RBF
        ``SimilarityMatrix``. ``y_train`` is modified in place: every label
        equal to -1 is rewritten to 0 before scoring.
        """
        if affinity == 'nearest_neighbors':
            similarity_matrix = self.NNGraph(x_train)
        else:
            similarity_matrix = self.SimilarityMatrix(x_train)

        # The normalised Laplacian comes from scipy; the degree matrix the
        # original code also computed here was never used and is dropped.
        laplacian_matrix = csgraph.laplacian(similarity_matrix, normed=True)
        y_spec = self.transformDataToLaplacian(laplacian_matrix)

        # `precompute_distances` was removed from KMeans in scikit-learn 1.0;
        # omitting it keeps the old 'auto' behaviour on earlier releases.
        model = cluster.KMeans(n_clusters=self.n_clusters, random_state=0)
        predicted = model.fit(y_spec).labels_
        print(predicted)
        self.visualize2D(x_train[:, 0], x_train[:, 1], c=predicted,
                         title='Custom SpectralClustering')

        for i in range(len(y_train)):
            if y_train[i] == -1:
                y_train[i] = 0
        print(metrics.precision_recall_fscore_support(y_train, predicted,
                                                      average='macro'))
        # Run with sklearns Spectral Clustering
        #self.SklearnSP(x_train)

    def transformDataToLaplacian(self, laplacian_matrix, n_vectors=5):
        """Return the spectral coordinates of every sample.

        The eigenvectors belonging to the ``n_vectors`` smallest eigenvalues
        are selected and the first of them is discarded (its eigenvalue is
        close or equal to 0), giving ``n_vectors - 1`` columns. The sorted
        eigenvalues are plotted as a side effect.
        """
        eigval, eigvec = np.linalg.eig(laplacian_matrix)
        order = np.argsort(eigval)

        self.visualize2D(self.iterations, np.sort(eigval))

        # np.asscalar was removed in NumPy 1.23; plain fancy indexing selects
        # the same columns. Only the real part is kept, matching the float64
        # output array of the original implementation.
        selected = order[1:n_vectors]
        return np.real(eigvec[:, selected]).astype(np.float64)

    def LLE(self, data):
        """Project ``data`` to 2-D; the embedding is fitted on the first call only."""
        if self.lle is None:
            self.lle = LocallyLinearEmbedding(n_components=2)
            self.lle.fit(data)
        return self.lle.transform(data)

    def NNGraph(self, data, limit=0.4):
        """Return a dense radius-neighbours graph of ``data``.

        Points at most ``limit`` apart (Euclidean) are connected and the
        entry holds their distance; every other entry is zero.
        NOTE(review): the entries are distances, not similarities, yet they
        are used as affinity weights -- confirm this is intended.
        """
        graph = radius_neighbors_graph(data, limit, mode='distance',
                                       metric='minkowski', p=2,
                                       metric_params=None, include_self=False)
        return graph.toarray()

    def SimilarityMatrix(self, data, limit=0.4):
        """Return the pairwise RBF similarity matrix (sigma = 0.5, zero diagonal).

        ``limit`` is accepted for interface compatibility but is not used.
        """
        size = len(data)
        similarity_matrix = np.zeros((size, size), dtype=np.float64)
        # The kernel is symmetric, so each pair is evaluated once and mirrored.
        for i in range(size):
            for j in range(i + 1, size):
                value = self.rbf(data[i], data[j], 0.5)
                similarity_matrix[i][j] = value
                similarity_matrix[j][i] = value
        return similarity_matrix

    def DegreeMatrix(self, similarity_matrix):
        """Return the diagonal matrix holding the row sums of ``similarity_matrix``."""
        weights = np.asarray(similarity_matrix, dtype=np.float64)
        return np.diag(weights.sum(axis=1))

    def LaplacianMatrix(self, similarity_matrix, degree_matrix):
        """Return ``D^-1/2 . W . D^-1/2`` where D holds the column sums of W.

        ``degree_matrix`` is accepted for interface compatibility but is not
        used; the degrees are recomputed from ``similarity_matrix``. Nodes
        with zero degree get a zero scaling factor instead of the inf/NaN the
        plain ``w ** -0.5`` produced.
        """
        weights = np.asarray(similarity_matrix, dtype=np.float64)
        degrees = weights.sum(axis=0)
        inv_sqrt = np.zeros_like(degrees)
        connected = degrees > 0
        inv_sqrt[connected] = degrees[connected] ** -0.5
        scaling = np.diag(inv_sqrt)
        return scaling.dot(weights).dot(scaling)

    def SklearnSP(self, x_train):
        """Run sklearn's SpectralClustering for comparison, plot and return its labels."""
        model = cluster.SpectralClustering(n_clusters=self.n_clusters,
                                           affinity='rbf')
        # fit_predict already fits the model; the extra fit() call was redundant.
        y_predict = model.fit_predict(x_train)
        # The class has no `visualize` method; visualize2D is the plotting helper.
        self.visualize2D(x_train[:, 0], x_train[:, 1], c=y_predict,
                         title='SKLearn SpectralClustering')
        return y_predict

    def rbf(self, a, b, sigma):
        """Return exp(-||a - b||^2 / sigma^2)."""
        distance = self.VectorLength(self.VectorSub(a, b))
        return math.exp(-math.pow(distance, 2) / math.pow(sigma, 2))

    def VectorLength(self, v):
        """Return the Euclidean length of vector ``v``."""
        return math.sqrt(sum(item * item for item in v))

    def VectorSub(self, a, b):
        """Return ``a - b`` as a float64 array, or None when the lengths differ."""
        if len(a) != len(b):
            return None
        return np.array([x - y for x, y in zip(a, b)], dtype=np.float64)

    def visualize2D(self, x, y, c=None, title='', filename=None):
        """Scatter-plot ``x`` against ``y``, optionally coloured by ``c``.

        The figure is written to ``filename + '.png'`` when ``filename`` is
        given, otherwise it is shown interactively.
        """
        fig, ax = plt.subplots(figsize=(13, 6))
        ax.set_title(title, fontsize=18)
        dot_size = 50

        if c is not None:
            # One scatter call is enough: the previous loop over
            # range(n_clusters - 1) re-plotted with empty colour slices and
            # drew nothing at all when n_clusters was 1.
            colours = c if self.size is None else c[:self.size]
            ax.scatter(x, y, c=colours, s=dot_size, cmap='viridis')
        else:
            ax.scatter(x, y, s=dot_size)

        if filename is not None:
            # Use the same `plt` alias as the rest of the method and release
            # the figure once it is on disk.
            plt.savefig(filename + '.png')
            plt.clf()
            plt.close(fig)
        else:
            plt.show()
# Hyper-parameters for the (modified) Locally Linear Embedding below.
n_neighbors = 10
n_components = 2
method = 'modified'
n_jobs = 4
random_state = 2018

# Create the instance
lle = LocallyLinearEmbedding(n_neighbors=n_neighbors, n_components=n_components, method=method, random_state=random_state, n_jobs=n_jobs)

# Run LLE: fit on the `.loc[0:5000]` slice only, then embed the whole set.
# NOTE(review): assumes X_train is a pandas DataFrame, where `.loc` slicing is
# label-based and includes the end label -- confirm.
lle.fit(X_train.loc[0:5000, :])
X_train_lle = lle.transform(X_train)
X_train_lle = pd.DataFrame(data=X_train_lle, index=train_index)

# Show the plot
scatterPlot(X_train_lle, y_train, "Locally Linear Embedding")

# 3.9 t-SNE ------------------------------------------------------------
# <Key points>
# -
# t-SNE
from sklearn.manifold import TSNE
from data.preprocess import features_preprocess, features_test_preprocess, labels_preprocess

# Load every label, then split: the first 38 entries are the training labels
# and the remainder are the test labels.
labels_all = labels_preprocess()
labels_train = labels_all[:38]
labels_test = labels_all[38:]
features_train = features_preprocess()
features_test = features_test_preprocess()
# Prints the length of the training sample at index 1.
print("Dimensionality = ", len(features_train[1]))
# The string literal below is disabled min-max scaling code kept as a no-op
# expression statement.
""" scaler = MinMaxScaler() features_train = scaler.fit_transform(features_train) features_test = scaler.fit_transform(features_test) """
# Reduce both splits to 10 dimensions with an embedding fitted on the training split.
# NOTE(review): LLE is unsupervised, so the labels passed to fit_transform are
# presumably ignored -- confirm against the scikit-learn documentation.
embedding = LocallyLinearEmbedding(n_components=10, n_neighbors=5)
features_train = embedding.fit_transform(features_train, labels_train)
features_test = embedding.transform(features_test)
# Linear SVM on the embedded features; report the test accuracy.
clf = svm.SVC(kernel='linear')
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
score = accuracy_score(labels_test, pred)
print(score)
def _load_pickle(path):
    """Return the object pickled at ``path``; the file is closed even on error."""
    # NOTE: unpickling can execute arbitrary code -- only use this on cache
    # files written by this project.
    with open(path, 'rb') as file:
        return pickle.load(file)


def _stack_mnist_samples(samples):
    """Convert a list of ``(label, pixels)`` pairs into ``(X, y)`` float arrays.

    ``X`` has one flattened 28*28 image per row and ``y`` is 1-D. The arrays
    are preallocated instead of being grown with ``np.vstack`` per sample,
    which copied the whole array on every iteration.
    """
    number_of_samples = len(samples)
    dimension_of_data = 28 * 28
    X = np.empty((number_of_samples, dimension_of_data))
    y = np.empty(number_of_samples)
    for sample_index, (label, pixels) in enumerate(samples):
        print('sample ' + str(sample_index) + ' from ' + str(number_of_samples) + ' samples...')
        X[sample_index, :] = np.reshape(pixels, dimension_of_data)
        y[sample_index] = label
    return X, y


def main():
    """Load a dataset, embed it, classify it with naive Bayes and report accuracy.

    All settings are hard-coded at the top of the function. Results are
    pickled and written as text under ``./output/MNIST/``.
    """
    # ----- settings:
    dataset = 'MNIST'  # --> 'Facial' or 'MNIST' or 'Breast_cancer'
    embedding_method = 'Isomap'
    n_components = 5
    split_in_cross_validation_again = False
    load_dataset_again = False
    subset_of_MNIST = True
    pick_subset_of_MNIST_again = False
    MNIST_subset_cardinality_training = 10000  # picking from first samples of 60,000 samples
    MNIST_subset_cardinality_testing = 5000  # picking from first samples of 10,000 samples

    # ----- paths:
    if dataset == 'Facial':
        path_dataset = './input/att_database/'
        path_dataset_save = './input/pickle_dataset/Facial/'
    elif dataset == 'MNIST':
        path_dataset = './input/mnist/'
        path_dataset_save = './input/pickle_dataset/MNIST/'
    elif dataset == 'Breast_cancer':
        path_dataset = './input/Breast_cancer_dataset/wdbc_data.txt'
        # NOTE(review): this points at the MNIST cache directory -- looks like
        # a copy-paste slip; it is not read in the Breast_cancer branch below.
        path_dataset_save = './input/pickle_dataset/MNIST/'

    # ----- Loading dataset:
    print('Reading dataset...')
    if dataset == 'MNIST':
        if load_dataset_again:
            training_data = list(
                read_MNIST_dataset(dataset="training", path=path_dataset))
            testing_data = list(
                read_MNIST_dataset(dataset="testing", path=path_dataset))
            X_train, y_train = _stack_mnist_samples(training_data)
            X_test, y_test = _stack_mnist_samples(testing_data)
            save_variable(X_train, 'X_train', path_to_save=path_dataset_save)
            save_variable(y_train, 'y_train', path_to_save=path_dataset_save)
            save_variable(X_test, 'X_test', path_to_save=path_dataset_save)
            save_variable(y_test, 'y_test', path_to_save=path_dataset_save)
        else:
            X_train = _load_pickle(path_dataset_save + 'X_train.pckl')
            y_train = _load_pickle(path_dataset_save + 'y_train.pckl')
            X_test = _load_pickle(path_dataset_save + 'X_test.pckl')
            y_test = _load_pickle(path_dataset_save + 'y_test.pckl')
        if subset_of_MNIST:
            if pick_subset_of_MNIST_again:
                X_train_picked = X_train[
                    0:MNIST_subset_cardinality_training, :]
                X_test_picked = X_test[0:MNIST_subset_cardinality_testing, :]
                y_train_picked = y_train[0:MNIST_subset_cardinality_training]
                y_test_picked = y_test[0:MNIST_subset_cardinality_testing]
                save_variable(X_train_picked, 'X_train_picked',
                              path_to_save=path_dataset_save)
                save_variable(X_test_picked, 'X_test_picked',
                              path_to_save=path_dataset_save)
                save_variable(y_train_picked, 'y_train_picked',
                              path_to_save=path_dataset_save)
                save_variable(y_test_picked, 'y_test_picked',
                              path_to_save=path_dataset_save)
            else:
                X_train_picked = _load_pickle(path_dataset_save + 'X_train_picked.pckl')
                X_test_picked = _load_pickle(path_dataset_save + 'X_test_picked.pckl')
                y_train_picked = _load_pickle(path_dataset_save + 'y_train_picked.pckl')
                y_test_picked = _load_pickle(path_dataset_save + 'y_test_picked.pckl')
            X_train = X_train_picked
            X_test = X_test_picked
            y_train = y_train_picked
            y_test = y_test_picked
        image_shape = (28, 28)
    elif dataset == 'Facial':
        if load_dataset_again:
            X, y, image_shape = read_image_dataset(dataset_path=path_dataset,
                                                   imagesType='.jpg')
            save_variable(variable=X, name_of_variable='X',
                          path_to_save=path_dataset_save)
            save_variable(variable=y, name_of_variable='y',
                          path_to_save=path_dataset_save)
            save_variable(variable=image_shape,
                          name_of_variable='image_shape',
                          path_to_save=path_dataset_save)
        else:
            X = _load_pickle(path_dataset_save + 'X.pckl')
            y = _load_pickle(path_dataset_save + 'y.pckl')
            image_shape = _load_pickle(path_dataset_save + 'image_shape.pckl')
    elif dataset == 'Breast_cancer':
        # read text file using pandas dataFrame: https://stackoverflow.com/questions/21546739/load-data-from-txt-with-pandas
        data = pd.read_csv(path_dataset, sep=",", header=None)
        labels_of_classes = ['M', 'B']
        X, y = read_BreastCancer_dataset(data=data,
                                         labels_of_classes=labels_of_classes)
        # ---> otherwise MDS has error --> https://stackoverflow.com/questions/16990996/multidimensional-scaling-fitting-in-numpy-pandas-and-sklearn-valueerror
        X = X.astype(np.float64)

    # --- cross validation:
    path_to_save = './input/split_data/'
    portion_of_test_in_dataset = 0.3
    number_of_folds = 10
    if split_in_cross_validation_again:
        # NOTE(review): X and y are only assigned in the Facial and
        # Breast_cancer branches above, so this path fails for MNIST.
        train_indices_in_folds, test_indices_in_folds, \
        X_train_in_folds, X_test_in_folds, y_train_in_folds, y_test_in_folds = \
            cross_validation(X=X, y=y, n_splits=number_of_folds,
                             test_size=portion_of_test_in_dataset)
        save_variable(train_indices_in_folds, 'train_indices_in_folds',
                      path_to_save=path_to_save)
        save_variable(test_indices_in_folds, 'test_indices_in_folds',
                      path_to_save=path_to_save)
        save_variable(X_train_in_folds, 'X_train_in_folds',
                      path_to_save=path_to_save)
        save_variable(X_test_in_folds, 'X_test_in_folds',
                      path_to_save=path_to_save)
        save_variable(y_train_in_folds, 'y_train_in_folds',
                      path_to_save=path_to_save)
        save_variable(y_test_in_folds, 'y_test_in_folds',
                      path_to_save=path_to_save)
        for fold_index in range(number_of_folds):
            save_np_array_to_txt(
                np.asarray(train_indices_in_folds[fold_index]),
                'train_indices_in_fold' + str(fold_index),
                path_to_save=path_to_save)
            save_np_array_to_txt(
                np.asarray(test_indices_in_folds[fold_index]),
                'test_indices_in_folds' + str(fold_index),
                path_to_save=path_to_save)
    else:
        # The fold data is loaded (so the files must exist) but is not used
        # by the rest of this function.
        train_indices_in_folds = _load_pickle(path_to_save + 'train_indices_in_folds.pckl')
        test_indices_in_folds = _load_pickle(path_to_save + 'test_indices_in_folds.pckl')
        X_train_in_folds = _load_pickle(path_to_save + 'X_train_in_folds.pckl')
        X_test_in_folds = _load_pickle(path_to_save + 'X_test_in_folds.pckl')
        y_train_in_folds = _load_pickle(path_to_save + 'y_train_in_folds.pckl')
        y_test_in_folds = _load_pickle(path_to_save + 'y_test_in_folds.pckl')

    # NOTE(review): X_train / X_test are only assigned in the MNIST branch, so
    # the two prints below fail for the other datasets.
    print(X_train.shape)
    print(X_test.shape)

    # ----- embedding:
    print('Embedding...')
    if dataset == 'MNIST':
        # plot_components(X_projected=X_projected, images=X.reshape((-1, image_shape[0], image_shape[1])), ax=ax, image_scale=0.6, markersize=10, thumb_frac=0.05, cmap='gray_r')
        # ----- embedding:
        if embedding_method == 'LLE':
            clf = LLE(n_neighbors=5, n_components=n_components,
                      method='standard')
            clf.fit(X=X_train)
            X_train_projected = clf.transform(X=X_train)
            X_test_projected = clf.transform(X=X_test)
        elif embedding_method == 'Isomap':
            clf = Isomap(n_neighbors=5, n_components=n_components)
            clf.fit(X=X_train)
            X_train_projected = clf.transform(X=X_train)
            X_test_projected = clf.transform(X=X_test)
        elif embedding_method == 'MDS':
            # No out-of-sample transform is used here: train and test are
            # embedded together and split again afterwards.
            clf = MDS(n_components=n_components)
            X_projected = clf.fit_transform(X=np.vstack([X_train, X_test]))
            X_train_projected = X_projected[:X_train.shape[0], :]
            X_test_projected = X_projected[X_train.shape[0]:, :]
        elif embedding_method == 'PCA':
            clf = PCA(n_components=n_components)
            clf.fit(X=X_train)
            X_train_projected = clf.transform(X=X_train)
            X_test_projected = clf.transform(X=X_test)
        elif embedding_method == 'KernelPCA':
            clf = KernelPCA(n_components=n_components, kernel='rbf')
            clf.fit(X=X_train)
            X_train_projected = clf.transform(X=X_train)
            X_test_projected = clf.transform(X=X_test)
        elif embedding_method == 'LaplacianEigenmap':
            clf = LaplacianEigenmap(n_neighbors=5, n_components=n_components)
            X_projected = clf.fit_transform(X=np.vstack([X_train, X_test]))
            X_train_projected = X_projected[:X_train.shape[0], :]
            X_test_projected = X_projected[X_train.shape[0]:, :]
        elif embedding_method == 'LDA':
            clf = LDA(n_components=n_components)
            clf.fit(X=X_train, y=y_train)
            X_train_projected = clf.transform(X=X_train)
            X_test_projected = clf.transform(X=X_test)
        elif embedding_method == 'SPCA':
            clf = SPCA(n_components=n_components)
            clf.fit(X=X_train, y=y_train)
            X_train_projected = clf.transform(X=X_train)
            X_test_projected = clf.transform(X=X_test)
        elif embedding_method == 'TSNE':
            clf = TSNE(n_components=min(3, n_components))
            # print(type(list(y_train)))
            X_projected = clf.fit_transform(
                X=np.vstack([X_train, X_test]),
                y=np.asarray(list(y_train) + list(y_test)))
            X_train_projected = X_projected[:X_train.shape[0], :]
            X_test_projected = X_projected[X_train.shape[0]:, :]
        elif embedding_method == 'ML':
            clf = ML(n_components=n_components)
            clf.fit(X=X_train, y=y_train)
            X_train_projected = clf.transform(X=X_train)
            X_test_projected = clf.transform(X=X_test)
        elif embedding_method == 'Kernel_FLDA':
            clf = Kernel_FLDA(n_components=n_components, kernel='linear')
            clf.fit(X=X_train, y=y_train)
            X_train_projected = clf.transform(X=X_train)
            X_test_projected = clf.transform(X=X_test)
        elif embedding_method == 'No_embedding':
            X_train_projected = X_train
            X_test_projected = X_test
        else:
            # Fail here with a clear message instead of a NameError on
            # X_train_projected further down.
            raise ValueError('Unknown embedding method: ' + str(embedding_method))

    # --- classification:
    print('Classification...')
    # clf = KNN(n_neighbors=1)
    clf = NB()
    clf.fit(X=X_train_projected, y=y_train)
    y_pred = clf.predict(X=X_test_projected)
    accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
    error = 1 - accuracy

    # --- saving results:
    save_variable(accuracy, 'accuracy', path_to_save='./output/MNIST/')
    save_np_array_to_txt(np.asarray(accuracy), 'accuracy',
                         path_to_save='./output/MNIST/')
    save_variable(error, 'error', path_to_save='./output/MNIST/')
    save_np_array_to_txt(np.asarray(error), 'error',
                         path_to_save='./output/MNIST/')

    # --- report results:
    print(' ')
    print('Accuracy: ', accuracy * 100)
    print(' ')
    print('Error: ', error * 100)
# # Isomap # isomap = Isomap(n_neighbors=4, n_components=2) # isomap.fit(one_hot_data) # isomap_trans = isomap.transform(one_hot_data) # # # 可視化 # fig = plt.figure(figsize=(8,6)) # plt.scatter(isomap_trans[:, 0], isomap_trans[:, 1]) # plt.savefig("img/Isomap_Image/isomap_trans_" + str(data_num) + ".png") # # plt.show() # LocallyLinearEmbedding locally_linear_embedding = LocallyLinearEmbedding(n_neighbors=5, n_components=2) locally_linear_embedding.fit(one_hot_data) locally_linear_embedding_trans = locally_linear_embedding.transform( one_hot_data) # 可視化 fig = plt.figure(figsize=(8, 6)) plt.scatter(locally_linear_embedding_trans[:, 0], locally_linear_embedding_trans[:, 1]) plt.savefig( "img/LocallyLinearEmbedding_Image/locally_linear_embedding_trans_" + str(data_num) + ".png") # plt.show() # tSNE tSNE = TSNE(n_components=2, perplexity=30.0) tSNE_trans = tSNE.fit_transform(one_hot_data) # 可視化
class Cluster:
    """Locally Linear Embedding followed by a hand-written spectral clustering.

    The data is embedded in 2-D with ``LocallyLinearEmbedding``; a similarity
    graph is built on the embedding, a Laplacian-style matrix is
    eigen-decomposed and the spectral coordinates are clustered with k-means.
    Scores are collected in ``self.results`` and plots are written under
    ``logs/plots/``.
    """

    def __init__(self):
        """Initialise the pipeline state and the parameter grids."""
        self.lle = None  # LocallyLinearEmbedding, created and fitted lazily by LLE()
        self.n_clusters = None  # set by train()
        self.size = None  # number of training samples, set by train()
        self.iterations = None  # 1..size as floats, set by train()
        self.results = None  # DataFrame of scores, see setupResults()
        self.n_vectors = 5  # number of smallest eigenvalues considered
        self.affinities = ['rbf', 'nearest_neighbors']
        self.laplacians = ['custom', 'csgraph']
        self.eigvectors = [5, 15]
        self.clusters = [3, 5, 7, 8]
        #self.eigvectors = [5, 10, 15, 20]

    def train(self, x_train, y_train, multiple=False, binary=False):
        """Embed ``x_train`` with LLE and run the spectral clustering.

        With ``multiple=True`` every combination of affinity, Laplacian and
        eigenvector count (and, unless ``binary`` is True, cluster count) is
        evaluated; otherwise a single run is made. sklearn's
        SpectralClustering is run afterwards for comparison.

        ``y_train`` is modified in place: labels equal to -1 become 0.
        """
        self.n_clusters = 2
        self.size = len(x_train)
        # 1, 2, ..., size as floats (same values the original element loop built).
        self.iterations = np.arange(1, self.size + 1, dtype=np.float64)

        x_train = self.LLE(x_train)

        # Suffix used in plot titles and file names.
        self.filenale_ = 'multiclass'
        if binary is True:
            self.filenale_ = 'binary'
        self.visualize2D(x_train[:, 0], x_train[:, 1], c=y_train,
                         title='Training data ' + self.filenale_,
                         filename='logs/plots/training_data_' + self.filenale_)

        # Change y_train labels for binary
        for i in range(len(y_train)):
            if y_train[i] == -1:
                y_train[i] = 0

        # Run the custom spectral clustering
        if multiple is True:
            for affinity in self.affinities:
                for laplacian in self.laplacians:
                    for vector in self.eigvectors:
                        self.n_vectors = vector
                        if binary is True:
                            self.SpectralClustering(x_train, y_train,
                                                    affinity=affinity,
                                                    laplacian=laplacian)
                        else:
                            for n in self.clusters:
                                self.n_clusters = n
                                self.SpectralClustering(x_train, y_train,
                                                        affinity=affinity,
                                                        laplacian=laplacian)
        else:
            if binary is not True:
                self.n_clusters = 8
                self.n_vectors = 8
            self.SpectralClustering(x_train, y_train)

        # Run sklearn's spectral clustering for comparison
        if multiple is True:
            for affinity in self.affinities:
                sklearn_predicted = self.SklearnSP(x_train, affinity=affinity)
                title = 'SKLearn SpectralClustering Results for ' + self.filenale_ + ", " + 'affinity=' + affinity
                filename = 'logs/plots/' + affinity + '_sklearn_' + self.filenale_
                self.visualize2D(x_train[:, 0], x_train[:, 1],
                                 c=sklearn_predicted, title=title,
                                 filename=filename)
        else:
            sklearn_predicted = self.SklearnSP(x_train)
            # The loop variables `affinity` / `laplacian` do not exist on this
            # path (the original call raised NameError). SklearnSP ran with
            # its default 'rbf' affinity; the Laplacian column keeps
            # logResults' default.
            # NOTE(review): confirm which Laplacian label sklearn rows should carry.
            self.logResults(y_train, sklearn_predicted, sklearn=True,
                            affinity='rbf')
            title = 'SKLearn SpectralClustering Results for ' + self.filenale_ + ", " + 'affinity=rbf'
            filename = 'logs/plots/rbf_sklearn_' + self.filenale_
            self.visualize2D(x_train[:, 0], x_train[:, 1],
                             c=sklearn_predicted, title=title,
                             filename=filename)

    def SpectralClustering(self, x_train, y_train,
                           affinity='nearest_neighbors', laplacian='custom'):
        """Run one spectral-clustering configuration, log and plot the result.

        ``affinity`` is ``'nearest_neighbors'`` (default, uses ``NNGraph``)
        or anything else for the RBF ``SimilarityMatrix``. ``laplacian`` is
        ``'csgraph'`` for scipy's unnormalised Laplacian or anything else for
        ``LaplacianMatrix`` (the default, 'custom').
        """
        if affinity == 'nearest_neighbors':
            similarity_matrix = self.NNGraph(x_train)
        else:
            similarity_matrix = self.SimilarityMatrix(x_train)

        if laplacian == 'csgraph':
            laplacian_matrix = csgraph.laplacian(similarity_matrix,
                                                 normed=False)
        else:
            laplacian_matrix = self.LaplacianMatrix(
                similarity_matrix=similarity_matrix)

        transormed_data = self.transformDataToLaplacian(laplacian_matrix)

        # `precompute_distances` was removed from KMeans in scikit-learn 1.0;
        # omitting it keeps the old 'auto' behaviour on earlier releases.
        model = cluster.KMeans(n_clusters=self.n_clusters, random_state=0)
        predicted = model.fit(transormed_data).labels_

        self.logResults(y_train, predicted, affinity=affinity,
                        laplacian=laplacian)

        title = 'Custom SpectralClustering Results ' + self.filenale_ + ", " + 'affinity=' + affinity + ", laplacian=" + laplacian + ", vectors=" + str(self.n_vectors)
        filename = 'logs/plots/' + affinity + '_' + laplacian + "_" + str(self.n_vectors) + "_" + str(self.n_clusters) + '_custom_' + self.filenale_
        self.visualize2D(x_train[:, 0], x_train[:, 1], c=predicted,
                         title=title, filename=filename)

    def transformDataToLaplacian(self, laplacian_matrix):
        """Return the spectral coordinates of every sample.

        The eigenvectors belonging to the ``self.n_vectors`` smallest
        eigenvalues are selected and the first of them is discarded (its
        eigenvalue is close or equal to 0), giving ``self.n_vectors - 1``
        columns.
        """
        eigval, eigvec = np.linalg.eig(laplacian_matrix)
        order = np.argsort(eigval)
        # np.asscalar was removed in NumPy 1.23; plain fancy indexing selects
        # the same columns. Only the real part is kept, matching the float64
        # output array of the original implementation.
        selected = order[1:self.n_vectors]
        return np.real(eigvec[:, selected]).astype(np.float64)

    def LLE(self, data):
        """Project ``data`` to 2-D; the embedding is fitted on the first call only."""
        if self.lle is None:
            self.lle = LocallyLinearEmbedding(n_components=2)
            self.lle.fit(data)
        return self.lle.transform(data)

    def NNGraph(self, data, limit=0.4):
        """Return a dense radius-neighbours graph of ``data``.

        Points at most ``limit`` apart (Euclidean) are connected and the
        entry holds their distance; every other entry is zero.
        NOTE(review): the entries are distances, not similarities, yet they
        are used as affinity weights -- confirm this is intended.
        """
        graph = radius_neighbors_graph(data, limit, mode='distance',
                                       metric='minkowski', p=2,
                                       metric_params=None, include_self=False)
        return graph.toarray()

    def SimilarityMatrix(self, data, limit=0.4):
        """Return the pairwise RBF similarity matrix (sigma = 0.5, zero diagonal).

        ``limit`` is accepted for interface compatibility but is not used.
        """
        size = len(data)
        similarity_matrix = np.zeros((size, size), dtype=np.float64)
        # The kernel is symmetric, so each pair is evaluated once and mirrored.
        for i in range(size):
            for j in range(i + 1, size):
                value = self.rbf(data[i], data[j], 0.5)
                similarity_matrix[i][j] = value
                similarity_matrix[j][i] = value
        return similarity_matrix

    def LaplacianMatrix(self, similarity_matrix):
        """Return ``D^-1/2 . W . D^-1/2`` where D holds the column sums of W.

        Nodes with zero degree get a zero scaling factor instead of the
        inf/NaN the plain ``w ** -0.5`` produced.
        NOTE(review): this is the normalised affinity, not ``I`` minus it --
        confirm that taking its smallest eigenvalues is intended.
        """
        weights = np.asarray(similarity_matrix, dtype=np.float64)
        degrees = weights.sum(axis=0)
        inv_sqrt = np.zeros_like(degrees)
        connected = degrees > 0
        inv_sqrt[connected] = degrees[connected] ** -0.5
        scaling = np.diag(inv_sqrt)
        return scaling.dot(weights).dot(scaling)

    def SklearnSP(self, x_train, affinity='rbf'):
        """Return the labels of sklearn's SpectralClustering, for comparison."""
        model = cluster.SpectralClustering(n_clusters=self.n_clusters,
                                           affinity=affinity)
        # fit_predict already fits the model; the extra fit() call was redundant.
        return model.fit_predict(x_train)

    def rbf(self, a, b, sigma):
        """Return exp(-||a - b||^2 / sigma^2)."""
        distance = self.VectorLength(self.VectorSub(a, b))
        return math.exp(-math.pow(distance, 2) / math.pow(sigma, 2))

    def VectorLength(self, v):
        """Return the Euclidean length of vector ``v``."""
        return math.sqrt(sum(item * item for item in v))

    def VectorSub(self, a, b):
        """Return ``a - b`` as a float64 array, or None when the lengths differ."""
        if len(a) != len(b):
            return None
        return np.array([x - y for x, y in zip(a, b)], dtype=np.float64)

    def visualize2D(self, x, y, c=None, title='', filename=None):
        """Scatter-plot ``x`` against ``y``, optionally coloured by ``c``.

        The figure is written to ``filename + '.png'`` when ``filename`` is
        given, otherwise it is shown interactively.
        """
        fig, ax = plt.subplots(figsize=(13, 6))
        ax.set_title(title, fontsize=16)
        dot_size = 50

        if c is not None:
            # One scatter call is enough: the previous loop over
            # range(n_clusters - 1) re-plotted with empty colour slices and
            # drew nothing at all when n_clusters was 1.
            colours = c if self.size is None else c[:self.size]
            ax.scatter(x, y, c=colours, s=dot_size, cmap='viridis')
        else:
            ax.scatter(x, y, s=dot_size)

        if filename is not None:
            plt.savefig(filename + '.png')
            plt.clf()
            plt.close()
        else:
            plt.show()

    def logResults(self, y_test, prediction, sklearn=False, affinity='rbf',
                   laplacian='custom'):
        """Append macro precision/recall/F1 of ``prediction`` to ``self.results``.

        The results frame is created on demand when ``setupResults`` has not
        been called yet.
        """
        if sklearn is True:
            algorithm = 'SKLearn Spectral Clustering'
        else:
            algorithm = 'Custom Spectral Clustering'

        result = metrics.precision_recall_fscore_support(y_test, prediction,
                                                         average='macro')
        row = {
            'Algorithm': algorithm,
            'Affinity': affinity,
            'N_Vectors': str(self.n_vectors),
            'Laplacian': laplacian,
            'Precision': float("%0.3f" % result[0]),
            'Recall': float("%0.3f" % result[1]),
            'F1': float("%0.3f" % result[2])}

        if self.results is None:
            self.setupResults()
        # DataFrame.append was removed in pandas 2.0; build a one-row frame
        # with the same column order and concatenate instead.
        new_row = pd.DataFrame([row], columns=self.results.columns)
        if self.results.empty:
            self.results = new_row
        else:
            self.results = pd.concat([self.results, new_row],
                                     ignore_index=True)

    def setupResults(self):
        """Create the empty results DataFrame."""
        self.results = pd.DataFrame(columns=['Algorithm', 'Affinity',
                                             'Laplacian', 'N_Vectors',
                                             'Precision', 'Recall', 'F1'])
# _/_/_/_/_/_/_/ 次元削減のサンプル _/_/_/_/_/_/_/ # LLE(局所線形埋め込み) # この手法は、次元削減された空間へ写す際に局所的な近傍での距離を保つように射影する。 # データを小さい成分(観測点の近傍)に分割し、線形埋め込みとしてモデル化する。 from sklearn.manifold import LocallyLinearEmbedding lle = LocallyLinearEmbedding(n_neighbors=10, n_components=2, method='modified', random_state=2018, n_jobs=4) lle.fit(x_train.loc[0:5000, :]) x_train_lle = lle.transform(x_train) x_train_lle = pd.DataFrame(data=x_train_lle, index=range(0, len(x_train))) scatter_plot(x_train_lle, y_train, "LLE") # _/_/_/_/_/_/_/ 次元削減のサンプル _/_/_/_/_/_/_/ # t-SNE # この手法では、類似した点は近くなり、類似してない点は遠ざけるようにする。 # 個々の高次元の点を2次元3次元空間にモデル化することで、これを実現する。 # 実用の時は、他の次元削減手法を用いてからt-SNEを用いる。(特徴量ノイズを低減すことができ、高速に実行する) from sklearn.manifold import TSNE t_sne = TSNE(n_components=2,
clf = svm.SVC() clf.fit(Xtrain, Ytrain.ravel()) pre = clf.predict(X) elif sys.argv[3] == 'ranfor': from sklearn.ensemble import RandomForestClassifier clf = RandomForestClassifier(max_depth=50, random_state=0) clf.fit(Xtrain, Ytrain.ravel()) pre = clf.predict(X) elif sys.argv[3] == 'lle': from sklearn.manifold import LocallyLinearEmbedding lle = LocallyLinearEmbedding(n_neighbors=int(round(TRAINING_SAMPLE / 5)), n_components=50) lle.fit(Xtrain, Ytrain) Xtrain = lle.transform(Xtrain) X = lle.transform(X) from sklearn.ensemble import RandomForestClassifier clf = RandomForestClassifier(max_depth=50, random_state=0) clf.fit(Xtrain, Ytrain.ravel()) pre = clf.predict(X) correct = 0 wrong = 0 for x in range(len(pre)): if pre[x] == Y[x]: correct = correct + 1 else: wrong = wrong + 1
tr_sem = normalize(sem_data[train_index], norm='l2', axis=1, copy=True) te_sem = normalize(sem_data[test_index], norm='l2', axis=1, copy=True) tr_data, te_data = np.hstack((tr_vis, tr_sem)), np.hstack((te_vis, te_sem)) tr_labels, te_labels = labels[train_index][:, 0], labels[test_index][:, 0] clf = make_pipeline(StandardScaler(), SVC(gamma='auto', C=1.0, kernel='linear')) pca.fit(tr_data) clf.fit(pca.transform(tr_data), tr_labels) prediction = clf.predict(pca.transform(te_data)) print('PCA: %f' % balanced_accuracy_score(te_labels, prediction)) lle.fit(tr_data) clf.fit(lle.transform(tr_data), tr_labels) prediction = clf.predict(lle.transform(te_data)) print('LLE: %f' % balanced_accuracy_score(te_labels, prediction)) iso.fit(tr_data) clf.fit(iso.transform(tr_data), tr_labels) prediction = clf.predict(iso.transform(te_data)) print('ISO: %f' % balanced_accuracy_score(te_labels, prediction)) break elapsed = time.time() - init_time hours, rem = divmod(elapsed, 3600) minutes, seconds = divmod(rem, 60) time_elapsed = '{:0>2}:{:0>2}:{:05.2f}'.format(int(hours), int(minutes), seconds) print('Elapsed time is %s' % time_elapsed)