def otherScikitImpl(data, orig_dimension, new_dimension):
    """Project ``data`` (n_samples x orig_dimension) to ``new_dimension``
    using a Gaussian random matrix drawn by scikit-learn.

    Returns the projected samples with shape (n_samples, new_dimension).

    NOTE(review): ``_make_random_matrix`` is a private sklearn API and may
    change between versions — confirm the installed version supports it.
    """
    rp = GaussianRandomProjection(n_components=new_dimension)
    # Random matrix has shape (new_dimension, orig_dimension).
    # FIX: np.mat / the np.matrix class is deprecated in NumPy; use plain
    # ndarrays with the @ operator instead (same math, ndarray return type).
    m = np.asarray(rp._make_random_matrix(new_dimension, orig_dimension))
    # (new_dim, orig_dim) @ (orig_dim, n_samples) -> transpose back to rows.
    reduced = (m @ np.asarray(data).T).T
    return reduced
def run_RCA(X, y, title):
    """Evaluate Gaussian random projections over a range of component counts.

    For each dimensionality (2, 5, 8, ... plus the full feature count) the
    projection is repeated with 5 random seeds; the pairwise-distance
    correlation between projected and original data is recorded, and the
    mean/std across seeds are plotted on twin y-axes.

    NOTE(review): ``y`` is unused here — presumably kept for a uniform
    signature with sibling experiment functions.
    """
    dims = list(np.arange(2, (X.shape[1] - 1), 3))
    dims.append(X.shape[1])
    tmp = defaultdict(dict)
    # 5 restarts x all candidate dimensionalities
    for i, dim in product(range(5), dims):
        print('round', i)
        rp = GRP(random_state=i, n_components=dim)
        tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(X), X)
    tmp = pd.DataFrame(tmp).T  # rows: dims, columns: seeds
    mean_recon = tmp.mean(axis=1).tolist()
    std_recon = tmp.std(axis=1).tolist()
    fig, ax1 = plt.subplots()
    ax1.plot(dims, mean_recon, 'b-')
    ax1.set_xlabel('Random Components')
    # Make the y-axis label, ticks and tick labels match the line color.
    ax1.set_ylabel('Mean Reconstruction Correlation', color='b')
    ax1.tick_params('y', colors='b')
    plt.grid(False)
    ax2 = ax1.twinx()
    ax2.plot(dims, std_recon, 'm-')
    ax2.set_ylabel('STD Reconstruction Correlation', color='m')
    ax2.tick_params('y', colors='m')
    plt.grid(False)
    plt.title("Random Components for 5 Restarts: " + title)
    fig.tight_layout()
    plt.show()
def rp_analysis(self, X_train, X_test, y_train, y_test, data_set_name):
    """Plot mean per-feature kurtosis of repeated Gaussian random projections.

    Scales the training data with RobustScaler, applies a full-dimensional
    Gaussian random projection 1000 times (each repeat draws a fresh random
    matrix), averages the kurtosis of each projected feature across repeats,
    and saves a bar chart under ``self.out_dir``.

    NOTE(review): X_test, y_train and y_test are unused — presumably kept
    for a uniform signature with sibling *_analysis methods.
    """
    scl = RobustScaler()
    X_train_scl = scl.fit_transform(X_train)
    ks = []
    for i in range(1000):
        ##
        ## Random Projection
        ##
        rp = GaussianRandomProjection(n_components=X_train_scl.shape[1])
        rp.fit(X_train_scl)
        X_train_rp = rp.transform(X_train_scl)
        ks.append(kurtosis(X_train_rp))
    mean_k = np.mean(ks, 0)  # average kurtosis per projected feature
    ##
    ## Plots
    ##
    ph = plot_helper()
    title = 'Kurtosis (Randomized Projection) for ' + data_set_name
    name = data_set_name.lower() + '_rp_kurt'
    filename = './' + self.out_dir + '/' + name + '.png'
    ph.plot_simple_bar(np.arange(1, len(mean_k) + 1, 1),
                       mean_k,
                       np.arange(1, len(mean_k) + 1, 1).astype('str'),
                       'Feature Index',
                       'Kurtosis',
                       title,
                       filename)
def plot_data(method, X, y, title, filename):
    """Reduce X to two dimensions with the selected method ('pca', 'ica' or
    'rp') and save a per-label scatter plot under output/<filename>.

    Unknown methods leave X untouched (it is plotted as-is)."""
    fig, (ax1) = plt.subplots(1, 1)
    n_labels = len(y)
    reducers = {
        'pca': lambda: decomposition.PCA(n_components=2),
        'ica': lambda: decomposition.FastICA(n_components=2, whiten=True),
        'rp': lambda: GaussianRandomProjection(n_components=2),
    }
    if method in reducers:
        X = reducers[method]().fit_transform(X)
    # Fixed seed so label colors are reproducible across calls.
    np.random.seed(20)
    for label in np.unique(y):
        mask = y == label
        ax1.scatter(X[mask, 0], X[mask, 1],
                    color=np.random.rand(3), linewidths=1)
    ax1.set_title(title)
    ax1.grid()
    plt.tight_layout()
    plt.savefig('/'.join(['output', filename]))
    plt.close("all")
def test_output_transformer():
    """Forest estimators with an ``output_transformer``:

    * a transformer with ``random_state=None`` must be re-seeded per
      sub-estimator (all sub-estimator seeds distinct);
    * a ``FixedStateTransformer`` must force the same seed (0) on every
      sub-estimator.
    """
    X, y = datasets.make_multilabel_classification(return_indicator=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
    # Check that random_state are different
    transformer = GaussianRandomProjection(n_components=5, random_state=None)
    for name, ForestEstimator in FOREST_ESTIMATORS.items():
        est = ForestEstimator(random_state=5, output_transformer=transformer)
        est.fit(X_train, y_train)
        y_pred = est.predict(X_test)
        assert_equal(y_pred.shape, y_test.shape)
        random_state = [
            sub.output_transformer_.random_state for sub in est.estimators_
        ]
        # every sub-estimator got its own seed
        assert_equal(len(set(random_state)), est.n_estimators)
    # Check that random_state are equals
    transformer = FixedStateTransformer(
        GaussianRandomProjection(n_components=5), random_seed=0)
    for name, ForestEstimator in FOREST_ESTIMATORS.items():
        est = ForestEstimator(random_state=5, output_transformer=transformer)
        est.fit(X_train, y_train)
        y_pred = est.predict(X_test)
        assert_equal(y_pred.shape, y_test.shape)
        random_state = [
            sub.output_transformer_.random_state for sub in est.estimators_
        ]
        # all sub-estimators share the fixed seed 0
        assert_equal(len(set(random_state)), 1)
        assert_equal(random_state[0], 0)
def gaussian_random_projection(A, k):
    """Embed the rows of ``A`` into a k-dimensional space using sklearn's
    Gaussian random projection; returns the embedded array."""
    return GaussianRandomProjection(n_components=k).fit_transform(A)
def red_dim(X_tr, y_tr, *X_tests, meth, classif=True, nFeats=784, post_norm=False):
    """Reduce dimensionality of a training set plus any number of test sets.

    ``meth`` selects the reducer: 'UFS' (univariate feature selection),
    'RFE' (recursive feature elimination with a linear SVM/SVR) or 'GRP'
    (Gaussian random projection). ``classif`` switches between
    classification and regression scorers/estimators; ``nFeats`` is the
    target dimensionality; ``post_norm`` standardizes the reduced features.

    Returns (reduced X_tr, list of reduced test sets). If ``meth`` is not
    recognized, the inputs pass through unreduced (with a console warning
    and an empty test list).
    """
    X_tests_ = []
    if meth == 'UFS':
        # 1. UFS: keep the nFeats best features by univariate score
        score_func = f_classif if classif else f_regression
        ufs = SelectKBest(score_func=score_func, k=nFeats)
        X_tr = ufs.fit_transform(X_tr, y_tr)
        for X_te in X_tests:
            X_tests_.append(ufs.transform(X_te))
    elif meth == 'RFE':
        # 2. RFE: recursively drop 10% of remaining features per step
        estim = SVC(kernel="linear", C=1) if classif else SVR(kernel="linear")
        rfe = RFE(estim, n_features_to_select=nFeats, step=0.10)
        rfe = rfe.fit(X_tr, y_tr)
        X_tr = X_tr[:, rfe.support_]
        for X_te in X_tests:
            X_tests_.append(X_te[:, rfe.support_])
    elif meth == 'GRP':
        # 3. GRP: random projection (y is accepted but unused by fit)
        grp = GaussianRandomProjection(n_components=nFeats)
        X_tr = grp.fit_transform(X_tr, y_tr)
        for X_te in X_tests:
            X_tests_.append(grp.transform(X_te))
    else:
        print('Check Dim. Red. Method')
    if post_norm:
        logger.info("Applying post-normalization...")
        # scaler is fit on the (reduced) training data only
        ss = StandardScaler().fit(X_tr)
        X_tr = ss.transform(X_tr)
        for i in range(len(X_tests_)):
            X_tests_[i] = ss.transform(X_tests_[i])
    logger.info('{} X_train {} '.format(meth, X_tr.shape) +
                ' '.join(['X_test ({}) {}'.format(i, X_te.shape)
                          for i, X_te in enumerate(X_tests_)]))
    return X_tr, X_tests_
def project_points(points, dim=None):
    """Randomly project ``points`` down to ``dim`` dimensions (default 5)."""
    target = 5 if dim is None else dim
    # alternative heuristic once considered:
    # dim = min(max(int(np.log(len(points))), 2), 15)
    projector = GaussianRandomProjection(n_components=target)
    return projector.fit_transform(points)
def components(K):
    """For i = 1..K-1 components: randomly project the module-level X,
    cluster the projection with 2-means, score the cluster labels against
    the module-level y via ``matchfn``, and plot accuracy vs. component
    count.

    NOTE(review): relies on module-level X, y and matchfn — confirm they
    are defined before calling. Always returns None.
    """
    Sum_of_squared_distances = []
    k = []
    accuracy = []
    score = []
    for i in range(1, K):
        transformer = GaussianRandomProjection(n_components=i, eps=0.1)
        #transformer1 = GaussianRandomProjection(n_components=i,eps=0.5)
        #transformer2 = GaussianRandomProjection(n_components=i,eps=0.6)
        X_new = transformer.fit_transform(X)
        # cluster the projected data with 2-means (tight tolerance)
        km = KMeans(n_clusters=2, random_state=0, max_iter=10000,
                    tol=1e-9).fit(X_new)
        label = km.predict(X_new)
        accu = matchfn(y, label)
        #score_train1=metrics.silhouette_score(X_new,label, metric='euclidean')
        Sum_of_squared_distances.append(km.inertia_)
        k.append(i)
        accuracy.append(accu)
        #score.append(score_train1)
    k = np.array(k)
    Sum_of_squared_distances = np.array(Sum_of_squared_distances)
    score = np.array(score)
    accuracy = np.array(accuracy)
    #line1,=plt.plot(k, Sum_of_squared_distances, 'bx-',marker='o')
    #line2,=plt.plot(k,score,color='g',marker='o')
    line3, = plt.plot(k, accuracy, color='r', marker='o')
    plt.xlabel('k')
    plt.ylabel('accuracy')
    #plt.title('Elbow curve Optimal k')
    #plt.ylim(0,1)
    plt.show()
    return None
def rp(X, y, n_components='auto', eps=0.1, random_state=None, plot=1, dataset='german'):
    """Apply a Gaussian random projection to X; optionally scatter-plot the
    first two projected components colored by binary label and save the
    figure as "<dataset>-after-Random-Projection.png" ('german' or
    'australian' only).

    Returns the projected data X_new.
    """
    rp_model = GaussianRandomProjection(n_components=n_components, eps=eps,
                                        random_state=random_state)
    rp_model.fit(X)
    X_new = rp_model.transform(X)
    if plot:
        if dataset == 'german':
            plt.scatter(X_new[y == 1, 0], X_new[y == 1, 1], c='red',
                        label='Samples with label 1')
            plt.scatter(X_new[y == 0, 0], X_new[y == 0, 1], c='green',
                        label='Samples with label 0')
            plt.title("German dataset after Randomized Projection")
            plt.legend()
            plt.xlabel("Component 1")
            plt.ylabel("Component 2")
            plt.savefig("german-after-Random-Projection.png")
            plt.close()
        elif dataset == 'australian':
            plt.scatter(X_new[y == 1, 0], X_new[y == 1, 1], c='red',
                        label='Samples with label 1')
            plt.scatter(X_new[y == 0, 0], X_new[y == 0, 1], c='green',
                        label='Samples with label 0')
            plt.title("Australian dataset after Randomized Projection")
            plt.legend()
            plt.xlabel("Component 1")
            plt.ylabel("Component 2")
            plt.savefig("australian-after-Random-Projection.png")
            plt.close()
    return X_new
def save_new_data(dataset, n_components, iteration):
    """Randomly project a named dataset, plot the projection, save it to
    data/<dataset>-<n_components>-<iteration>rp.csv, and print the
    reconstruction MSE.
    """
    X, y = load_dataset(dataset)
    data = X
    rp = GaussianRandomProjection(n_components=n_components)
    rp.fit(data)
    matrix = rp.components_  # shape (n_components, n_features)
    new_data = rp.transform(data)
    plot_data('rp', new_data, y, dataset.title() + ': RP',
              filename='-'.join(['rp', dataset, str(iteration), 'data', 'trans']))
    results = np.array(new_data)
    np.savetxt('data/' + ('-'.join([dataset, str(n_components),
                                    str(iteration) + 'rp.csv'])),
               results, delimiter=",")
    # transpose-style back-projection: (n, k) @ (k, d) -> (n, d)
    new_data_inv = np.dot(new_data, matrix)
    loss = metrics.mean_squared_error(data, new_data_inv)
    # FIX: was the Python 2 statement `print loss` — a SyntaxError under
    # Python 3; use the function call form.
    print(loss)
def eps():
    """Sweep the JL ``eps`` parameter for a 4-component Gaussian random
    projection of the module-level X, run 2-means on each projection, and
    plot k-means inertia against eps.

    NOTE(review): with n_components fixed at 4, sklearn's eps should not
    change the projection itself — differences likely come from
    random-matrix variation across calls; confirm against the sklearn docs.
    Relies on module-level X. Always returns None.
    """
    Sum_of_squared_distances = []
    k = []
    score = []
    eps = [0.8, 0.6, 0.4, 0.2, 0.05, 0.01]
    for i in eps:
        transformer = GaussianRandomProjection(n_components=4, eps=i)
        X_new = transformer.fit_transform(X)
        km = KMeans(n_clusters=2, random_state=0, max_iter=10000,
                    tol=1e-9).fit(X_new)
        #score_train1=metrics.silhouette_score(X_new,label, metric='euclidean')
        Sum_of_squared_distances.append(km.inertia_)
        k.append(i)
    print(Sum_of_squared_distances)
    k = np.array(k)
    Sum_of_squared_distances = np.array(Sum_of_squared_distances)
    score = np.array(score)
    line1, = plt.plot(k, Sum_of_squared_distances, 'bx-', marker='o')
    plt.xlabel('k')
    plt.ylabel('Sum_of_squared_distances')
    plt.title('Elbow curve Optimal eps')
    plt.show()
    return None
def __init__(self, nComp):
    """Initialize five parallel dimensionality reducers (PCA, truncated SVD,
    FastICA, Gaussian and sparse random projections), each producing
    ``nComp`` components, all with a fixed random_state for
    reproducibility."""
    self._N_COMP = nComp  # shared output dimensionality
    self._pca = PCA(n_components=self._N_COMP, random_state=17)
    self._tsvd = TruncatedSVD(n_components=self._N_COMP, random_state=17)
    self._ica = FastICA(n_components=self._N_COMP, random_state=17)
    self._grp = GaussianRandomProjection(n_components=self._N_COMP, eps=0.1,
                                         random_state=17)
    # dense_output so downstream code gets an ndarray, not sparse
    self._srp = SparseRandomProjection(n_components=self._N_COMP,
                                       dense_output=True, random_state=17)
def rp_analysis(self, X_train, X_test, y_train, y_test, data_set_name):
    """Plot mean per-feature kurtosis of repeated Gaussian random projections.

    NOTE(review): duplicate of an identical rp_analysis defined earlier in
    this file — consider deduplicating.

    Scales the training data with RobustScaler, applies a full-dimensional
    Gaussian random projection 1000 times, averages per-feature kurtosis
    across repeats, and saves a bar chart under ``self.out_dir``.
    X_test, y_train and y_test are unused.
    """
    scl = RobustScaler()
    X_train_scl = scl.fit_transform(X_train)
    ks = []
    for i in range(1000):
        ##
        ## Random Projection
        ##
        rp = GaussianRandomProjection(n_components=X_train_scl.shape[1])
        rp.fit(X_train_scl)
        X_train_rp = rp.transform(X_train_scl)
        ks.append(kurtosis(X_train_rp))
    mean_k = np.mean(ks, 0)  # average kurtosis per projected feature
    ##
    ## Plots
    ##
    ph = plot_helper()
    title = 'Kurtosis (Randomized Projection) for ' + data_set_name
    name = data_set_name.lower() + '_rp_kurt'
    filename = './' + self.out_dir + '/' + name + '.png'
    ph.plot_simple_bar(np.arange(1, len(mean_k)+1, 1),
                       mean_k,
                       np.arange(1, len(mean_k)+1, 1).astype('str'),
                       'Feature Index',
                       'Kurtosis',
                       title,
                       filename)
def eval_RP(X_train, dims):
    """Evaluate Gaussian random projections for each dimensionality in
    ``dims`` over 30 random seeds.

    Records pairwise-distance correlation ('pdc') and reconstruction error
    ('rec') per restart, prints per-dimension summaries, and returns
    (tmp, tmp_sum): the raw per-seed values and per-dimension
    (d, mean pdc, std pdc, mean rec) tuples.
    """
    tmp = defaultdict(dict)
    for d in dims:
        tmp[d]['pdc'] = []
        tmp[d]['rec'] = []
        # 30 restarts per dimensionality: RP quality varies with the seed
        for i in range(30):
            rp = GaussianRandomProjection(random_state=i, n_components=d)
            trans = rp.fit_transform(X_train)
            pdc = pairwiseDistCorr(trans, X_train)
            rec_err = reconstructionError(rp, X_train)
            tmp[d]['pdc'].append(round(pdc, 4))
            tmp[d]['rec'].append(round(rec_err, 4))
    # FIX: removed a stray `pd.DataFrame(tmp).T` whose result was discarded,
    # and the dead `pdc = 0` / `rec_err = 0` initializations.
    tmp_sum = defaultdict(dict)
    for d in dims:
        print(d, round(mean(tmp[d]['pdc']), 3), round(np.std(tmp[d]['pdc']), 3),
              round(mean(tmp[d]['rec']), 3))
        tmp_sum[d] = (d, round(mean(tmp[d]['pdc']), 3),
                      round(np.std(tmp[d]['pdc']), 3),
                      round(mean(tmp[d]['rec']), 3))
    return tmp, tmp_sum
def best_rp_nba(self):
    """Project the NBA data with a full-dimensional Gaussian random
    projection, keep the 2 components with the highest TRAINING kurtosis,
    and save the reduced train/test splits plus labels to ``self.save_dir``.
    """
    dh = data_helper()
    X_train, X_test, y_train, y_test = dh.get_nba_data()
    scl = RobustScaler()
    X_train_scl = scl.fit_transform(X_train)
    X_test_scl = scl.transform(X_test)
    rp = GaussianRandomProjection(n_components=X_train_scl.shape[1])
    X_train_transformed = rp.fit_transform(X_train_scl, y_train)
    X_test_transformed = rp.transform(X_test_scl)
    ## top 2 by training kurtosis.
    # FIX: the original re-ranked the test set by its *own* kurtosis, which
    # could pick different components for train and test; use the training
    # ordering for both so the two splits share the same features.
    kurt = kurtosis(X_train_transformed)
    i = kurt.argsort()[::-1]
    X_train_transformed = X_train_transformed[:, i][:, 0:2]
    X_test_transformed = X_test_transformed[:, i][:, 0:2]
    # save
    filename = './' + self.save_dir + '/nba_rp_x_train.txt'
    pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)
    filename = './' + self.save_dir + '/nba_rp_x_test.txt'
    pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)
    filename = './' + self.save_dir + '/nba_rp_y_train.txt'
    pd.DataFrame(y_train).to_csv(filename, header=False, index=False)
    filename = './' + self.save_dir + '/nba_rp_y_test.txt'
    pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
def dRuns(x_data, n):
    """Pick a component count for the reducer named by ``n``.

    For 'PCA'/'SVD': the smallest component count explaining >= 80% of the
    variance. For 'ICA'/'Random' (Gaussian random projection): the count
    (starting at 2) whose projection maximizes mean kurtosis across
    components.

    Returns the chosen component count.
    """
    k1 = []
    if n == 'PCA' or n == 'SVD':
        for z in range(2, np.shape(x_data)[1] + 1):
            if n == 'PCA':
                pca = PCA(n_components=z)
                newData = pca.fit_transform(x_data)
                varRatio = pca.explained_variance_ratio_
            elif n == 'SVD':
                svd = TruncatedSVD(n_components=z)
                newData = svd.fit_transform(x_data)
                varRatio = svd.explained_variance_ratio_
            # stop at the first z reaching the 80% variance target
            if np.sum(varRatio) >= 0.80:
                return z
    if n == 'ICA' or n == 'Random':
        for z in range(2, np.shape(x_data)[1] + 1):
            if n == 'ICA':
                ica = FastICA(n_components=z)
                newData = ica.fit_transform(x_data)
                newData = pd.DataFrame(newData)
                k1.append(np.mean(newData.kurt()))
            else:
                randProjection = GaussianRandomProjection(n_components=z)
                newData = randProjection.fit_transform(x_data)
                newData = pd.DataFrame(newData)
                k1.append(np.mean(newData.kurt()))
        # k1[0] corresponds to z == 2, hence the +2 offset
        return np.argmax(k1) + 2
class Loda:
    """LODA-style anomaly scorer: Gaussian random projections plus one
    density histogram per projected dimension; the per-dimension anomaly
    score is the negative log density of the bin a value falls in."""

    def __init__(self, projections=50, bins=10):
        # number of projected dimensions; None disables projection entirely
        self.k = projections
        # histogram bin count per projected dimension
        self.bins = bins
        self.rprog, self.histograms = None, None

    @staticmethod
    def get_bin_density(v, histogram):
        """Return (bin_index, density) of value ``v`` in a numpy histogram
        tuple (hist, bin_edges). Values above every edge fall in the last
        bin; values at or below the first edge land in bin 0."""
        hist, bin_edges = histogram
        for i, be in enumerate(bin_edges):
            if v <= be:
                break
        # step back from the right edge to the bin index, clamped at 0
        i = max(i - 1, 0)
        return i, hist[i]

    def fit(self, X):
        """Fit the random projection (when enabled) and one normalized
        histogram per projected column. Returns self."""
        if self.k is not None:
            self.rprog = GaussianRandomProjection(n_components=self.k).fit(X)
        XX = self.rprog.transform(X) if self.rprog is not None else X
        self.histograms = [
            np.histogram(XX[:, j], bins=self.bins, density=True)
            for j in range(XX.shape[1])
        ]
        return self

    def transform(self, X):
        """Return per-sample, per-dimension anomaly scores (-log density).

        NOTE(review): a value landing in an empty bin yields -log(0) = inf;
        confirm downstream code tolerates infinities.
        """
        XX = self.rprog.transform(X) if self.rprog is not None else X
        anomaly_vect = lambda xx: [
            -np.log(self.get_bin_density(xx_j, histo)[1])
            for (xx_j, histo) in zip(xx, self.histograms)
        ]
        return np.array([anomaly_vect(xx) for xx in XX])
def plot_rca_curve(data):
    """Plot mean reconstruction error of Gaussian random projections against
    the number of components (1..11), averaged over 5 fixed random states;
    also prints the average std of the error across restarts.
    """
    scaler = StandardScaler()
    scaler.fit(data)
    x_train_scaler = scaler.transform(data)
    # reconstruction error by components
    recon_errs = []
    sizes = range(1, 12)
    stds = []
    for size in sizes:
        losses = []
        for state in [5, 30, 50, 200, 0]:
            rca = GaussianRandomProjection(n_components=size, random_state=state)
            transformed_data = rca.fit_transform(x_train_scaler)
            # reconstruct via the pseudo-inverse of the projection matrix
            inverse_data = np.linalg.pinv(rca.components_.T)
            reconstructed_data = transformed_data.dot(inverse_data)
            loss = ((x_train_scaler - reconstructed_data) ** 2).mean()
            losses.append(loss)
        # FIX: append one value per size (mean over restarts). The original
        # appended inside the restart loop, producing 5*len(sizes) values and
        # crashing plt.plot(sizes, recon_errs) on a length mismatch.
        recon_errs.append(np.mean(losses))
        stds.append(np.std(losses))
    print(f"rca avg std: {np.mean(stds)}")
    plt.figure()
    plt.title('recon error by Number of Components')
    plt.ylabel('recon error')
    plt.xlabel('Components')
    plt.plot(sizes, recon_errs)
def run_k_means_on_random_projections_cardiovascular_data(path):
    """Project the cardiovascular training data onto 5 Gaussian random
    components and benchmark k-means for k = 1..15, writing the stats to
    cardiovascular_random_projections_stats.txt."""
    data_set = 'cardio'
    x_train, y_train = load_data(path + 'data/' + data_set + '/train/')
    # X, y = load_data(path + 'data/' + data_set + '/train/')
    projector = GaussianRandomProjection(n_components=5)
    projected = projector.fit_transform(x_train)
    with open("cardiovascular_random_projections_stats.txt", "w+") as f:
        for k in range(1, 16):
            bench_k_means(str(k), projected, y_train, k, f, 1)
def PerformRandomProjections(X, Y, num_components, random_state):
    """
    For each num_components, random_state number of times random projection
    is done and that projection is kept that gives minimum reconstruction
    error (reconstruction via the pseudo-inverse of the projection matrix).

    Returns a dict keyed "rp_<n>_data", "rp_<n>_reconstruction_errors_all"
    and "rp_<n>_reconstruction_error" for each n.

    NOTE(review): ``Y`` is unused — presumably kept for a uniform signature
    with sibling reduction routines.
    """
    result = {}
    recons_errs = []
    for n in num_components:
        prefix = "rp_" + str(n) + "_"
        best_grp = None
        best_reconstruction_error = np.Infinity;
        reconstruction_errors = []
        # seeds 1..random_state, one projection per seed
        for i in np.arange(random_state) + 1:
            grp = GaussianRandomProjection(n, random_state=i)
            grp.fit(X)
            _x = grp.transform(X)
            # components_: (n, d); pinv: (d, n);
            # (d, n) @ (n, samples) -> (d, samples) -> transpose to (samples, d)
            p_inv = np.linalg.pinv(grp.components_)
            X_recons = np.dot(p_inv, _x.T).T
            recons_err = ComputeReconstructionSSE(X, X_recons)
            reconstruction_errors.append(recons_err)
            #print(r"n = {0} i ={1} error = {2}".format(n,i,recons_err))
            if(best_grp is None or best_reconstruction_error > recons_err):
                best_grp = grp
                best_reconstruction_error = recons_err
        result[prefix+"data"] = best_grp.transform(X)
        result[prefix+"reconstruction_errors_all"] = reconstruction_errors
        result[prefix+"reconstruction_error"] = best_reconstruction_error
    return result
def dimensionality_reduction():
    """Run PCA, ICA, Gaussian random projection and variance-threshold
    reduction on the module-level training data, produce diagnostic plots,
    then feed every reduced dataset into the clustering and ANN experiments.

    NOTE(review): relies on module-level x_train, min_max_scaler,
    features_data, plot_name and the base_experiment helpers — confirm they
    are in scope at call time.
    """
    # component counts chosen from earlier experiments
    ica_best_components = 5
    pca_best_components = 6
    rp_chosen_components = 3
    variance_threshold = 0.02
    pca = PCA(n_components=pca_best_components)
    pca_x_train = pca.fit_transform(x_train)
    base_experiment.plot_eigen_values("{}-{}".format(plot_name, "PCA"),
                                      pca.explained_variance_)
    base_experiment.plot_points_3d("{}-{}".format(plot_name, "PCA"), pca_x_train)
    ica = FastICA(n_components=ica_best_components)
    ica_x_train = ica.fit_transform(x_train)
    base_experiment.plot_points_3d("{}-{}".format(plot_name, "ICA"), ica_x_train)
    rp = GaussianRandomProjection(n_components=rp_chosen_components)
    rp_x_train = rp.fit_transform(x_train)
    base_experiment.plot_points_3d(
        "{}-{}".format(plot_name, "Random Projection"), rp_x_train)
    # variance-threshold selection runs on min-max-scaled raw features,
    # then the survivors are standardized
    variance_x_train = VarianceThreshold(
        threshold=variance_threshold).fit_transform(
            min_max_scaler.transform(features_data))
    variance_x_train = preprocessing.scale(variance_x_train)
    find_best_k_for_reduced_features(ica_x_train, pca_x_train, rp_x_train,
                                     variance_x_train)
    clustering_after_reduction(pca_x_train, ica_x_train, rp_x_train,
                               variance_x_train)
    run_ann_with_only_dimensionality_reduction(pca_x_train, ica_x_train,
                                               rp_x_train, variance_x_train)
def rp(name, x, y):
    """For six random seeds, auto-size a Gaussian random projection of x
    (eps=0.95) and scatter-plot the two projected dimensions with the
    highest standard deviation, colored by y, in a 2x3 subplot grid.

    NOTE(review): ``name`` is unused — presumably kept for a uniform
    signature with sibling plotting functions.
    """
    plot.style.use('seaborn-darkgrid')
    for i in range(6):
        rp = GaussianRandomProjection(eps=0.95, random_state=i)
        transformed = rp.fit_transform(x)
        axes = [0, 0]
        axes_std = [0, 0]
        # track the two highest-variance projected dimensions
        for axis in range(np.shape(transformed)[1]):
            std = np.std(transformed[:, axis])
            if std > axes_std[0]:
                # FIX: demote the previous best into second place before
                # replacing it; the original overwrote slot 0 and lost it,
                # so slot 1 could hold a non-runner-up dimension.
                axes[1], axes_std[1] = axes[0], axes_std[0]
                axes[0], axes_std[0] = axis, std
            elif std > axes_std[1]:
                axes[1], axes_std[1] = axis, std
        plot.subplot(2, 3, i + 1)
        plot.title(f'Random seed = {i}')
        plot.xlabel(f'Dimension {axes[0]}')
        plot.ylabel(f'Dimension {axes[1]}')
        plot.scatter(transformed[:, axes[0]], transformed[:, axes[1]],
                     c=y, cmap='viridis')
    plot.show()
def rand_proj_reconstruction_error(train_x, n):
    '''Average reconstruction MSE of Gaussian random projections over 10
    random restarts, for component counts 1, 11, 21, ... below ``n``.

    Returns a list of {"n_components": i, "reconstruction_error": mse}.
    '''
    results = []
    for i in range(1, n, 10):
        # FIX: accumulate across the 10 restarts; the original reset
        # `error = 0` inside the inner loop, so `error / 10` reported a
        # tenth of a single restart's error.
        error = 0
        for j in range(1, 11):
            # FIX: project to `i` components (the value being swept); the
            # original used `n`, so every data point measured the same
            # projection size.
            rand_proj = GaussianRandomProjection(n_components=i)
            reduced_df = rand_proj.fit_transform(train_x)
            # reconstruct via the pseudo-inverse of the projection matrix
            psuedo_inverse = np.linalg.pinv(rand_proj.components_.T)
            reconstructed = reduced_df.dot(psuedo_inverse)
            error += metrics.mean_squared_error(train_x, reconstructed)
        results.append({"n_components": i, "reconstruction_error": error / 10})
    return results
class Coder(object):
    """Encode vectors into discrete sketch codes: standardize, Gaussian
    random-project to 16 dimensions per sketch, binarize, pack bits into
    uint16 values and fold them into ``sketch_dim`` buckets.

    NOTE(review): ``init_biases`` and ``discretize`` are referenced below
    but not defined in this excerpt — confirm they exist on the class.
    """

    def __init__(self, n_sketches, sketch_dim):
        self.n_sketches = n_sketches    # number of independent sketches
        self.sketch_dim = sketch_dim    # bucket count per sketch
        self.ss = StandardScaler()
        # 16 projected dims (-> 16 bits -> one uint16) per sketch
        self.sp = GaussianRandomProjection(n_components=16 * n_sketches)

    def fit(self, v):
        """Fit the scaler and projection on ``v``, then initialize the
        discretization biases from the projected training data."""
        self.ss = self.ss.fit(v)
        vv = self.ss.transform(v)
        self.sp = self.sp.fit(vv)
        vvv = self.sp.transform(vv)
        self.init_biases(vvv)

    def transform(self, v):
        """Map each vector to per-sketch code indices in [0, sketch_dim)."""
        v = self.ss.transform(v)
        v = self.sp.transform(v)
        v = self.discretize(v)           # 0/1 bit per projected dimension
        v = np.packbits(v, axis=-1)      # 8 bits -> one uint8
        # reinterpret uint8 pairs as uint16, then fold into buckets
        v = np.frombuffer(np.ascontiguousarray(v), dtype=np.uint16).reshape(
            v.shape[0], -1) % self.sketch_dim
        return v

    def transform_to_absolute_codes(self, v, labels=None):
        """Offset each sketch's codes by sketch_index * sketch_dim so codes
        from different sketches never collide; ``labels`` is unused."""
        codes = self.transform(v)
        pos_index = np.array(
            [i * self.sketch_dim for i in range(self.n_sketches)],
            dtype=np.int_)
        index = codes + pos_index
        return index
def transform(data, alg):
    """Apply the selected reduction ('pca', 'ica', 'rp' or 'vtresh') to the
    feature columns of ``data`` (last column is the label) and return the
    reduced features re-joined with the label column. Unknown ``alg``
    values leave the features unchanged."""
    arr = np.array(data)
    features = arr[:, 0:-1]
    labels = arr[:, -1]
    if alg == 'pca':
        reducer = PCA(n_components=6, whiten=True)
        features = reducer.fit(features).transform(features)
        print(reducer.components_)
        print(reducer.explained_variance_ratio_)
    if alg == 'ica':
        # kurtosis change relative to the raw features
        base_kurt = sum(kurtosis(features))
        reducer = FastICA(n_components=3, whiten=False, algorithm="parallel")
        reducer = reducer.fit(features)
        features = reducer.transform(features)
        print("kurtosis: ", sum(kurtosis(features)) - base_kurt)
    if alg == 'rp':
        reducer = GaussianRandomProjection(n_components=1)
        reducer = reducer.fit(features)
        features = reducer.transform(features)
        print(reducer.components_)
    if alg == 'vtresh':
        selector = VarianceThreshold(threshold=.04)
        features = selector.fit_transform(features)
        print(selector.variances_)
    return np.column_stack((features, labels))
def comp1(K):
    """For i = 1..K-1: randomly project the module-level X (10 components,
    eps=0.6), train an MLP on an 80/20 split, and plot train/test accuracy
    per iteration.

    NOTE(review): the loop variable i is not fed into n_components, so every
    iteration differs only by the random projection/split — confirm whether
    n_components=i was intended. Relies on module-level X and y.
    """
    k = []
    accuracy_train = []
    accuracy_test = []
    for i in range(1, K):
        print(i)
        agglo = GaussianRandomProjection(n_components=10, eps=0.6)
        agglo.fit(X)
        X_reduced = agglo.transform(X)
        X_train, X_test, y_train, y_test = train_test_split(
            X_reduced, y, test_size=0.20)
        km = MLPClassifier(solver='lbfgs', alpha=1e-5,
                           hidden_layer_sizes=[8, 8, 8, 8, 8], random_state=1)
        km.fit(X_train, y_train)
        # FIX: removed a second `km.fit(X_test, y_test)` — it retrained the
        # classifier on the held-out data (leakage) and discarded the model
        # fitted on the training split.
        # FIX: train/test scores were swapped (train accuracy was computed
        # on X_test and vice versa).
        accu_train = km.score(X_train, y_train)
        accu_test = km.score(X_test, y_test)
        k.append(i)
        accuracy_train.append(accu_train)
        accuracy_test.append(accu_test)
    k = np.array(k)
    accuracy_train = np.array(accuracy_train)
    accuracy_test = np.asarray(accuracy_test)
    line3, = plt.plot(k, accuracy_train, color='r', marker='o',
                      label='train_accuracy')
    line4, = plt.plot(k, accuracy_test, color='g', marker='o',
                      label='test_accuracy')
    plt.xlabel('k')
    plt.legend()
    plt.ylabel('accuracy')
    plt.show()
    return None
def rand_proj(train_x, n):
    '''Reduce ``train_x`` to ``n`` dimensions with a Gaussian random
    projection and return the projected data.'''
    return GaussianRandomProjection(n_components=n).fit_transform(train_x)
def RP_exp(X, y, title):
    """For every component count, repeat the Gaussian random projection 5
    times and plot the mean/stdev of elementwise differences between all
    pairs of repeats (a stability measure).

    NOTE: ``y`` is unused; kept for a uniform experiment signature.
    """
    ncomp = [i + 1 for i in range(X.shape[1] - 1)]
    stdev = []
    mean = []
    for n in ncomp:
        repeats = [GaussianRandomProjection(n_components=n).fit_transform(X)
                   for _ in range(5)]
        # all unordered pairs of the 5 repeats
        diffs = []
        for a in range(5):
            for b in range(a + 1, 5):
                diffs.append(repeats[a] - repeats[b])
        stdev.append(np.std(diffs))
        mean.append(np.mean(diffs))
    comp_arr = np.array(ncomp)
    mean_arr = np.array(mean)
    stdev_arr = np.array(stdev)
    plt.fill_between(comp_arr, mean_arr - stdev_arr, mean_arr + stdev_arr,
                     alpha=0.1, color="b", label="Stdev")
    plt.plot(ncomp, mean, 'o-', color="b", label="Mean")
    plt.title("Mean pairwise difference of RP: " + title)
    plt.legend(loc='best')
    plt.xlabel("n_components")
    plt.ylabel("Pairwise difference")
    plt.savefig("RP " + title)
    plt.show()
def randproj(tx, ty, rx, ry):
    """Fit a random projection on the training data, apply it to both the
    train and test sets, and run the EM, k-means and neural-net experiments
    on the projected data."""
    reducer = RandomProjection(tx[1].size)
    reducer.fit(tx, y=ty)
    projected_train = reducer.transform(tx)
    projected_test = reducer.transform(rx)
    em(projected_train, ty, projected_test, ry, add="wRPtr", times=10)
    km(projected_train, ty, projected_test, ry, add="wRPtr", times=10)
    nn(projected_train, ty, projected_test, ry, add="wRPtr")
def otherScikitImpls(data, orig_dimension=None, new_dimension=None):
    """Project ``data`` with a Gaussian random matrix drawn by sklearn.

    FIX: the original body referenced ``new_dimension`` and
    ``orig_dimension`` without defining them, so any call raised NameError.
    They are now optional parameters: ``orig_dimension`` defaults to the
    data's feature count, ``new_dimension`` to half of it (at least 1).
    Returns the projected samples (n_samples, new_dimension).

    NOTE(review): ``_make_random_matrix`` is a private sklearn API.
    """
    data = np.asarray(data)
    if orig_dimension is None:
        orig_dimension = data.shape[1]
    if new_dimension is None:
        new_dimension = max(1, orig_dimension // 2)
    rp = GaussianRandomProjection(n_components=new_dimension)
    m = np.asarray(rp._make_random_matrix(new_dimension, orig_dimension))
    # (new_dim, orig_dim) @ (orig_dim, n_samples) -> transpose back to rows
    reduced = (m @ data.T).T
    return reduced
def randproj(tx, ty, rx, ry):
    """Project train and test sets into the SAME random subspace and run the
    neural-net experiment on the result."""
    compressor = RandomProjection(tx[1].size)
    newtx = compressor.fit_transform(tx)
    # FIX: the original fitted a *second* RandomProjection on rx, so train
    # and test were embedded with different random matrices and the learned
    # model was evaluated in the wrong space; reuse the training projection.
    newrx = compressor.transform(rx)
    #em(newtx, ty, newrx, ry, add="wRPtr", times=10)
    #km(newtx, ty, newrx, ry, add="wRPtr", times=10)
    nn(newtx, ty, newrx, ry, add="wRP")
def run_grp(n_c, X_train, X_test, y_train, y_test):
    """Fit a Gaussian random projection with ``n_c`` components on the
    training set, apply it to both splits, and return them as
    [X_train_projected, X_test_projected].

    NOTE: y_train is accepted by fit_transform but unused by the projection;
    y_test is unused (uniform signature).
    """
    from sklearn.random_projection import GaussianRandomProjection
    projector = GaussianRandomProjection(n_components=n_c, eps=0.1)
    train_projected = projector.fit_transform(X_train, y_train)
    test_projected = projector.transform(X_test)
    print("grp components: ", projector.n_components_)
    return [train_projected, test_projected]
def randproj(tx, ty, rx, ry):
    """Project train and test sets into the SAME random subspace, then run
    the EM, k-means and neural-net experiments on the projected data."""
    compressor = RandomProjection(tx[1].size)
    newtx = compressor.fit_transform(tx)
    # FIX: the original fitted a *second* RandomProjection on rx, embedding
    # train and test with different random matrices; reuse the training
    # projection so both splits live in the same subspace.
    newrx = compressor.transform(rx)
    em(newtx, ty, newrx, ry, add="wRPtr", times=10)
    km(newtx, ty, newrx, ry, add="wRPtr", times=10)
    nn(newtx, ty, newrx, ry, add="wRPtr")
def randproj(tx, ty, rx, ry):
    """Fit a random projection on the training data, apply it to train and
    test sets, and run the EM, k-means and neural-net experiments."""
    # FIX: Python 2 print statements parenthesized so the module also parses
    # under Python 3 (print("x") behaves identically in both).
    print("randproj")
    compressor = RandomProjection(tx[1].size)
    compressor.fit(tx, y=ty)
    newtx = compressor.transform(tx)
    # compressor = RandomProjection(tx[1].size)
    newrx = compressor.transform(rx)
    em(newtx, ty, newrx, ry, add="wRPtr")
    km(newtx, ty, newrx, ry, add="wRPtr")
    nn(newtx, ty, newrx, ry, add="wRPtr")
    print("randproj done")
def __init__(self, dataset, dataset_name, num_components=10):
    """Store the dataset, scale its features to [0, 1], and prepare a
    Gaussian random projection reducer with ``num_components`` outputs.

    ``dataset`` is expected to expose ``.data`` (features) and ``.target``
    (labels), as sklearn Bunch objects do.
    """
    self.dataset = dataset
    self.dataset_name = dataset_name
    self.labels = dataset.target
    self.scaler = MinMaxScaler()
    # scaled feature matrix; scaler is kept for inverse/extra transforms
    self.data = self.scaler.fit_transform(dataset.data)
    self.n_samples, self.n_features = self.data.shape
    self.reducer = GaussianRandomProjection(n_components=num_components)
def find_best_rp(train_data, test_data, start_components, max_features):
    """Grid-search (component counts x 500 seeds) for the Gaussian random
    projection with the lowest reconstruction error on the centered test
    data, then report the error spread across seeds for the best component
    count.

    Returns (best_c, best_rs): best component count and random seed.
    """
    best_c = 0
    best_rs = 0
    best_r = 20000
    # go to max - 1 since it doesn't make sense to randomly project to the same dimension
    r = range(start_components, max_features)
    print(r)
    # center data for reconstruction (mean added back after back-projection)
    scalar = StandardScaler(with_mean=True, with_std=False)
    centered = scalar.fit_transform(test_data)
    for c in r:
        print('C=%d' % c)
        for rs in range(1, 501):
            rp = GaussianRandomProjection(n_components=c,
                                          random_state=rs).fit(train_data)
            fit = rp.transform(centered)
            recon = extmath.safe_sparse_dot(fit, rp.components_) + scalar.mean_
            err = linalg.norm(test_data - recon)
            if err < best_r:
                best_r = err
                best_c = c
                best_rs = rs
    print('best reconstruction error=%.4f' % best_r)
    print('>>best rs=%d,c=%d' % (best_rs, best_c))
    # for the best component count, track the variation across seeds.
    # FIX: the original reused the loop variable `c` here — after the search
    # loop it is stuck at max_features - 1, not the best count; use best_c.
    v_max = 0
    errsum = 0
    for rs in range(1, 501):
        rp = GaussianRandomProjection(n_components=best_c,
                                      random_state=rs).fit(train_data)
        fit = rp.transform(centered)
        recon = extmath.safe_sparse_dot(fit, rp.components_) + scalar.mean_
        err = linalg.norm(test_data - recon)
        errsum += err
        if err > v_max:
            v_max = err
    print('RP max:%.3f, avg:%.3f' % (v_max, errsum / 500))
    return best_c, best_rs
def fit(self, X, y, sample_weight=None):
    """Fit the classifier: Gaussian-random-project X down to
    ``self.n_components`` dimensions, then fit the 1-D KNN classifier on
    the projection. Hard-codes binary classes [0, 1]. Returns self.
    """
    self.classes_ = numpy.array([0, 1])
    self.proj = GaussianRandomProjection(n_components=self.n_components)
    # self.knner = KNeighborsClassifier(n_neighbors=self.knn)
    self.knner = Knn1dClassifier(self.knn)
    self.proj.fit(X)
    X_new = self.proj.transform(X)
    # TODO sample weight!!
    self.knner.fit(X_new, y, sample_weight=sample_weight)
    print('ok')  # NOTE(review): leftover debug output
    return self
def test_fixed_state_transformer():
    """FixedStateTransformer must (1) behave exactly like setting
    random_state on the wrapped estimator, (2) override the wrapped
    estimator's own random_state, and (3) pass through transformers that
    have no random_state at all."""
    random_state = check_random_state(0)
    X = random_state.rand(500, 100)
    # Check that setting the random_seed is equivalent to set the
    # random_state
    transf = GaussianRandomProjection(n_components=5, random_state=0)
    fixed_transf = FixedStateTransformer(
        GaussianRandomProjection(n_components=5), random_seed=0)
    assert_array_almost_equal(fixed_transf.fit_transform(X),
                              transf.fit_transform(X))
    # Check that set_params doesn't modify the results
    fixed_transf = FixedStateTransformer(
        GaussianRandomProjection(n_components=5, random_state=None))
    fixed_transf2 = FixedStateTransformer(
        GaussianRandomProjection(random_state=1, n_components=5))
    assert_array_almost_equal(fixed_transf.fit_transform(X),
                              fixed_transf2.fit_transform(X))
    # Check that it work when there is no random_state
    fixed_transf = FixedStateTransformer(IdentityProjection())
    assert_array_almost_equal(fixed_transf.fit_transform(X), X)
class ProjClassifier(BaseEstimator, ClassifierMixin):
    """Binary classifier: Gaussian random projection followed by a 1-D KNN
    classifier on the projected features."""

    def __init__(self, n_components=1, knn=100):
        self.n_components = n_components  # projection dimensionality
        self.knn = knn                    # neighbor count for Knn1dClassifier

    def fit(self, X, y, sample_weight=None):
        """Fit the projection, then the KNN on the projected data.
        Hard-codes binary classes [0, 1]. Returns self."""
        self.classes_ = numpy.array([0, 1])
        self.proj = GaussianRandomProjection(n_components=self.n_components)
        # self.knner = KNeighborsClassifier(n_neighbors=self.knn)
        self.knner = Knn1dClassifier(self.knn)
        self.proj.fit(X)
        X_new = self.proj.transform(X)
        # TODO sample weight!!
        self.knner.fit(X_new, y, sample_weight=sample_weight)
        print('ok')  # NOTE(review): leftover debug output
        return self

    def predict_proba(self, X):
        """Class probabilities from the KNN on the projected features."""
        X_new = self.proj.transform(X)
        return self.knner.predict_proba(X_new)

    def predict(self, X):
        """Most probable class index for each sample."""
        return numpy.argmax(self.predict_proba(X), axis=1)
# Dimensionality-reduction feature engineering: fit each reducer on the
# training features (target column "y" dropped) and apply it to the test
# set, all with a fixed random_state for reproducibility.
# NOTE(review): relies on n_comp, train and test defined earlier in the
# script; only PCA/ICA components are appended in the loop below — the
# tSVD/GRP/SRP results are computed but not appended here.
tsvd = TruncatedSVD(n_components=n_comp, random_state=420)
tsvd_results_train = tsvd.fit_transform(train.drop(["y"], axis=1))
tsvd_results_test = tsvd.transform(test)

# PCA
pca = PCA(n_components=n_comp, random_state=420)
pca2_results_train = pca.fit_transform(train.drop(["y"], axis=1))
pca2_results_test = pca.transform(test)

# ICA
ica = FastICA(n_components=n_comp, random_state=420)
ica2_results_train = ica.fit_transform(train.drop(["y"], axis=1))
ica2_results_test = ica.transform(test)

# GRP
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
grp_results_train = grp.fit_transform(train.drop(["y"], axis=1))
grp_results_test = grp.transform(test)

# SRP
srp = SparseRandomProjection(n_components=n_comp, dense_output=True,
                             random_state=420)
srp_results_train = srp.fit_transform(train.drop(["y"], axis=1))
srp_results_test = srp.transform(test)

# Append decomposition components to datasets (component names are 1-based)
for i in range(1, n_comp+1):
    train['pca_' + str(i)] = pca2_results_train[:,i-1]
    test['pca_' + str(i)] = pca2_results_test[:, i-1]
    train['ica_' + str(i)] = ica2_results_train[:,i-1]
    test['ica_' + str(i)] = ica2_results_test[:, i-1]
def gen_feature(train, test):
    """Append tSVD/PCA/ICA/GRP/SRP components (15 each) as new columns to
    both the train and test DataFrames.

    Returns (train, test) with added 'pca_i', 'ica_i', 'tsvd_i', 'grp_i'
    and 'srp_i' columns, i = 1..15.

    NOTE(review): relies on a module-level ``start`` timestamp for progress
    logging; ``usable_columns`` is computed but unused in this excerpt.
    """
    train = pd.DataFrame(train)
    test = pd.DataFrame(test)
    n_comp = 15
    drop_list = []
    test_drop_list = []
    print(train.drop(drop_list, axis=1).shape,
          test.drop(test_drop_list, axis=1).shape)

    print('tSVD', datetime.now() - start)
    # tSVD
    tsvd = TruncatedSVD(n_components=n_comp)
    tsvd_results_train = tsvd.fit_transform(train.drop(drop_list, axis=1))
    tsvd_results_test = tsvd.transform(test.drop(test_drop_list, axis=1))

    print('PCA', datetime.now() - start)
    # PCA
    pca = PCA(n_components=n_comp)
    pca2_results_train = pca.fit_transform(train.drop(drop_list, axis=1))
    pca2_results_test = pca.transform(test.drop(test_drop_list, axis=1))

    print('ICA', datetime.now() - start)
    # ICA
    ica = FastICA(n_components=n_comp, max_iter=10000)
    ica2_results_train = ica.fit_transform(train.drop(drop_list, axis=1))
    ica2_results_test = ica.transform(test.drop(test_drop_list, axis=1))

    print('GRP', datetime.now() - start)
    # GRP
    grp = GaussianRandomProjection(n_components=n_comp, eps=0.1)
    grp_results_train = grp.fit_transform(train.drop(drop_list, axis=1))
    grp_results_test = grp.transform(test.drop(test_drop_list, axis=1))

    print('SRP', datetime.now() - start)
    # SRP
    srp = SparseRandomProjection(n_components=n_comp, dense_output=True)
    srp_results_train = srp.fit_transform(train.drop(drop_list, axis=1))
    srp_results_test = srp.transform(test.drop(test_drop_list, axis=1))

    # MCA
    # res_mca = MCA(train, ncp=n_comp, graph = FALSE)

    # save columns list before adding the decomposition components
    usable_columns = list(set(train.columns) - set(drop_list))

    # Append decomposition components to datasets
    for i in range(1, n_comp + 1):
        train['pca_' + str(i)] = pca2_results_train[:, i - 1]
        test['pca_' + str(i)] = pca2_results_test[:, i - 1]
        train['ica_' + str(i)] = ica2_results_train[:, i - 1]
        test['ica_' + str(i)] = ica2_results_test[:, i - 1]
        train['tsvd_' + str(i)] = tsvd_results_train[:, i - 1]
        test['tsvd_' + str(i)] = tsvd_results_test[:, i - 1]
        train['grp_' + str(i)] = grp_results_train[:, i - 1]
        test['grp_' + str(i)] = grp_results_test[:, i - 1]
        train['srp_' + str(i)] = srp_results_train[:, i - 1]
        test['srp_' + str(i)] = srp_results_test[:, i - 1]
    return train, test
ap_region_data = { k:v for (k,v) in region_data.items() if ap_champs[k[2]]} ap_tier_data = { k:v for (k,v) in tier_data.items() if ap_champs[k[2]]} ap_cross_data = { k:v for (k,v) in cross_data.items() if ap_champs[k[3]]} ap_patch_data = { k:v for (k,v) in patch_data.items() if ap_champs[k[1]]} all_ap_data = ap_region_data.values() + (ap_tier_data.values()) + ap_cross_data.values() + ap_patch_data.values() #print(ap_champs) #pca = PCA(n_components=2) #reduction = pca.fit_transform(ap_champs.values()) #print(pca.explained_variance_ratio_) grp = GaussianRandomProjection(2, random_state = 0) grp.fit(all_ap_data) region_reduction = grp.transform(ap_region_data.values()) tier_reduction = grp.transform(ap_tier_data.values()) cross_reduction = grp.transform(ap_cross_data.values()) patch_reduction = grp.transform(ap_patch_data.values()) region_json_data = [] for i in range(0,len(ap_region_data.keys())): key = ap_region_data.keys()[i] data = list(region_reduction[i]) num_games = region_games[key] region_json_data.append( { "patch":key[0], "region":key[1], #"tier":key[2],
def random(X, K):
    """Gaussian random projection of X down to K dimensions, then normalized.

    Post-processes the reduced matrix with the module-level `normalizer`
    (which is refit on the projected data) and returns the result.
    """
    projector = GaussianRandomProjection(n_components=K)
    reduced = projector.fit_transform(X)
    return normalizer.fit_transform(reduced)
import numpy as np # Load 20 newsgroup dataset # Selec tonly sci.crypt category # Other categories include # sci.med sci.space soc.religion.christian cat = ['sci.crypt'] data = fetch_20newsgroups(categories=cat) # Creat a term document matrix with term frequencies as the values frmo the # above dataset vectorizer = TfidfVectorizer(use_idf=False) vector = vectorizer.fit_transform(data.data) # Perform the projection. In this case we reduce the dimension to 1000 gauss_proj = GaussianRandomProjection(n_components=1000) gauss_proj.fit(vector) # Transform the original data to the new space vector_t = gauss_proj.transform(vector) # Print transformed vector shape print vector.shape print vector_t.shape # To validate if the transformation has preserved the distance, we calculate the # old and the new distance between the points org_dist = euclidean_distances(vector) red_dist = euclidean_distances(vector_t) diff_dist = abs(org_dist - red_dist) # We take the difference between these points and plot them as a heatmap (only
# Let's first generate a set of samples n_samples = 2000 n_outputs = 500 X = 3 + 5 * random_state.normal(size=(n_samples, n_outputs)) # Let's compute the sum of the variance in the orignal output space var_origin = np.var(X, axis=0).sum() # Let's compute the variance on a random subspace all_n_components = np.array([1, 50, 100, 200, 400, 500]) n_repetitions = 10 distortion = np.empty((len(all_n_components), n_repetitions)) for i, n_components in enumerate(all_n_components): for j in range(n_repetitions): transformer = GaussianRandomProjection(n_components=n_components, random_state=random_state) X_subspace = transformer.fit_transform(X) distortion[i, j] = np.var(X_subspace, axis=0).sum() / var_origin # Let's plot the distortion as a function of the compression ratio distortion_mean = distortion.mean(axis=1) distortion_std = distortion.std(axis=1) plt.figure() plt.plot(all_n_components / n_outputs, distortion_mean, "o-", color="g") plt.plot(all_n_components / n_outputs, np.ones_like(distortion_mean), "--", color="r") plt.fill_between(all_n_components / n_outputs, distortion_mean - distortion_std, distortion_mean + distortion_std, alpha=0.25, color="g") plt.xlabel("n_components / n_outputs")
def select_features_GaussianRandomProjections(train_X, train_y, test_X, k):
    """Project train and test features onto k Gaussian random components.

    `train_y` is accepted for signature parity with sibling selectors but is
    not used by the unsupervised projection. The projection is fit on
    `train_X` only (fixed random_state=42 for reproducibility) and applied
    to both splits; returns (train_X, test_X) in the projected space.
    """
    projection = GaussianRandomProjection(n_components=k, random_state=42)
    projection.fit(train_X)
    return projection.transform(train_X), projection.transform(test_X)
def Random_Projection(M, new_dim, prng):
    """Project M onto `new_dim` dimensions with a Gaussian random projection.

    Parameters:
        M: array-like of shape (n_samples, n_features).
        new_dim: target dimensionality.
        prng: seed / RandomState forwarded to the projection.

    Bug fix: `prng` was previously ignored (random_state=None), so repeated
    calls produced different projections; it is now forwarded as
    random_state, making the projection reproducible for a given prng.
    """
    proj = GaussianRandomProjection(n_components=new_dim, eps=0.1,
                                    random_state=prng)
    return proj.fit_transform(M)
def gen_features(train, val, test):
    """Append decomposition features (tSVD/PCA/ICA/GRP/SRP) to all three frames.

    Each reducer is fit on `train` only, then applied to `val` and `test`;
    the first `n_comp` components of each are appended as new columns
    ('pca_1'..'pca_15', 'ica_*', 'tsvd_*', 'grp_*', 'srp_*').

    Returns the augmented (train, val, test) DataFrames.

    NOTE(review): relies on the module-level `start` datetime for progress
    logging — confirm it is initialised before calling.
    """
    train = pd.DataFrame(train)
    val = pd.DataFrame(val)
    test = pd.DataFrame(test)

    # (A large commented-out categorical fill / label-encode preprocessing
    # pass previously lived here; removed as dead code.)

    n_comp = 15
    drop_list = []
    test_drop_list = []

    # Hoist the dropped views — every reducer below consumes the same frames
    # (the original recomputed .drop() before each fit/transform). `val` uses
    # test_drop_list, matching the original behavior.
    train_X = train.drop(drop_list, axis=1)
    val_X = val.drop(test_drop_list, axis=1)
    test_X = test.drop(test_drop_list, axis=1)
    print(train_X.shape, test_X.shape)

    print('tSVD', datetime.now() - start)
    tsvd = TruncatedSVD(n_components=n_comp)
    tsvd_results_train = tsvd.fit_transform(train_X)
    tsvd_results_val = tsvd.transform(val_X)
    tsvd_results_test = tsvd.transform(test_X)

    print('PCA', datetime.now() - start)
    pca = PCA(n_components=n_comp)
    pca2_results_train = pca.fit_transform(train_X)
    pca2_results_val = pca.transform(val_X)
    pca2_results_test = pca.transform(test_X)

    print('ICA', datetime.now() - start)
    ica = FastICA(n_components=n_comp, max_iter=10000)
    ica2_results_train = ica.fit_transform(train_X)
    ica2_results_val = ica.transform(val_X)
    ica2_results_test = ica.transform(test_X)

    print('GRP', datetime.now() - start)
    grp = GaussianRandomProjection(n_components=n_comp, eps=0.1)
    grp_results_train = grp.fit_transform(train_X)
    grp_results_val = grp.transform(val_X)
    grp_results_test = grp.transform(test_X)

    print('SRP', datetime.now() - start)
    srp = SparseRandomProjection(n_components=n_comp, dense_output=True)
    srp_results_train = srp.fit_transform(train_X)
    srp_results_val = srp.transform(val_X)
    srp_results_test = srp.transform(test_X)

    # MCA (R helper) intentionally left unported:
    # res_mca = MCA(train, ncp=n_comp, graph = FALSE)

    # Append the decomposition components with 1-based column suffixes.
    # (Removed `usable_columns`: it was computed but never used.)
    for i in range(1, n_comp + 1):
        train['pca_' + str(i)] = pca2_results_train[:, i - 1]
        val['pca_' + str(i)] = pca2_results_val[:, i - 1]
        test['pca_' + str(i)] = pca2_results_test[:, i - 1]
        train['ica_' + str(i)] = ica2_results_train[:, i - 1]
        val['ica_' + str(i)] = ica2_results_val[:, i - 1]
        test['ica_' + str(i)] = ica2_results_test[:, i - 1]
        train['tsvd_' + str(i)] = tsvd_results_train[:, i - 1]
        val['tsvd_' + str(i)] = tsvd_results_val[:, i - 1]
        test['tsvd_' + str(i)] = tsvd_results_test[:, i - 1]
        train['grp_' + str(i)] = grp_results_train[:, i - 1]
        val['grp_' + str(i)] = grp_results_val[:, i - 1]
        test['grp_' + str(i)] = grp_results_test[:, i - 1]
        train['srp_' + str(i)] = srp_results_train[:, i - 1]
        val['srp_' + str(i)] = srp_results_val[:, i - 1]
        test['srp_' + str(i)] = srp_results_test[:, i - 1]
    return train, val, test
# Perform Truncated Singular Value Decomposition (SVD) from sklearn.decomposition import TruncatedSVD as TruncSVD tsvd = TruncSVD(n_components=num_components, algorithm='randomized', random_state=0) tsvd_transformed_data_train = tsvd.fit_transform(sparse_trainData) tsvd_transformed_data_valid = tsvd.transform(sparse_validData) # Perform Randomized Principal Components Analysis (PCA) from sklearn.decomposition import RandomizedPCA as RPCA rpca = RPCA(n_components=num_components) rpca_transformed_data_train = rpca.fit_transform(dense_trainData) rpca_transformed_data_valid = rpca.transform(dense_validData) # Perform Gaussian Random Projection from sklearn.random_projection import GaussianRandomProjection as GaussRan grp = GaussRan(n_components=num_components) grp_transformed_data_train = grp.fit_transform(dense_trainData) grp_transformed_data_valid = grp.transform(dense_validData) # Perform Sparse Random Projection from sklearn.random_projection import SparseRandomProjection as SparseRan srp = SparseRan(n_components=num_components, random_state=0) srp_transformed_data_train = srp.fit_transform(dense_trainData) srp_transformed_data_valid = srp.transform(dense_validData) # Perform classification using 1-Nearest Neighbor Classifier from sklearn.neighbors import KNeighborsClassifier # Create a subset grid to plot performance against numbers of components tsvd_max = tsvd_transformed_data_train.shape[1] plot_subset = []
# Map each row's binary-feature vector to a decimal value, then scale into
# [0, 1] by the maximum.
# NOTE(review): `map_bin_dec`, `bin_feats`, and the feature frames are
# defined earlier in the file, outside this chunk.
int10 = np.array(list(map(map_bin_dec, bin_feats)))
int10 = int10 / max(int10)
# Row-wise sum of the binary features plus the decimal encoding above.
df_non_obj_feats['binSum'] = df_non_obj_feats.apply(sum, 1)
df_non_obj_feats['binDec'] = int10
all_data_proc = pd.concat((df_obj_feats_freq, df_non_obj_feats), axis=1)
#%%
from sklearn.decomposition import PCA, FastICA
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection
n_comp = 12
# GRP
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
grp_results = grp.fit_transform(all_data_proc)
# SRP
srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420)
srp_results = srp.fit_transform(all_data_proc)
# PCA
pca = PCA(n_components=n_comp, random_state=420)
pca_results = pca.fit_transform(all_data_proc)
# ICA
ica = FastICA(n_components=n_comp, random_state=420)
ica_results = ica.fit_transform(all_data_proc)
# Append component columns with 1-based suffixes.
# NOTE(review): loop truncated in this chunk — presumably the ica/grp/srp
# assignments follow beyond the visible source.
for i in range(1, n_comp+1):
    all_data_proc['pca_' + str(i)] = pca_results[:,i-1]
def gaussianRP(data, n_components=None):
    """Fit-and-apply a Gaussian random projection to `data`.

    Parameters:
        data: array-like of shape (n_samples, n_features).
        n_components: target dimensionality. Defaults to the module-level
            `new_dimension` global, preserving the original behavior for
            existing callers (which passed no second argument).

    Returns the projected data.

    Fix: the original silently depended on the `new_dimension` global; the
    target dimension is now an explicit, backward-compatible parameter.
    """
    if n_components is None:
        n_components = new_dimension  # legacy global fallback
    rp = GaussianRandomProjection(n_components=n_components)
    return rp.fit_transform(data)
# NOTE(review): `GMM` was deprecated/replaced by GaussianMixture in
# scikit-learn 0.18+ — confirm the pinned sklearn version.
from sklearn.mixture import GMM
from load_mydata import LoadData
import math
# Load and standardize the mushroom dataset.
mushroom = LoadData("mushroom")
data = scale(mushroom.data)
labels = np.array(mushroom.labels)
n_samples, n_features = data.shape
n_digits = len(np.unique(labels))
n_iter = 1000
print("n_digits: %d, \t n_samples %d, \t n_features %d" % (n_digits, n_samples, n_features))
# Time the random projection down to 20 components.
t0 = time()
rp = GaussianRandomProjection(n_components=20)
reduced_data = rp.fit_transform(data)
print("time spent: %0.3fs" % (time()-t0))
#reduced_data = data
# Plot the data
fig=plt.figure()
#plt.clf()
n_plots=9
h = 0.02
# Plot bounds come from the first two projected dimensions only.
x_min, x_max = reduced_data[:, 0].min(), reduced_data[:, 0].max()
y_min, y_max = reduced_data[:, 1].min(), reduced_data[:, 1].max()
# NOTE(review): loop truncated — it continues past this chunk. Under
# Python 3, `hori = n_plots/vert` is a float, which add_subplot rejects;
# this looks written for Python 2 integer division — confirm.
for index in range(1,n_plots+1):
    vert=math.floor(math.sqrt(n_plots))
    hori=n_plots/vert
    fig.add_subplot(vert,hori,index)
class RCAReducer():
    """Gaussian random-projection ("RCA") reducer with reporting helpers.

    Min-max scales the dataset's features, projects them onto
    `num_components` random Gaussian directions, and writes reduction
    summaries to text files under out/.
    """

    def __init__(self, dataset, dataset_name, num_components=10):
        self.dataset = dataset
        self.dataset_name = dataset_name
        self.labels = dataset.target
        self.scaler = MinMaxScaler()
        self.data = self.scaler.fit_transform(dataset.data)
        self.n_samples, self.n_features = self.data.shape
        self.reducer = GaussianRandomProjection(n_components=num_components)

    def reduce(self):
        """Project self.data and min-max rescale the projected coordinates.

        NOTE: this refits self.scaler on the reduced space, so after this
        call the scaler no longer corresponds to the original features.
        """
        self.reducer.fit(self.data)
        self.reduced = self.scaler.fit_transform(self.reducer.transform(self.data))
        return self.reduced

    def benchmark(self, estimator, name, data):
        """Fit a clustering estimator on `data` and print standard metrics."""
        t0 = time()
        sample_size = 300  # subsample size for the silhouette score
        labels = self.labels
        estimator.fit(data)
        print('% 9s %.2fs %i %.3f %.3f %.3f %.3f %.3f %.3f'
              % (name, (time() - t0), estimator.inertia_,
                 metrics.homogeneity_score(labels, estimator.labels_),
                 metrics.completeness_score(labels, estimator.labels_),
                 metrics.v_measure_score(labels, estimator.labels_),
                 metrics.adjusted_rand_score(labels, estimator.labels_),
                 metrics.adjusted_mutual_info_score(labels, estimator.labels_),
                 metrics.silhouette_score(data, estimator.labels_,
                                          metric='euclidean',
                                          sample_size=sample_size)))

    def display_reduced_digits(self):
        """Write a reduction summary for the digits dataset to out/.

        NOTE: redirects sys.stdout to the output file for the remainder of
        the process (original behavior, preserved).
        """
        sys.stdout = open('out/RCAReduceDigitsOutput.txt', 'w')
        print("RCA Reduction of %s:\n" % self.dataset_name)
        print(40 * '-')
        print("Length of 1 input vector before reduction: %d \n" % len(self.data.tolist()[0]))
        print("Length of 1 input vector after reduction: %d \n" % len(self.reduced.tolist()[0]))
        print("\nProjection axes:\n")
        for i, axis in enumerate(self.reducer.components_.tolist()):
            print("Axis %d:\n" % i, axis)
        self.compute_plane_variance()

    def compute_plane_variance(self):
        """Print the variance of the reduced data along each projected axis."""
        points_along_dimension = self.reduced.T
        for i, points in enumerate(points_along_dimension):
            print("\nVariance of dimension %d:" % i)
            print(np.var(points), "\n")

    def display_reduced_iris(self):
        """Write a reduction summary for the iris dataset to out/.

        NOTE: redirects sys.stdout to the output file for the remainder of
        the process (original behavior, preserved).
        """
        sys.stdout = open('out/RCAReduceIrisOutput.txt', 'w')
        print("RCA Reduction of %s:\n" % self.dataset_name)
        print(40 * '-')
        print("Length of 1 input vector before reduction: %d \n" % len(self.data.tolist()[0]))
        print("Length of 1 input vector after reduction: %d \n" % len(self.reduced.tolist()[0]))
        print("\nProjection axes:\n")
        for i, axis in enumerate(self.reducer.components_.tolist()):
            print("Axis %d:\n" % i, axis)
        self.compute_plane_variance()

    def reduce_crossvalidation_set(self, X_train, X_test):
        """Fit the projection on X_train and project both splits.

        Bug fix: the original returned self.scaler.transform(...) — i.e.
        rescaled inputs, not projections — so the reducer fitted on X_train
        was never applied. Both splits are now transformed by the reducer.
        """
        self.reducer.fit(X_train)
        reduced_X_train = self.reducer.transform(X_train)
        reduced_X_test = self.reducer.transform(X_test)
        return reduced_X_train, reduced_X_test
# NOTE(review): fragment — `key`, `build`, `region`, `tier`, and the data
# dicts come from an enclosing loop that is outside this chunk.
# Accumulate per-(patch, region, tier) counts of each champion's first,
# second, and third item choices, weighted by games played (build['value']).
champion_items[key['champ']][(key["patch"], region, tier)]["first"][key['first']] += build['value']
champion_items[key['champ']][(key["patch"], region, tier)]["second"][key['second']] += build['value']
champion_items[key['champ']][(key["patch"], region, tier)]["third"][key['third']] += build['value']
#update champion games played
champion_games[key['champ']][(key['patch'], region, tier)] += build['value']
items_json = []
for champ in champion_builds.keys():
    # perform GaussianRandomProjection
    # Pool every build for this champion so one projection is shared across
    # all of its (patch, region, tier) keys.
    all_builds = []
    for key in champion_builds[champ]:
        all_builds += champion_builds[champ][key]
    grp = GaussianRandomProjection(2, random_state = 0)
    grp.fit(all_builds)
    for key in champion_builds[champ]:
        builds = champion_builds[champ][key]
        reduction = grp.transform(builds)
        # get top 100 builds
        # Rank builds by games played (descending) and keep the top 100.
        zipped = zip(list(reduction), build_games[champ][key], build_objects[champ][key])
        sorted_zipped = sorted(zipped, key=lambda x: x[1], reverse=True)
        top_builds = sorted_zipped[0:100]
        builds_json = []
        # NOTE(review): loop truncated — it continues beyond this chunk;
        # x/y are the 2-D projected coordinates of each build.
        for i in top_builds:
            x = list(i[0])[0]
            y = list(i[0])[1]