def save_new_data(dataset, n_components, iteration):
    X, y = load_dataset(dataset)
    data = X
    rp = GaussianRandomProjection(n_components=n_components)
    rp.fit(data)
    matrix = rp.components_
    new_data = rp.transform(data)
    plot_data('rp', new_data, y, dataset.title() + ': RP',
              filename='-'.join(['rp', dataset, str(iteration), 'data', 'trans']))
    results = np.array(new_data)
    np.savetxt('data/' + '-'.join([dataset, str(n_components), str(iteration) + 'rp.csv']),
               results, delimiter=",")
    # Approximate reconstruction: map back through the (non-orthonormal)
    # projection matrix and measure the mean squared error against the input.
    new_data_inv = np.dot(new_data, matrix)
    loss = metrics.mean_squared_error(data, new_data_inv)
    print(loss)
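# The snippet above reconstructs through the transpose of components_, which is
# only an approximation because a Gaussian random projection matrix is not
# orthonormal. A minimal sketch of the more faithful pseudo-inverse
# reconstruction, assuming only NumPy and scikit-learn (the function name is
# illustrative, not from the original code):
import numpy as np
from sklearn.random_projection import GaussianRandomProjection

def rp_reconstruction_mse(X, n_components, random_state=None):
    """Project X down and back up via the Moore-Penrose pseudo-inverse,
    returning the mean squared reconstruction error."""
    rp = GaussianRandomProjection(n_components=n_components, random_state=random_state)
    X_low = rp.fit_transform(X)  # equals X @ components_.T
    # components_ has shape (n_components, n_features); pinv gives the
    # least-squares inverse of the projection.
    X_back = X_low @ np.linalg.pinv(rp.components_.T)
    return np.mean((X - X_back) ** 2)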
def rp(X, y, n_components='auto', eps=0.1, random_state=None, plot=1, dataset='german'):
    rp_model = GaussianRandomProjection(n_components=n_components, eps=eps,
                                        random_state=random_state)
    rp_model.fit(X)
    X_new = rp_model.transform(X)
    if plot and dataset in ('german', 'australian'):
        # Both datasets get the same two-component scatter plot; only the
        # title and output filename differ.
        plt.scatter(X_new[y == 1, 0], X_new[y == 1, 1], c='red',
                    label='Samples with label 1')
        plt.scatter(X_new[y == 0, 0], X_new[y == 0, 1], c='green',
                    label='Samples with label 0')
        plt.title(dataset.title() + " dataset after Randomized Projection")
        plt.legend()
        plt.xlabel("Component 1")
        plt.ylabel("Component 2")
        plt.savefig(dataset + "-after-Random-Projection.png")
        plt.close()
    return X_new
def comp1(K):
    ks = []
    accuracy_train = []
    accuracy_test = []
    for i in range(1, K):
        print(i)
        # Project down to i components, then train a small MLP on the reduced
        # data. (The original fixed n_components at 10, which ignored the loop
        # variable even though accuracy is plotted against k.)
        rp = GaussianRandomProjection(n_components=i, eps=0.6)
        rp.fit(X)
        X_reduced = rp.transform(X)
        X_train, X_test, y_train, y_test = train_test_split(X_reduced, y,
                                                            test_size=0.20)
        clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                            hidden_layer_sizes=[8, 8, 8, 8, 8], random_state=1)
        # Fit on the training split only; the original also refit on the test
        # split, which leaks test labels into the model, and swapped the
        # train/test scores.
        clf.fit(X_train, y_train)
        ks.append(i)
        accuracy_train.append(clf.score(X_train, y_train))
        accuracy_test.append(clf.score(X_test, y_test))
    ks = np.array(ks)
    accuracy_train = np.array(accuracy_train)
    accuracy_test = np.array(accuracy_test)
    plt.plot(ks, accuracy_train, color='r', marker='o', label='train_accuracy')
    plt.plot(ks, accuracy_test, color='g', marker='o', label='test_accuracy')
    plt.xlabel('k')
    plt.ylabel('accuracy')
    plt.legend()
    plt.show()
    return None
def PerformRandomProjections(X, Y, num_components, random_state):
    """For each value in num_components, run the random projection
    `random_state` times and keep the projection that gives the minimum
    reconstruction error."""
    result = {}
    for n in num_components:
        prefix = "rp_" + str(n) + "_"
        best_grp = None
        best_reconstruction_error = np.inf
        reconstruction_errors = []
        for i in np.arange(random_state) + 1:
            grp = GaussianRandomProjection(n, random_state=i)
            grp.fit(X)
            _x = grp.transform(X)
            # Reconstruct through the pseudo-inverse of the projection matrix.
            p_inv = np.linalg.pinv(grp.components_)
            X_recons = np.dot(p_inv, _x.T).T
            recons_err = ComputeReconstructionSSE(X, X_recons)
            reconstruction_errors.append(recons_err)
            if best_grp is None or best_reconstruction_error > recons_err:
                best_grp = grp
                best_reconstruction_error = recons_err
        result[prefix + "data"] = best_grp.transform(X)
        result[prefix + "reconstruction_errors_all"] = reconstruction_errors
        result[prefix + "reconstruction_error"] = best_reconstruction_error
    return result
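# ComputeReconstructionSSE is not defined in this snippet. A minimal
# implementation consistent with how it is called above (original matrix in,
# reconstructed matrix in, scalar error out) might be:
import numpy as np

def ComputeReconstructionSSE(X, X_recons):
    # Sum of squared differences between the original and the reconstructed
    # data, i.e. the squared Frobenius norm of the residual.
    return np.sum((np.asarray(X) - np.asarray(X_recons)) ** 2)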
def DPPro(pureTrainingData, pureTestingData, k, epsilon, randomProjector=None):
    # An earlier version built the projection matrix by hand from N(0, 1/k)
    # samples; sklearn's GaussianRandomProjection draws the same distribution.
    if randomProjector is None:
        print('Initialize random projector')
        randomProjector = GaussianRandomProjection(n_components=k)
        randomProjector.fit(pureTrainingData)
    projTrainingData = randomProjector.transform(pureTrainingData)
    projTestingData = randomProjector.transform(pureTestingData)
    # Column norms of components_ give one norm per input feature
    # (shape n_features); the largest is the L2 sensitivity of the projection.
    projMatrix_norms = np.linalg.norm(randomProjector.components_, axis=0)
    l2Sensitivity = np.amax(projMatrix_norms)
    delta = np.divide(1.0, pureTrainingData.shape[0])
    noiseLength = pureTrainingData.shape[0] * k
    oneDimNoise = DiffPrivImpl.OneDimGaussian(epsilon, delta, noiseLength,
                                              l2Sensitivity=l2Sensitivity)
    noiseMatrix = np.reshape(oneDimNoise, (pureTrainingData.shape[0], -1))
    noisyProjTrainingData = projTrainingData + noiseMatrix
    return noisyProjTrainingData, projTestingData
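# DiffPrivImpl.OneDimGaussian is an external helper not shown here. Assuming
# it implements the standard (epsilon, delta) Gaussian mechanism, a minimal
# sketch would draw noise with the classic analytic scale
# sigma = sqrt(2 * ln(1.25 / delta)) * sensitivity / epsilon (valid for
# epsilon < 1); the function below is illustrative, not the library's code:
import numpy as np

def one_dim_gaussian(epsilon, delta, length, l2_sensitivity=1.0, rng=None):
    # Gaussian mechanism noise scale from Dwork & Roth's analytic bound.
    rng = rng or np.random.default_rng()
    sigma = np.sqrt(2.0 * np.log(1.25 / delta)) * l2_sensitivity / epsilon
    return rng.normal(0.0, sigma, size=length)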
def randomfaces(X_train, X_test, n_components=120):
    t0 = time()
    randomface = GaussianRandomProjection(n_components=n_components)  # Gaussian projection
    randomface.fit(X_train)
    X_train_random = randomface.transform(X_train)
    X_test_random = randomface.transform(X_test)
    print("Random projection done in %0.3fs" % (time() - t0))
    return X_train_random, X_test_random
def randproj(tx, ty, rx, ry):
    # tx[1].size is the number of features in one sample.
    compressor = RandomProjection(tx[1].size)
    compressor.fit(tx, y=ty)
    newtx = compressor.transform(tx)
    newrx = compressor.transform(rx)
    em(newtx, ty, newrx, ry, add="wRPtr", times=10)
    km(newtx, ty, newrx, ry, add="wRPtr", times=10)
    nn(newtx, ty, newrx, ry, add="wRPtr")
def append_cluster_labels(train, test, bestK):
    """Reduce train & test to the best K components and rescale."""
    clu = DimRedux(bestK, random_state=42)
    clu.fit(train.X)
    train = helpers.Data(clu.transform(train.X), train.y)
    test = helpers.Data(clu.transform(test.X), test.y)
    return helpers.scale_test_train(train, test)
def dump_data(data_path, task, reduce_sizes, trials=10):
    X, y, _, _ = load_data(data_path, is_shuffle=True, is_split=False)

    pca_components = reduce_sizes[0]
    pca = PCA(n_components=pca_components, random_state=10)
    X_PCA = pca.fit_transform(X)
    X_reconstructed = pca.inverse_transform(X_PCA)
    print("Reconstruction Error for PCA: %.6f" % np.mean((X - X_reconstructed) ** 2))
    data = np.hstack((X_PCA, np.array([y]).T))
    PCA_path = create_path('data', task, filename='PCA.csv')
    np.savetxt(PCA_path, data, delimiter=",")

    ica_components = reduce_sizes[1]
    ica = FastICA(n_components=ica_components, random_state=10)
    X_ICA = ica.fit_transform(X)
    X_reconstructed = ica.inverse_transform(X_ICA)
    print("Reconstruction Error for ICA: %.6f" % np.mean((X - X_reconstructed) ** 2))
    data = np.hstack((X_ICA, np.array([y]).T))
    ICA_path = create_path('data', task, filename='ICA.csv')
    np.savetxt(ICA_path, data, delimiter=",")

    # RP has no inverse_transform; repeat the projection `trials` times and
    # keep the run with the lowest reconstruction error.
    rp_components = reduce_sizes[2]
    re_list = []
    min_re_error = float("inf")
    X_RP = None
    for i in range(trials):
        rp = GaussianRandomProjection(n_components=rp_components)
        rp.fit(X)
        X_transformed = rp.transform(X)
        # Approximate reconstruction through the transpose of components_.
        X_reconstructed = np.dot(X_transformed, rp.components_)
        error = np.mean((X - X_reconstructed) ** 2)
        if error < min_re_error:
            min_re_error = error
            X_RP = X_transformed
        re_list.append(error)
    print(np.mean(re_list))
    print(np.std(re_list))
    print("Reconstruction Error for RP: %.6f" % min_re_error)
    data = np.hstack((X_RP, np.array([y]).T))
    RP_path = create_path('data', task, filename='RP.csv')
    np.savetxt(RP_path, data, delimiter=",")

    mi_components = reduce_sizes[3]
    X_MI = SelectKBest(mutual_info_classif, k=mi_components).fit_transform(X, y)
    data = np.hstack((X_MI, np.array([y]).T))
    MI_path = create_path('data', task, filename='MI.csv')
    np.savetxt(MI_path, data, delimiter=",")
def get_encoder(metas, train_data, target_output_dim):
    tmpdir = metas['workspace']
    model_path = os.path.join(tmpdir, 'random_gaussian.model')
    model = GaussianRandomProjection(n_components=target_output_dim, random_state=42)
    model.fit(train_data)
    with open(model_path, 'wb') as f:
        pickle.dump(model, f)
    return RandomGaussianEncoder(model_path=model_path)
class RandomProjectionSLFN(SLFN):
    """Single-hidden-layer feedforward network whose input weights come
    from a fitted Gaussian random projection."""

    def __init__(self, X, n_neurons, ufunc=np.tanh, random_state=None):
        self.n_neurons = n_neurons
        self.ufunc = ufunc
        self.projection = GaussianRandomProjection(n_components=n_neurons,
                                                   random_state=random_state)
        self.projection.fit(X)

    def transform(self, X):
        # Random projection followed by the nonlinearity acts as the
        # hidden-layer activation.
        return self.ufunc(self.projection.transform(X))
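# A minimal usage sketch for the class above, assuming SLFN is the base class
# from the surrounding ELM-style codebase; the data shapes are hypothetical:
import numpy as np

X = np.random.rand(100, 20)                # 100 samples, 20 features
slfn = RandomProjectionSLFN(X, n_neurons=10, random_state=0)
H = slfn.transform(X)                      # hidden activations, shape (100, 10)
print(H.shape)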
def randproj(tx, ty, rx, ry):
    print("randproj")
    compressor = RandomProjection(tx[1].size)
    compressor.fit(tx, y=ty)
    newtx = compressor.transform(tx)
    newrx = compressor.transform(rx)
    em(newtx, ty, newrx, ry, add="wRPtr")
    km(newtx, ty, newrx, ry, add="wRPtr")
    nn(newtx, ty, newrx, ry, add="wRPtr")
    print("randproj done")
def rp(self):
    # Tag the experiment identifier used in the output file.
    self.prefix = self.prefix + 'RandP_'
    # Fit the Gaussian random projection on the training set; with
    # n_components='auto', eps controls the target distortion.
    rf = GaussianRandomProjection(eps=0.5)
    rf.fit(self.X_train)
    # Apply GRP to both the training and the test set.
    self.X_train = rf.transform(self.X_train)
    self.X_test = rf.transform(self.X_test)
    self.printDataShapes()
def reduce_embedding_dimensions_GRP(vocab_embeddings_full, dimension, output_path):
    GRP = GaussianRandomProjection(n_components=dimension, eps=0.5, random_state=2019)
    # The projection matrix is data-independent, so fitting on the first 10k
    # rows is enough to size it.
    GRP.fit(vocab_embeddings_full[:10000, :])
    vocab_embeddings_reduced = GRP.transform(vocab_embeddings_full)
    np.save(os.path.join(output_path, 'vocab_embeddings'), vocab_embeddings_reduced)
    return vocab_embeddings_reduced
class GaussianRandomProjectionImpl:
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)
def rand_guas(self, n_comp, data=None):
    if data is None:
        data = self.train
    else:
        data = pd.DataFrame(data)
    rand_guas = GaussianRandomProjection(n_components=n_comp)
    rand_guas.fit(data)
    self.rand_guas_train_data = rand_guas.transform(data)
    self.RAND_GUAS = rand_guas
    # Reuse the projector fitted on the training data for the test set;
    # fitting a second projector (as the original did) draws a different
    # random matrix and puts the two sets in incompatible spaces.
    self.rand_guas_test_data = rand_guas.transform(self.test)
def __call__(self, x, y, train_idx):
    from sklearn.preprocessing import StandardScaler
    from sklearn.random_projection import GaussianRandomProjection

    method = GaussianRandomProjection(n_components=self.n_components, random_state=42)
    method.fit(x[train_idx])
    x_t = method.transform(x)
    # Projected features need rescaling before downstream use.
    scaler = StandardScaler()
    scaler.fit(x_t[train_idx])
    x_t = scaler.transform(x_t)
    return x_t
def get_vectors(words, dims):
    word_vectors = [get_word_vector(word) for word in words]
    # Project every word vector down to `dims` components in one pass; the
    # original transformed one vector at a time and kept an unused copy of
    # the projection matrix.
    g = GaussianRandomProjection(n_components=dims)
    projected = g.fit_transform(np.array(word_vectors))
    return {word: vec.tolist() for word, vec in zip(words, projected)}
class Coder(object):
    def __init__(self, n_sketches, sketch_dim):
        self.n_sketches = n_sketches
        self.sketch_dim = sketch_dim
        self.ss = StandardScaler()
        # 16 projected dimensions per sketch; after binarization and
        # bit-packing, each pair of bytes is reinterpreted as one uint16 code.
        self.sp = GaussianRandomProjection(n_components=16 * n_sketches)

    def fit(self, v):
        self.ss = self.ss.fit(v)
        vv = self.ss.transform(v)
        self.sp = self.sp.fit(vv)
        vvv = self.sp.transform(vv)
        self.init_biases(vvv)

    def transform(self, v):
        v = self.ss.transform(v)
        v = self.sp.transform(v)
        v = self.discretize(v)
        v = np.packbits(v, axis=-1)
        v = np.frombuffer(np.ascontiguousarray(v), dtype=np.uint16).reshape(
            v.shape[0], -1) % self.sketch_dim
        return v

    def transform_to_absolute_codes(self, v, labels=None):
        codes = self.transform(v)
        # Offset each sketch's code into its own bucket range.
        pos_index = np.array([i * self.sketch_dim for i in range(self.n_sketches)],
                             dtype=np.int_)
        index = codes + pos_index
        return index
def reducer_rand_proj_gauss(data, params):
    if params is None:
        params = {'n_components': 5}
    X = data['X_train']
    reducer = GaussianRandomProjection(n_components=params['n_components'])
    reducer.fit(X)
    do = deepcopy(data)
    do['X_train'] = reducer.transform(data['X_train'])
    do['X_valid'] = reducer.transform(data['X_valid'])
    return do
def transform(data, alg):
    a = np.array(data)
    x = a[:, 0:-1]
    y = a[:, -1]
    if alg == 'pca':
        pca = PCA(n_components=6, whiten=True)
        x = pca.fit(x).transform(x)
        print(pca.components_)
        print(pca.explained_variance_ratio_)
    elif alg == 'ica':
        kur0 = sum(kurtosis(x))
        ica = FastICA(n_components=3, whiten=False, algorithm="parallel")
        ica = ica.fit(x)
        x = ica.transform(x)
        print("kurtosis: ", sum(kurtosis(x)) - kur0)
    elif alg == 'rp':
        rp = GaussianRandomProjection(n_components=1)
        rp = rp.fit(x)
        x = rp.transform(x)
        print(rp.components_)
    elif alg == 'vtresh':
        kb = VarianceThreshold(threshold=.04)
        x = kb.fit_transform(x)
        print(kb.variances_)
    data = np.column_stack((x, y))
    return data
def search_best_k(datasets, targets):
    """Search for the best K by mean classifier score."""
    plt.figure(figsize=(8, 4))
    subindex = 0
    for dataset, target in zip(datasets, targets):
        subindex += 1
        logging.info(f"Initializing RP search for {dataset}...")
        data = helpers.load_dataset_df(dataset)
        train, test = helpers.split_test_train(data, target)
        train, test = helpers.scale_test_train(train, test)
        slf = dict()
        for k in range(1, train.X.shape[1]):
            dim = DimRedux(k, random_state=42)
            dim.fit(train.X)
            # Hold out the last third of the training set for scoring.
            split_ = train.X.shape[0] // 3 * 2
            clf1, clf2 = SimpleClf1(), SimpleClf2()
            clf1.fit(dim.transform(train.X[:split_, ]), train.y[:split_, ])
            clf2.fit(dim.transform(train.X[:split_, ]), train.y[:split_, ])
            sco1 = clf1.score(dim.transform(train.X[split_:, ]), train.y[split_:, ])
            sco2 = clf2.score(dim.transform(train.X[split_:, ]), train.y[split_:, ])
            slf[k] = (sco1 + sco2) / 2
        plt.subplot(1, len(datasets), subindex)
        plt.plot(list(slf.keys()), list(slf.values()))
        plt.xlabel("Components")
        plt.ylabel("Classifier Train Score")
        plt.xticks(np.arange(1, train.X.shape[1] + 1, step=1))
        plt.title(f"{dataset}", fontsize=10)
    plt.tight_layout()
    outpath = os.path.join(helpers.BASEDIR, "img", "dim-rp-both.png")
    plt.savefig(outpath)
    return None
class DReduction:
    """Bundle of five decomposition methods applied side by side."""

    def __init__(self, nComp):
        self._N_COMP = nComp  # number of components per method
        self._pca = PCA(n_components=self._N_COMP, random_state=17)
        self._tsvd = TruncatedSVD(n_components=self._N_COMP, random_state=17)
        self._ica = FastICA(n_components=self._N_COMP, random_state=17)
        self._grp = GaussianRandomProjection(n_components=self._N_COMP, eps=0.1,
                                             random_state=17)
        self._srp = SparseRandomProjection(n_components=self._N_COMP,
                                           dense_output=True, random_state=17)

    def fit(self, X):
        self._pca.fit(X)
        self._tsvd.fit(X)
        self._ica.fit(X)
        self._grp.fit(X)
        self._srp.fit(X)

    def transform(self, X):
        res_pca = self._pca.transform(X)
        res_tsvd = self._tsvd.transform(X)
        res_ica = self._ica.transform(X)
        res_grp = self._grp.transform(X)
        res_srp = self._srp.transform(X)
        df = pd.DataFrame()
        for i in range(1, self._N_COMP + 1):
            df['pca_' + str(i)] = res_pca[:, i - 1]
            df['tsvd_' + str(i)] = res_tsvd[:, i - 1]
            df['ica_' + str(i)] = res_ica[:, i - 1]
            df['grp_' + str(i)] = res_grp[:, i - 1]
            df['srp_' + str(i)] = res_srp[:, i - 1]
        return df
def find_best_state_RCA(X, comp=2, n_state=20):
    reconstruction_similarity = []
    for i in range(n_state):
        rca = GaussianRandomProjection(n_components=comp, random_state=i)
        X_r = rca.fit(X).transform(X)
        X_inverse = np.matmul(X_r, rca.components_)
        # Mean cosine similarity between each sample and its own
        # reconstruction; the original only compared the first sample, and
        # stored the similarity under a variable misnamed as an error.
        similarity = np.mean(np.diag(cosine_similarity(X_inverse, X)))
        reconstruction_similarity.append(similarity)
    return reconstruction_similarity
class GaussianRandomProjectionImpl():
    def __init__(self, n_components='auto', eps=0.1, random_state=None):
        self._hyperparams = {
            'n_components': n_components,
            'eps': eps,
            'random_state': random_state
        }

    def fit(self, X, y=None):
        self._sklearn_model = SKLModel(**self._hyperparams)
        if y is not None:
            self._sklearn_model.fit(X, y)
        else:
            self._sklearn_model.fit(X)
        return self

    def transform(self, X):
        return self._sklearn_model.transform(X)
def random_projection(X, y, components, max_cluster, num_classes, run_nn=False):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                        train_size=0.7, shuffle=True)
    random_proj = GaussianRandomProjection(n_components=components)
    random_proj.fit(X_train)  # y is ignored by random projections
    print(random_proj.components_)
    X_train_new = random_proj.transform(X_train)
    X_test_new = random_proj.transform(X_test)
    # Reconstruct the test set through the pseudo-inverse of the projection.
    inverse_components = np.linalg.pinv(random_proj.components_)
    reconstructed_instances = utils.extmath.safe_sparse_dot(X_test_new,
                                                            inverse_components.T)
    loss = ((X_test - reconstructed_instances) ** 2).mean()
    print("Reconstruction Error " + str(loss))
    if run_nn:
        mlp_classifier(X_train_new, y_train, 0.3, plot=True,
                       X_test=X_test_new, y_test=y_test)
    X_new = np.concatenate((X_train_new, X_test_new), axis=0)
    y = np.concatenate((y_train, y_test), axis=0)
    kmeans(X_new, y, max_cluster, num_classes, run_nn=run_nn,
           plot_cluster=True, reduction_algo='Random Projection')
    expectation_max(X_new, y, max_cluster, num_classes, run_nn=run_nn,
                    plot_cluster=True, reduction_algo='Random Projection')
def randproj(tx, ty, rx, ry, dataset):
    # Integer division: sklearn requires an integer component count.
    compressor = RandomProjection(tx[1].size // 2)
    compressor.fit(tx, y=ty)

    # 2D scatter of a two-component random projection.
    rp2 = RandomProjection(2)
    rp2.fit(tx)
    result = pd.DataFrame(rp2.transform(tx), columns=['RP%i' % i for i in range(2)])
    my_color = pd.Series(ty).astype('category').cat.codes
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(result['RP0'], result['RP1'], c=my_color, cmap="Dark2_r", s=60)
    ax.set_xlabel("RP1")
    ax.set_ylabel("RP2")
    ax.set_title("RP on the " + dataset + " data set")
    plt.show()

    # Store the first three projected components in a data frame and plot in 3D.
    result = pd.DataFrame(compressor.transform(tx),
                          columns=['RP%i' % i for i in range(3)])
    my_color = pd.Series(ty).astype('category').cat.codes
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(result['RP0'], result['RP1'], result['RP2'], c=my_color,
               cmap="Dark2_r", s=60)
    # Draw the axis lines through the origin.
    xAxisLine = ((min(result['RP0']), max(result['RP0'])), (0, 0), (0, 0))
    ax.plot(xAxisLine[0], xAxisLine[1], xAxisLine[2], 'r')
    yAxisLine = ((0, 0), (min(result['RP1']), max(result['RP1'])), (0, 0))
    ax.plot(yAxisLine[0], yAxisLine[1], yAxisLine[2], 'r')
    zAxisLine = ((0, 0), (0, 0), (min(result['RP2']), max(result['RP2'])))
    ax.plot(zAxisLine[0], zAxisLine[1], zAxisLine[2], 'r')
    ax.set_xlabel("RP1")
    ax.set_ylabel("RP2")
    ax.set_zlabel("RP3")
    ax.set_title("RP on the " + dataset + " data set")
    plt.show()

    reduced_data = RandomProjection(2).fit_transform(tx)
    em(tx, ty, rx, ry, reduced_data, add="", times=4, dataset=dataset, alg="RP")
    newtx = compressor.transform(tx)
    newrx = compressor.transform(rx)
    em(newtx, ty, newrx, ry, [], add="", times=4, dataset=dataset, alg="RP")
    em(newtx, ty, newrx, ry, RandomProjection(n_components=2).fit_transform(tx),
       add="wRPtr", times=9, dataset=dataset, alg="RandProj")
    myNN(newtx, ty, newrx, ry, "RandProj")
def run_rand(data, target, targets, name):
    plt.subplots(figsize=(18, 10))
    for i in range(len(seeds)):
        transformer = GaussianRandomProjection(n_components=2, random_state=seeds[i])
        transformer.fit(data)
        randTrain = transformer.transform(data)
        plt.subplot(plots[i])
        # Use a separate loop variable; the original reused `i`, which
        # clobbered the outer subplot index.
        for label, target_name in zip(targets, targets):
            plt.scatter(randTrain[target == label, 0], randTrain[target == label, 1],
                        alpha=.8, lw=2, label=target_name)
        plt.legend(loc='best', shadow=False, scatterpoints=1)
        plt.title("Randomized Projection of " + name)
    plt.savefig(name + " random")
    plt.close()
def test_gaussian_random_projection_float64(self):
    rng = np.random.RandomState(42)
    pt = GaussianRandomProjection(n_components=4)
    X = rng.rand(10, 5).astype(np.float64)
    model = pt.fit(X)
    model_onnx = to_onnx(model, X[:1], dtype=np.float64,
                         target_opset=TARGET_OPSET)
    self.assertIsNotNone(model_onnx)
    dump_data_and_model(X, model, model_onnx,
                        basename="GaussianRandomProjection64")
def rp_analysis(self, X_train, X_test, y_train, y_test, data_set_name):
    scl = RobustScaler()
    X_train_scl = scl.fit_transform(X_train)

    ##
    ## Random Projection: repeat 1000 times and average the per-feature
    ## kurtosis of the projected data.
    ##
    ks = []
    for i in range(1000):
        rp = GaussianRandomProjection(n_components=X_train_scl.shape[1])
        rp.fit(X_train_scl)
        X_train_rp = rp.transform(X_train_scl)
        ks.append(kurtosis(X_train_rp))
    mean_k = np.mean(ks, 0)

    ##
    ## Plots
    ##
    ph = plot_helper()
    title = 'Kurtosis (Randomized Projection) for ' + data_set_name
    name = data_set_name.lower() + '_rp_kurt'
    filename = './' + self.out_dir + '/' + name + '.png'
    ph.plot_simple_bar(np.arange(1, len(mean_k) + 1, 1),
                       mean_k,
                       np.arange(1, len(mean_k) + 1, 1).astype('str'),
                       'Feature Index',
                       'Kurtosis',
                       title,
                       filename)

    ##
    ## Reconstruction Error
    ##
    all_mses, rng = self.reconstruction_error(X_train_scl, GaussianRandomProjection)
    title = 'Reconstruction Error (RP) for ' + data_set_name
    name = data_set_name.lower() + '_rp_rec_err'
    filename = './' + self.out_dir + '/' + name + '.png'
    ph.plot_series(rng,
                   [all_mses.mean(0)],
                   [all_mses.std(0)],
                   ['mse'],
                   ['red'],
                   ['o'],
                   title,
                   'Number of Features',
                   'Reconstruction Error',
                   filename)
class ProjClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, n_components=1, knn=100):
        self.n_components = n_components
        self.knn = knn

    def fit(self, X, y, sample_weight=None):
        self.classes_ = numpy.array([0, 1])
        self.proj = GaussianRandomProjection(n_components=self.n_components)
        self.knner = Knn1dClassifier(self.knn)
        self.proj.fit(X)
        X_new = self.proj.transform(X)
        self.knner.fit(X_new, y, sample_weight=sample_weight)
        return self

    def predict_proba(self, X):
        X_new = self.proj.transform(X)
        return self.knner.predict_proba(X_new)

    def predict(self, X):
        return numpy.argmax(self.predict_proba(X), axis=1)
        champion_items[key['champ']][(key["patch"], region, tier)]["second"][key['second']] += build['value']
        champion_items[key['champ']][(key["patch"], region, tier)]["third"][key['third']] += build['value']
        # Update champion games played.
        champion_games[key['champ']][(key['patch'], region, tier)] += build['value']

items_json = []
for champ in champion_builds.keys():
    # Perform a GaussianRandomProjection down to 2D across all of this
    # champion's builds, so every key shares the same projection.
    all_builds = []
    for key in champion_builds[champ]:
        all_builds += champion_builds[champ][key]
    grp = GaussianRandomProjection(2, random_state=0)
    grp.fit(all_builds)
    for key in champion_builds[champ]:
        builds = champion_builds[champ][key]
        reduction = grp.transform(builds)
        # Keep the top 100 builds by games played.
        zipped = zip(list(reduction), build_games[champ][key], build_objects[champ][key])
        sorted_zipped = sorted(zipped, key=lambda x: x[1], reverse=True)
        top_builds = sorted_zipped[0:100]
        builds_json = []
        for i in top_builds:
            x = list(i[0])[0]
            y = list(i[0])[1]
            builds_json.append({
ap_region_data = {k: v for (k, v) in region_data.items() if ap_champs[k[2]]}
ap_tier_data = {k: v for (k, v) in tier_data.items() if ap_champs[k[2]]}
ap_cross_data = {k: v for (k, v) in cross_data.items() if ap_champs[k[3]]}
ap_patch_data = {k: v for (k, v) in patch_data.items() if ap_champs[k[1]]}
# dict.values() views must be wrapped in list() to concatenate under Python 3.
all_ap_data = (list(ap_region_data.values()) + list(ap_tier_data.values())
               + list(ap_cross_data.values()) + list(ap_patch_data.values()))

grp = GaussianRandomProjection(2, random_state=0)
grp.fit(all_ap_data)
region_reduction = grp.transform(list(ap_region_data.values()))
tier_reduction = grp.transform(list(ap_tier_data.values()))
cross_reduction = grp.transform(list(ap_cross_data.values()))
patch_reduction = grp.transform(list(ap_patch_data.values()))

region_json_data = []
region_keys = list(ap_region_data.keys())
for i in range(len(region_keys)):
    key = region_keys[i]
    data = list(region_reduction[i])
    num_games = region_games[key]
    region_json_data.append(
        {
            "patch": key[0],
            "region": key[1],
            # "tier": key[2],
            "champion": key[2],
def select_features_GaussianRandomProjections(train_X, train_y, test_X, k):
    # Despite the name, this projects the features rather than selecting a
    # subset of them; train_y is unused because the projection is unsupervised.
    selector = GaussianRandomProjection(n_components=k, random_state=42)
    selector.fit(train_X)
    train_X = selector.transform(train_X)
    test_X = selector.transform(test_X)
    return train_X, test_X
class RCAReducer():
    def __init__(self, dataset, dataset_name, num_components=10):
        self.dataset = dataset
        self.dataset_name = dataset_name
        self.labels = dataset.target
        self.scaler = MinMaxScaler()
        self.data = self.scaler.fit_transform(dataset.data)
        self.n_samples, self.n_features = self.data.shape
        self.reducer = GaussianRandomProjection(n_components=num_components)

    def reduce(self):
        self.reducer.fit(self.data)
        self.reduced = self.scaler.fit_transform(self.reducer.transform(self.data))
        return self.reduced

    def benchmark(self, estimator, name, data):
        t0 = time()
        sample_size = 300
        labels = self.labels
        estimator.fit(data)
        print('% 9s %.2fs %i %.3f %.3f %.3f %.3f %.3f %.3f'
              % (name, (time() - t0), estimator.inertia_,
                 metrics.homogeneity_score(labels, estimator.labels_),
                 metrics.completeness_score(labels, estimator.labels_),
                 metrics.v_measure_score(labels, estimator.labels_),
                 metrics.adjusted_rand_score(labels, estimator.labels_),
                 metrics.adjusted_mutual_info_score(labels, estimator.labels_),
                 metrics.silhouette_score(data, estimator.labels_,
                                          metric='euclidean',
                                          sample_size=sample_size)))

    def display_reduced_digits(self):
        sys.stdout = open('out/RCAReduceDigitsOutput.txt', 'w')
        print("RCA Reduction of %s:\n" % self.dataset_name)
        print(40 * '-')
        print("Length of 1 input vector before reduction: %d \n" % len(self.data.tolist()[0]))
        print("Length of 1 input vector after reduction: %d \n" % len(self.reduced.tolist()[0]))
        print("\nProjection axes:\n")
        for i, axis in enumerate(self.reducer.components_.tolist()):
            print("Axis %d:\n" % i, axis)
        self.compute_plane_variance()

    def compute_plane_variance(self):
        points_along_dimension = self.reduced.T
        for i, points in enumerate(points_along_dimension):
            print("\nVariance of dimension %d:" % i)
            print(np.var(points), "\n")

    def display_reduced_iris(self):
        sys.stdout = open('out/RCAReduceIrisOutput.txt', 'w')
        print("RCA Reduction of %s:\n" % self.dataset_name)
        print(40 * '-')
        print("Length of 1 input vector before reduction: %d \n" % len(self.data.tolist()[0]))
        print("Length of 1 input vector after reduction: %d \n" % len(self.reduced.tolist()[0]))
        print("\nProjection axes:\n")
        for i, axis in enumerate(self.reducer.components_.tolist()):
            print("Axis %d:\n" % i, axis)
        self.compute_plane_variance()

    def reduce_crossvalidation_set(self, X_train, X_test):
        self.reducer.fit(X_train)
        # Transform with the fitted reducer; the original mistakenly called
        # the scaler here, returning unreduced data.
        reduced_X_train = self.reducer.transform(X_train)
        reduced_X_test = self.reducer.transform(X_test)
        return reduced_X_train, reduced_X_test
# Load the 20 newsgroups dataset.
# Select only the sci.crypt category; other categories include
# sci.med, sci.space, and soc.religion.christian.
cat = ['sci.crypt']
data = fetch_20newsgroups(categories=cat)

# Create a term-document matrix with term frequencies as the values
# from the above dataset.
vectorizer = TfidfVectorizer(use_idf=False)
vector = vectorizer.fit_transform(data.data)

# Perform the projection. In this case we reduce the dimension to 1000.
gauss_proj = GaussianRandomProjection(n_components=1000)
gauss_proj.fit(vector)
# Transform the original data to the new space.
vector_t = gauss_proj.transform(vector)

# Print the original and transformed shapes.
print(vector.shape)
print(vector_t.shape)

# To validate that the transformation has preserved distances, we calculate
# the old and the new pairwise distances between the points.
org_dist = euclidean_distances(vector)
red_dist = euclidean_distances(vector_t)
diff_dist = abs(org_dist - red_dist)
# We take the difference between these points and plot them as a heatmap
# (only the first 1000 documents).
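# The heatmap code the final comment describes is missing from the snippet.
# A minimal sketch of that plot, assuming diff_dist from above and matplotlib:
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 8))
# Distance distortion between the original and projected spaces,
# restricted to the first 1000 documents.
plt.pcolor(diff_dist[0:1000, 0:1000])
plt.colorbar()
plt.show()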