processor.latext_start_figure()

X_train, X_test, y_train, y_test, _ = dataset.get_data(model='KMeans')
n_clusters = len(dataset.label_encoder.classes_)

# Pick the number of components that retains 95% of the variance
pca = PCA(n_components=0.95)
pca.fit(X_train)
n_components = pca.components_.shape[0]
print(f"n_components: {n_components}")

dr_models = [
    PCA(n_components=n_components, random_state=0),
    FastICA(n_components=n_components, random_state=0),
    MiniBatchDictionaryLearning(n_components=n_components, alpha=1,
                                batch_size=200, n_iter=10, random_state=0),
    SparseRandomProjection(random_state=0, n_components=n_components)
]
clustering_models = [
    KMeans(n_clusters=n_clusters, init='k-means++', n_init=10, max_iter=600,
           random_state=0, tol=0.0001),
    GaussianMixture(n_components=n_clusters, n_init=10, max_iter=600,
                    random_state=0, tol=0.0001)
]
for pca in dr_models:
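    # --- Hedged sketch: the original loop body is not shown above. The lines
    # below are one plausible continuation, not the author's code: reduce the
    # data with each model, then fit both clustering models on the result.
    # (Note the loop variable `pca` shadows the PCA instance fitted earlier.)
    from sklearn.metrics import adjusted_rand_score  # assumed evaluation metric
    X_train_reduced = pca.fit_transform(X_train)
    for clustering_model in clustering_models:
        cluster_labels = clustering_model.fit_predict(X_train_reduced)
        print(type(pca).__name__, type(clustering_model).__name__,
              adjusted_rand_score(y_train, cluster_labels))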
split = train_test_split(X, y, test_size=0.33, random_state=42)
# digits = datasets.load_digits()
# split = train_test_split(digits.data, digits.target, test_size=0.3,
#                          random_state=42)
(trainData, testData, trainTarget, testTarget) = split

model = LinearSVC()
model.fit(trainData, trainTarget)
baseline = metrics.accuracy_score(model.predict(testData), testTarget)

# loop over the projection sizes
for comp in components:
    # create the random projection
    sp = SparseRandomProjection(n_components=comp)
    X_new = sp.fit_transform(trainData)

    # train a classifier on the sparse random projection
    model = LinearSVC()
    model.fit(X_new, trainTarget)

    # evaluate the model and update the list of accuracies
    test = sp.transform(testData)
    accuracies.append(metrics.accuracy_score(model.predict(test), testTarget))

# create the figure
plt.figure()
plt.suptitle("Accuracy of Sparse Projection on Digits")
plt.xlabel("# of Components")
plt.ylabel("Accuracy")
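# The figure setup above is cut off before the actual plotting calls. A minimal
# sketch of the usual continuation (an assumption, not the original code): plot
# accuracy against the number of components and mark the full-dimensional
# baseline for reference.
plt.plot(components, accuracies)
plt.plot(components, [baseline] * len(components), color="r", linestyle="--")
plt.show()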
kpca2_results_train = kpca.fit_transform(train.drop(["y"], axis=1)) kpca2_results_test = kpca.transform(test) # ICA ica = FastICA(n_components=n_comp, random_state=420) ica2_results_train = ica.fit_transform(train.drop(["y"], axis=1)) ica2_results_test = ica.transform(test) # GRP grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420) grp_results_train = grp.fit_transform(train.drop(["y"], axis=1)) grp_results_test = grp.transform(test) # SRP srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420) srp_results_train = srp.fit_transform(train.drop(["y"], axis=1)) srp_results_test = srp.transform(test) # save columns list before adding the decomposition components usable_columns = list(set(train.columns) - set(['y'])) # Append decomposition components to datasets for i in range(1, n_comp + 1): # train['pca_' + str(i)] = pca2_results_train[:, i - 1] # test['pca_' + str(i)] = pca2_results_test[:, i - 1] train['spca_' + str(i)] = spca2_results_train[:, i - 1] test['spca_' + str(i)] = spca2_results_test[:, i - 1]
madelon = pd.read_hdf('./BASE/datasets.hdf','madelon') madelonX = madelon.drop('Class',1).copy().values madelonY = madelon['Class'].copy().values madelonX = StandardScaler().fit_transform(madelonX) digitsX= StandardScaler().fit_transform(digitsX) clusters = [2,5,10,15,20,25,30,35,40] dims = [2,5,10,15,20,25,30,35,40,45,50,55,60] #raise #%% data for 1 tmp = defaultdict(dict) for i,dim in product(range(10),dims): rp = SparseRandomProjection(random_state=i, n_components=dim) tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(madelonX), madelonX) tmp =pd.DataFrame(tmp).T tmp.to_csv(out+'madelon scree1.csv') tmp = defaultdict(dict) for i,dim in product(range(10),dims): rp = SparseRandomProjection(random_state=i, n_components=dim) tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(digitsX), digitsX) tmp =pd.DataFrame(tmp).T tmp.to_csv(out+'digits scree1.csv') tmp = defaultdict(dict) for i,dim in product(range(10),dims):
def johnson_lindenstrauss(data, data_name): # `normed` is being deprecated in favor of `density` in histograms if LooseVersion(matplotlib.__version__) >= '2.1': density_param = {'density': True} else: density_param = {'normed': True} # Part 1: plot the theoretical dependency between n_components_min and # n_samples # range of admissible distortions eps_range = np.linspace(0.1, 0.99, 5) colors = plt.cm.Blues(np.linspace(0.3, 1.0, len(eps_range))) # range of number of samples (observation) to embed n_samples_range = np.logspace(1, 9, 9) plt.figure() for eps, color in zip(eps_range, colors): min_n_components = johnson_lindenstrauss_min_dim(n_samples_range, eps=eps) plt.loglog(n_samples_range, min_n_components, color=color) plt.legend(["eps = %0.1f" % eps for eps in eps_range], loc="lower right") plt.xlabel("Number of observations to eps-embed") plt.ylabel("Minimum number of dimensions") plt.title("Johnson-Lindenstrauss bounds:\nn_samples vs n_components") plt.savefig('Figs/02b_rp_comp_samples') # range of admissible distortions eps_range = np.linspace(0.01, 0.99, 100) n_samples_range = np.logspace(2, 6, 5) colors = plt.cm.Blues(np.linspace(0.3, 1.0, len(n_samples_range))) plt.figure() for n_samples, color in zip(n_samples_range, colors): min_n_components = johnson_lindenstrauss_min_dim(n_samples, eps=eps_range) plt.semilogy(eps_range, min_n_components, color=color) plt.legend(["n_samples = %d" % n for n in n_samples_range], loc="upper right") plt.xlabel("Distortion eps") plt.ylabel("Minimum number of dimensions") plt.title("Johnson-Lindenstrauss bounds:\nn_components vs eps") plt.savefig('Figs/02b_rp_comp_eps') # Part 2: perform sparse random projection of some digits images which are # quite low dimensional and dense or documents of the 20 newsgroups dataset # which is both high dimensional and sparse n_samples, n_features = data.shape print("Embedding %d samples with dim %d using various random projections" % (n_samples, n_features)) n_components_range = np.array([1,10,100,1000]) dists = euclidean_distances(data, squared=True).ravel() # select only non-identical samples pairs nonzero = dists != 0 dists = dists[nonzero] for n_components in n_components_range: t0 = time() rp = SparseRandomProjection(n_components=n_components) projected_data = rp.fit_transform(data) print("Projected %d samples from %d to %d in %0.3fs" % (n_samples, n_features, n_components, time() - t0)) if hasattr(rp, 'components_'): n_bytes = rp.components_.data.nbytes n_bytes += rp.components_.indices.nbytes print("Random matrix with size: %0.3fMB" % (n_bytes / 1e6)) projected_dists = euclidean_distances( projected_data, squared=True).ravel()[nonzero] plt.figure() plt.hexbin(dists, projected_dists, gridsize=100, cmap=plt.cm.PuBu) plt.xlabel("Pairwise squared distances in original space") plt.ylabel("Pairwise squared distances in projected space") plt.title("Pairwise distances distribution for n_components=%d" % n_components) cb = plt.colorbar() cb.set_label('Sample pairs counts') rates = projected_dists / dists print("Mean distances rate: %0.2f (%0.2f)" % (np.mean(rates), np.std(rates))) plt.savefig('Figs/02b_rp_pwdist_{}_{}'.format(data_name, n_components)) plt.figure() plt.hist(rates, bins=50, range=(0., 2.), edgecolor='k', **density_param) plt.xlabel("Squared distances rate: projected / original") plt.ylabel("Distribution of samples pairs") plt.title("Histogram of pairwise distance rates for n_components=%d" % n_components) plt.savefig('Figs/02b_rp_histogram_{}_{}'.format(data_name, n_components)) plt.clf()
clustering_results = construct_iterative_run( clustering_method=clustering_method, dim_reduce=FastICA(**params.get('ica')), data=data) filename = '{dataset_name}_{step}_{clustering_method}_{reduce_method}.pkl' save_data(clustering_results, path='results/', filename=filename.format(dataset_name=dataset_name, step='clustering_on_reduced', clustering_method=clustering_method, reduce_method='ica')) # RCA print('start RCA:') clustering_results = construct_iterative_run( clustering_method=clustering_method, dim_reduce=SparseRandomProjection(**params.get('sparse_rca')), data=data) filename = '{dataset_name}_{step}_{clustering_method}_{reduce_method}.pkl' save_data(clustering_results, path='results/', filename=filename.format(dataset_name=dataset_name, step='clustering_on_reduced', clustering_method=clustering_method, reduce_method='rca')) # DT ft importance: print('start DT:') path = 'results/' filename = '{dataset_name}_{step}_{method}.pkl' rfc = load_data(path=path, filename=filename.format(dataset_name=dataset_name, step='dim_reduction',
scatterPlot(X_train_GRP, y_train, "Gaussian Random Projection") # In[ ]: # Sparse Random Projection from sklearn.random_projection import SparseRandomProjection n_components = 'auto' density = 'auto' eps = 0.5 dense_output = False random_state = 2018 SRP = SparseRandomProjection(n_components=n_components, density=density, eps=eps, dense_output=dense_output, random_state=random_state) X_train_SRP = SRP.fit_transform(X_train) X_train_SRP = pd.DataFrame(data=X_train_SRP, index=train_index) X_validation_SRP = SRP.transform(X_validation) X_validation_SRP = pd.DataFrame(data=X_validation_SRP, index=validation_index) scatterPlot(X_train_SRP, y_train, "Sparse Random Projection") # In[ ]: # Isomap
def DecomposedFeatures(train, test, total, addtrain, addtest,
                       use_pca=0.0, use_tsvd=0.0, use_ica=0.0, use_fa=0.0,
                       use_grp=0.0, use_srp=0.0, use_pls=0.0):
    print("\nStart decomposition process...")
    train_decomposed = [addtrain]
    test_decomposed = [addtest]

    if use_pca > 0.0:
        print("PCA")
        N_COMP = int(use_pca * train.shape[1]) + 1
        pca = PCA(n_components=N_COMP, whiten=True, svd_solver="full",
                  random_state=42)
        pca.fit(total)
        pca_results_train = pca.transform(train)
        pca_results_test = pca.transform(test)
        # list.append() mutates in place and returns None, so do not reassign
        train_decomposed.append(pca_results_train)
        test_decomposed.append(pca_results_test)

    if use_tsvd > 0.0:
        print("tSVD")
        N_COMP = int(use_tsvd * train.shape[1]) + 1
        tsvd = TruncatedSVD(n_components=N_COMP, random_state=42)
        tsvd.fit(total)
        tsvd_results_train = tsvd.transform(train)
        tsvd_results_test = tsvd.transform(test)
        train_decomposed.append(tsvd_results_train)
        test_decomposed.append(tsvd_results_test)

    if use_ica > 0.0:
        print("ICA")
        N_COMP = int(use_ica * train.shape[1]) + 1
        ica = FastICA(n_components=N_COMP, random_state=42)
        ica.fit(total)
        ica_results_train = ica.transform(train)
        ica_results_test = ica.transform(test)
        train_decomposed.append(ica_results_train)
        test_decomposed.append(ica_results_test)

    if use_fa > 0.0:
        print("FA")
        N_COMP = int(use_fa * train.shape[1]) + 1
        fa = FactorAnalysis(n_components=N_COMP, random_state=42)
        fa.fit(total)
        fa_results_train = fa.transform(train)
        fa_results_test = fa.transform(test)
        train_decomposed.append(fa_results_train)
        test_decomposed.append(fa_results_test)

    if use_grp > 0.0:
        print("GRP")
        N_COMP = int(use_grp * train.shape[1]) + 1
        grp = GaussianRandomProjection(n_components=N_COMP, eps=0.1,
                                       random_state=42)
        grp.fit(total)
        grp_results_train = grp.transform(train)
        grp_results_test = grp.transform(test)
        train_decomposed.append(grp_results_train)
        test_decomposed.append(grp_results_test)

    if use_srp > 0.0:
        print("SRP")
        N_COMP = int(use_srp * train.shape[1]) + 1
        srp = SparseRandomProjection(n_components=N_COMP, dense_output=True,
                                     random_state=42)
        srp.fit(total)
        srp_results_train = srp.transform(train)
        srp_results_test = srp.transform(test)
        train_decomposed.append(srp_results_train)
        test_decomposed.append(srp_results_test)

    if use_pls > 0.0:
        print("PLS")
        # N_COMP = int(use_pls * train.shape[1]) + 1
        # pls = PLSCanonical(n_components=N_COMP)
        # pls.fit(total)
        # pls_results_train = pls.transform(train)
        # pls_results_test = pls.transform(test)
        # train_decomposed = np.concatenate([pls_results_train, train_decomposed], axis=1)
        # test_decomposed = np.concatenate([pls_results_test, test_decomposed], axis=1)

    print("Append decomposition components together...")
    train_decomposed = np.concatenate(train_decomposed, axis=1)
    test_decomposed = np.concatenate(test_decomposed, axis=1)

    train_with_only_decomposed_features = pd.DataFrame(train_decomposed)
    test_with_only_decomposed_features = pd.DataFrame(test_decomposed)

    # for agg_col in ['sum', 'var', 'mean', 'median', 'std', 'weight_count',
    #                 'count_non_0', 'num_different', 'max', 'min']:
    #     train_with_only_decomposed_features[agg_col] = train[agg_col]
    #     test_with_only_decomposed_features[agg_col] = test[agg_col]

    # Remove any NA
    train_with_only_decomposed_features = train_with_only_decomposed_features.fillna(0)
    test_with_only_decomposed_features = test_with_only_decomposed_features.fillna(0)

    return train_with_only_decomposed_features, test_with_only_decomposed_features
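# Example call (a sketch under assumed inputs, not from the original script):
# `total` is train and test stacked together, `addtrain`/`addtest` are extra
# feature blocks kept alongside the new components, and each use_* value is the
# fraction of the original feature count to keep for that method.
train_feats, test_feats = DecomposedFeatures(
    train, test, total,
    addtrain=train.values, addtest=test.values,
    use_pca=0.1, use_srp=0.1)  # e.g. keep ~10% of the dimensions for each method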
# # 4. Decomposition Feature # So far I've only looked at PCA components, but most kernels look at several decomposition methods, so it may be interesting to look at t-SNE of these 10-50 components of each method instead of 1000 PCA components. Furthermore, it's interesting to see how well we can classify test/train based on this reduced feature space. # # # In[ ]: COMPONENTS = 20 # List of decomposition methods to use methods = [ TruncatedSVD(n_components=COMPONENTS), PCA(n_components=COMPONENTS), FastICA(n_components=COMPONENTS), GaussianRandomProjection(n_components=COMPONENTS, eps=0.1), SparseRandomProjection(n_components=COMPONENTS, dense_output=True) ] # Run all the methods embeddings = [] for method in methods: name = method.__class__.__name__ embeddings.append( pd.DataFrame(method.fit_transform(total_df), columns=[f"{name}_{i}" for i in range(COMPONENTS)])) print(f">> Ran {name}") # Put all components into one dataframe components_df = pd.concat(embeddings, axis=1) # Prepare plot
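# The notebook cell above is cut off at "# Prepare plot". A hedged sketch of the
# idea described in the markdown (an assumption, not the original notebook code):
# run t-SNE on the concatenated components and colour points by train/test origin.
# `train_df` (the training rows of `total_df`) is assumed from earlier cells.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

tsne_xy = TSNE(n_components=2, random_state=0).fit_transform(components_df)
is_train = np.arange(len(components_df)) < len(train_df)  # assumed train/test indicator
plt.figure(figsize=(8, 6))
plt.scatter(tsne_xy[:, 0], tsne_xy[:, 1], c=is_train, s=3, cmap='coolwarm')
plt.title("t-SNE of decomposition components (train vs test)")
plt.show()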
from sklearn.neural_network import MLPClassifier
from dataTransformer import *
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from data import MNIST
from sklearn.metrics import accuracy_score
from time import time
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection

if __name__ == "__main__":
    mnist = MNIST(10000)
    start = time()
    # The reducer here is a sparse random projection, so the pipeline step is
    # named 'RP' rather than 'PCA'.
    pipeline = Pipeline([('Scale', StandardScaler()),
                         ('RP', SparseRandomProjection(random_state=0, n_components=160)),
                         ('MLP', MLPClassifier(hidden_layer_sizes=(512, 256),
                                               alpha=0.01, verbose=1))])
    pipeline.fit(mnist.X_train, mnist.y_train)
    y_pred = pipeline.predict(mnist.X_test)
    end = time()
    print("time used: {}s".format(end - start))
    print(accuracy_score(y_pred, mnist.y_test))
    # MLPClassifier(hidden_layer_sizes=(512, 256), alpha=0.01)
def use_decomposed_features_as_new_df(train, test, total, n_components,
                                      use_pca=False, use_tsvd=False, use_ica=False,
                                      use_fa=False, use_grp=False, use_srp=False):
    N_COMP = n_components
    ntrain = len(train)
    print("\nStart decomposition process...")

    if use_pca:
        print("PCA")
        pca = PCA(n_components=N_COMP, random_state=42)
        pca_results = pca.fit_transform(total)
        pca_results_train = pca_results[:ntrain]
        pca_results_test = pca_results[ntrain:]

    if use_tsvd:
        print("tSVD")
        tsvd = TruncatedSVD(n_components=N_COMP, random_state=42)
        tsvd_results = tsvd.fit_transform(total)
        tsvd_results_train = tsvd_results[:ntrain]
        tsvd_results_test = tsvd_results[ntrain:]

    if use_ica:
        print("ICA")
        ica = FastICA(n_components=N_COMP, random_state=42)
        ica_results = ica.fit_transform(total)
        ica_results_train = ica_results[:ntrain]
        ica_results_test = ica_results[ntrain:]

    if use_fa:
        print("FA")
        fa = FactorAnalysis(n_components=N_COMP, random_state=42)
        fa_results = fa.fit_transform(total)
        fa_results_train = fa_results[:ntrain]
        fa_results_test = fa_results[ntrain:]

    if use_grp:
        print("GRP")
        grp = GaussianRandomProjection(n_components=N_COMP, eps=0.1, random_state=42)
        grp_results = grp.fit_transform(total)
        grp_results_train = grp_results[:ntrain]
        grp_results_test = grp_results[ntrain:]

    if use_srp:
        print("SRP")
        srp = SparseRandomProjection(n_components=N_COMP, dense_output=True,
                                     random_state=42)
        srp_results = srp.fit_transform(total)
        srp_results_train = srp_results[:ntrain]
        srp_results_test = srp_results[ntrain:]

    print("Append decomposition components together...")
    # Note: the concatenation below assumes use_pca, use_tsvd, use_ica, use_grp
    # and use_srp were all enabled; otherwise the corresponding arrays are undefined.
    train_decomposed = np.concatenate([
        srp_results_train, grp_results_train, ica_results_train,
        pca_results_train, tsvd_results_train
    ], axis=1)
    test_decomposed = np.concatenate([
        srp_results_test, grp_results_test, ica_results_test,
        pca_results_test, tsvd_results_test
    ], axis=1)

    train_with_only_decomposed_features = pd.DataFrame(train_decomposed)
    test_with_only_decomposed_features = pd.DataFrame(test_decomposed)

    # carry the precomputed aggregate columns over from the original frames
    for agg_col in ['sum', 'var', 'mean', 'median', 'std', 'weight_count',
                    'count_non_0', 'num_different', 'max', 'min']:
        train_with_only_decomposed_features[agg_col] = train[agg_col]
        test_with_only_decomposed_features[agg_col] = test[agg_col]

    # Remove any NA
    train_with_only_decomposed_features = train_with_only_decomposed_features.fillna(0)
    test_with_only_decomposed_features = test_with_only_decomposed_features.fillna(0)

    return train_with_only_decomposed_features, test_with_only_decomposed_features
def main(): runit = 1 if runit: run = assignment4() run.read_data_voice('voice.csv') run.dataSetName = 'Voice' run.split_data_to_train_test(testSize=0.3) dataX = StandardScaler().fit_transform(run.allFeatures) ''' run.PCA() run.ICA() run.RP() ''' run.TSVD() run.k_mean_cluster() run.expectation_maximization() pcaCom = 15 icaCom = 15 rpCom = 15 tsvdCom = 15 k = 2 reducedDataPCA = PCA(n_components=pcaCom, random_state=5).fit_transform(dataX) run.k_mean_cluster_reduced(k, reducedDataPCA, 'PCA') run.expectation_maximization_reduced(k, reducedDataPCA, 'PCA') reducedDataICA = FastICA(n_components=icaCom, random_state=5).fit_transform(dataX) run.k_mean_cluster_reduced(k, reducedDataICA, 'ICA') run.expectation_maximization_reduced(k, reducedDataICA, 'ICA') reducedDataRP = SparseRandomProjection( n_components=rpCom, random_state=5).fit_transform(dataX) run.k_mean_cluster_reduced(k, reducedDataRP, 'RP') run.expectation_maximization_reduced(k, reducedDataRP, 'RP') reducedDataTSVD = TruncatedSVD( random_state=5, n_components=tsvdCom).fit_transform(dataX) run.k_mean_cluster_reduced(k, reducedDataTSVD, 'TSVD') run.expectation_maximization_reduced(k, reducedDataTSVD, 'TSVD') run_hapt = assignment4() run_hapt.read_data_haptX('HAPT_X.csv') run_hapt.read_data_haptY('HAPT_Y.csv') run_hapt.dataSetName = 'HAPT' dataX = StandardScaler().fit_transform(run_hapt.allFeatures) run_hapt.kNum = range(1, 20, 5) run_hapt.pcaDims = range(1, 561, 25) run_hapt.icaDims = range(1, 561, 25) run_hapt.rpDims = range(1, 561, 25) run_hapt.tvsdDims = range(1, 561, 25) #run_hapt.k_mean_cluster() run_hapt.expectation_maximization() run_hapt.PCA() run_hapt.ICA() run_hapt.RP() run_hapt.TSVD() pcaCom = 15 icaCom = 15 rpCom = 15 tsvdCom = 15 k = 2 reducedDataPCA = PCA(n_components=pcaCom, random_state=5).fit_transform(dataX) run_hapt.k_mean_cluster_reduced(k, reducedDataPCA, 'PCA') run_hapt.expectation_maximization_reduced(k, reducedDataPCA, 'PCA') reducedDataICA = FastICA(n_components=icaCom, random_state=5).fit_transform(dataX) run_hapt.k_mean_cluster_reduced(k, reducedDataICA, 'ICA') run_hapt.expectation_maximization_reduced(k, reducedDataICA, 'ICA') reducedDataRP = SparseRandomProjection(n_components=rpCom, random_state=5).fit_transform(dataX) run_hapt.k_mean_cluster_reduced(k, reducedDataRP, 'RP') run_hapt.expectation_maximization_reduced(k, reducedDataRP, 'RP') reducedDataTSVD = TruncatedSVD(random_state=5, n_components=tsvdCom).fit_transform(dataX) run_hapt.k_mean_cluster_reduced(k, reducedDataTSVD, 'TSVD') run_hapt.expectation_maximization_reduced(k, reducedDataTSVD, 'TSVD') print("All done") plt.show()
kurtosis = collections.defaultdict(list) for i in range(1, num_components + 1): kurtosis['num components'].append(i) ica = FastICA(n_components=i) ica_transformed_data = ica.fit_transform(X_default_train) kurtosis['avg kurtosis'].append( pd.DataFrame(data=ica_transformed_data).kurt(axis=0).abs().mean()) kurtosis_df = pd.DataFrame(data=kurtosis) kurtosis_df.to_csv('default_avg_kurtosis.csv') num_components = 16 rp_stats = collections.defaultdict(list) for i in range(1, num_components): rp_stats['num components'].append(i) rp = SparseRandomProjection(n_components=i) nnm = MLPClassifier() rp_nnm = Pipeline([('rp', rp), ('nnm', nnm)]) rp_nnm.fit(X_digits_train, y_digits_train) accuracy_score = metrics.accuracy_score(rp_nnm.predict(X_digits_test), y_digits_test) rp_stats['accuracy score'].append(accuracy_score) rp_df = pd.DataFrame(data=rp_stats) rp_df.to_csv('digits_rp_data.csv') num_components = 23 rp_stats = collections.defaultdict(list) for i in range(1, num_components): rp_stats['num components'].append(i) rp = SparseRandomProjection(n_components=i) nnm = MLPClassifier()
## NEURAL NETWORK
from sklearn.neural_network import MLPClassifier
## LEARNING CURVE PLOT
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in recent versions

X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=100)

### PCA
## pca = PCA(n_components=3).fit(X_train)
## X_train = pca.transform(X_train)
## X_test = pca.transform(X_test)

#### RP: fit the projection on the training data only, then apply the same
#### projection to the test data so both live in the same reduced space
rp = SparseRandomProjection(n_components=3)
X_train = rp.fit_transform(X_train)
X_test = rp.transform(X_test)

mlp = MLPClassifier(activation='logistic', solver='adam', max_iter=260)
mlp.fit(X_train, y_train)
nn_pred = mlp.predict(X_test)


def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1,
                        train_sizes=np.linspace(.1, 1.0, 5)):
    """Generate a simple plot of the test and training learning curve."""
    plt.figure()
    plt.title(title)
    if ylim is not None:
target_names = dataset.target_names print(target_names) print(dataset.images.shape) print(dataset.data.shape) print(dataset.target.shape) print(H * W) from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1) from sklearn.random_projection import SparseRandomProjection n_components = 80 decomposer = SparseRandomProjection(n_components=n_components).fit(X_train) X_train_d = decomposer.transform(X_train) X_test_d = decomposer.transform(X_test) from sklearn.neural_network import MLPClassifier model = MLPClassifier(hidden_layer_sizes=(1024, ), batch_size=256, verbose=True, early_stopping=True) model.fit(X_train_d, y_train) y_pred = model.predict(X_test_d) from sklearn.metrics import classification_report print(classification_report(y_test, y_pred, target_names=target_names))
def main(): global global_gen_data global total_length with open('feature_select_list.pkl', 'r') as f: feature_select_list = pickle.load(f) #pdb.set_trace() cores = multiprocessing.cpu_count() #21 for file_number in xrange(1): with open('../order_100_data/order_data_chunk_' + str(file_number), 'r') as f: file_list = f.readlines() print('read done:' + str(file_number)) get_all_label(file_list) # cores = multiprocessing.cpu_count() # pool = multiprocessing.Pool(processes=(cores-2)) #pdb.set_trace() #print('length: ',len(all_label_result['usercategories'])) cut_num = 2000 control_feature_length(cut_num) #save_pickle(all_label_result,'all_label.pkl') #pdb.set_trace() for feature in total_list: enc, one_hot = get_all_onehot(feature, list(all_label_result[feature])) all_label_encoder[feature].extend([enc, one_hot]) # rewards = [] # items_id = [] # uin = [] # for file_number in range(2,16): # with open('../order_100_event_data/order_data_id_label_chunk_' + str(file_number), 'r') as f: # file_list = f.readlines() # #pdb.set_trace() # for line in file_list: # line_list = line.split('\t') # #if len(line_list) < 3: # #print(line_list) # rewards.append(line_list[1]) # items_id.append(line_list[0]) # uin.append(line_list[2].strip('\n')) for line in cross_lines: cross_feat = line.strip().split() feat_a = cross_feat[0] feat_b = cross_feat[1] total_length += (feature_length_result[feat_a] * feature_length_result[feat_b]) srp = SparseRandomProjection(n_components=1000) print('total_d_length', total_length) for file_number in xrange(0, 4): rewards = [] items_id = [] uin = [] with open( '../order_new_pool_data/order_data_id_label_chunk_' + str(file_number), 'r') as f: file_list = f.readlines() #pdb.set_trace() for line in file_list: line_list = line.split('\t') #if len(line_list) < 3: #print(line_list) rewards.append(line_list[1]) items_id.append(line_list[0]) uin.append(line_list[2].strip('\n')) with open( '../order_new_pool_data/order_data_chunk_' + str(file_number), 'r') as f: file_list = f.readlines() #pdb.set_trace() gen_data = generate_key_value_data(file_list) with open('../order_new_pool_data/length_chunk_' + str(file_number), 'r') as f: cut_pool_list = pickle.load(f) #gen_data = gen_data[0:100] print('start file: ' + str(file_number)) print('number chunk', len(cut_pool_list) / 4000) chunk_file_number = len(cut_pool_list) / 4000 pdb.set_trace() cut_start_flag = 0 for block_num in range(chunk_file_number): print('-------------------------------') print('strat block: ' + str(block_num + 1)) cut_pool = cut_pool_list[block_num * 4000:(block_num + 1) * 4000] cut_end = sum(cut_pool) print('chunk_range: ', cut_start_flag, cut_end + cut_start_flag) data_todeal = gen_data[cut_start_flag:(cut_end + cut_start_flag)] rewards_todeal = rewards[cut_start_flag:(cut_end + cut_start_flag)] items_todeal = items_id[cut_start_flag:(cut_end + cut_start_flag)] uin_todeal = uin[cut_start_flag:(cut_end + cut_start_flag)] cut_start_flag += cut_end pdb.set_trace()
def _projector(self): return SparseRandomProjection( n_components=self.num_components, density=self.density, eps=self.eps, dense_output=True)
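# Usage sketch (hypothetical, since the enclosing class is not shown here): the
# method only builds a configured projector, which the caller then fits.
#
#     projector = reducer._projector()        # `reducer`: instance of the enclosing class
#     X_reduced = projector.fit_transform(X)  # dense ndarray, because dense_output=True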
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.random_projection import SparseRandomProjection
# `em` and `kmtc` are local helper modules (their imports are not shown here)

breast_cancer = pd.read_csv("./breast-cancer-wisconsin.csv")
li = list(breast_cancer)
breast_cancer = pd.DataFrame(breast_cancer.values, columns=li)
Class = li[-1]
arr = breast_cancer.values
y = arr[:, -1]
X = arr[:, 0:-1]
clusters = range(2, 15)

sp = SparseRandomProjection(n_components=4)
output = sp.fit_transform(X)

tester = em.ExpectationMaximizationTestCluster(output, y, clusters=range(2, 15),
                                               plot=False, stats=True)
silhouette_EM, vmeasure_scores = tester.run()

tester = kmtc.KMeansTestCluster(output, y, clusters=range(2, 15),
                                plot=False, stats=True)
silhouette_kmeans, V_measure = tester.run()
def perform_feature_engineering(train, test, config): for c in train.columns: if len(train[c].value_counts()) == 2: if train[c].mean() < config['SparseThreshold']: del train[c] del test[c] col = list(test.columns) if config['ID'] != True: col.remove('ID') # tSVD if config['tSVD'] == True: tsvd = TruncatedSVD(n_components=config['n_comp']) tsvd_results_train = tsvd.fit_transform(train[col]) tsvd_results_test = tsvd.transform(test[col]) for i in range(1, config['n_comp'] + 1): train['tsvd_' + str(i)] = tsvd_results_train[:, i - 1] test['tsvd_' + str(i)] = tsvd_results_test[:, i - 1] # PCA if config['PCA'] == True: pca = PCA(n_components=config['n_comp']) pca2_results_train = pca.fit_transform(train[col]) pca2_results_test = pca.transform(test[col]) for i in range(1, config['n_comp'] + 1): train['pca_' + str(i)] = pca2_results_train[:, i - 1] test['pca_' + str(i)] = pca2_results_test[:, i - 1] # ICA if config['ICA'] == True: ica = FastICA(n_components=config['n_comp']) ica2_results_train = ica.fit_transform(train[col]) ica2_results_test = ica.transform(test[col]) for i in range(1, config['n_comp'] + 1): train['ica_' + str(i)] = ica2_results_train[:, i - 1] test['ica_' + str(i)] = ica2_results_test[:, i - 1] # GRP if config['GRP'] == True: grp = GaussianRandomProjection(n_components=config['n_comp'], eps=0.1) grp_results_train = grp.fit_transform(train[col]) grp_results_test = grp.transform(test[col]) for i in range(1, config['n_comp'] + 1): train['grp_' + str(i)] = grp_results_train[:, i - 1] test['grp_' + str(i)] = grp_results_test[:, i - 1] # SRP if config['SRP'] == True: srp = SparseRandomProjection(n_components=config['n_comp'], dense_output=True, random_state=420) srp_results_train = srp.fit_transform(train[col]) srp_results_test = srp.transform(test[col]) for i in range(1, config['n_comp'] + 1): train['srp_' + str(i)] = srp_results_train[:, i - 1] test['srp_' + str(i)] = srp_results_test[:, i - 1] if config['magic'] == True: magic_mat = train[['ID', 'X0', 'y']] magic_mat = magic_mat.groupby(['X0'])['y'].mean() magic_mat = pd.DataFrame({ 'X0': magic_mat.index, 'magic': list(magic_mat) }) mean_magic = magic_mat['magic'].mean() train = train.merge(magic_mat, on='X0', how='left') test = test.merge(magic_mat, on='X0', how='left') test['magic'] = test['magic'].fillna(mean_magic) return train, test
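# Example configuration for perform_feature_engineering, reconstructed from the
# keys the function reads above (the values here are illustrative assumptions):
config = {
    'SparseThreshold': 0.01,  # drop near-constant binary columns below this mean
    'ID': True,               # keep the 'ID' column in the feature list
    'n_comp': 12,             # components per decomposition method
    'tSVD': True,
    'PCA': True,
    'ICA': True,
    'GRP': True,
    'SRP': True,
    'magic': False,           # target-mean encoding of X0 (needs 'X0' and 'y')
}
train, test = perform_feature_engineering(train, test, config)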
def optimize_embedding(data_matrix, known_targets=None, min_feature_ratio=.1, n_iter=30, n_repetitions=1): # case for sparse data matrix: use random projection to transform to dense if sp.issparse(data_matrix): logger.info('Convert sparse to dense') logger.info('Data matrix: %d rows %d cols' % (data_matrix.shape[0], data_matrix.shape[1])) from sklearn.random_projection import SparseRandomProjection data_matrix = SparseRandomProjection().fit_transform( data_matrix).toarray() logger.info('Data matrix: %d rows %d cols' % (data_matrix.shape[0], data_matrix.shape[1])) if known_targets is not None: logger.info('Feature selection') logger.info('Data matrix: %d rows %d cols' % (data_matrix.shape[0], data_matrix.shape[1])) new_data_matrix = iterated_semi_supervised_feature_selection( data_matrix, known_targets, min_feature_ratio=min_feature_ratio) if new_data_matrix.shape[1] > 2: data_matrix = new_data_matrix logger.info('Data matrix: %d rows %d cols' % (data_matrix.shape[0], data_matrix.shape[1])) n_instances = data_matrix.shape[0] opts_list = make_opts_list(n_instances, n_iter) # iterate n_iter times to find best parameter configuration best_score = 0 logger.debug('neqs = neighborhood embedding quality score') for i in range(n_iter): random.seed(i) # sample from the options embed_opts = make_embed_opts(opts_list, n_instances) basis_opts = make_basis_opts(opts_list, n_instances) general_opts = make_general_opts() try: # find options with max quality score score_list = [] for it in range(n_repetitions): data_matrix_lowdim,\ link_ids,\ score,\ scores = embed_(data_matrix, embed_opts=embed_opts, basis_opts=basis_opts, change_of_basis=general_opts['change_of_basis']) score_list.append(score) mean_reduced_score = np.mean(score_list) - np.std(score_list) if best_score == 0 or mean_reduced_score > best_score: # best_embed_opts = embed_opts # best_basis_opts = basis_opts # best_change_of_basis = change_of_basis best_data_matrix_lowdim = data_matrix_lowdim best_link_ids = link_ids best_scores = scores best_score = mean_reduced_score mark = '*' else: mark = '' logger.debug('..%.2d/%d neqs: %.3f (%.3f +- %.3f) %s' % (i + 1, n_iter, mean_reduced_score, np.mean(scores), np.std(scores), mark)) except Exception as e: logger.debug('Failed iteration: %s' % e) return best_data_matrix_lowdim, best_link_ids, best_score, best_scores
num_of_features = 6 correlated_features = np.abs(corr).sort_values(by=['Correlations']) descriptive_features = correlated_features.iloc[ len(correlated_features) - num_of_features:len(correlated_features)] new_features = data[descriptive_features.index.values] #%% # ------------------------------------------------------------------------------ # # Additional Visualization Models # ------------------------------------------------------------------------------ # #%% Principal Component Analysis # Since the data is already a PCA of an original data set we visualize the first two vectors. Visualize(features.to_numpy(), labels, "PCA") #%% Sparse Random Projection from sklearn.random_projection import SparseRandomProjection SRP = SparseRandomProjection(n_components=2, dense_output=True, random_state=rand_state) SRP_features = SRP.fit_transform(features) Visualize(SRP_features, labels, "SRP") #%% Gaussian Random Projection from sklearn.random_projection import GaussianRandomProjection GRP = GaussianRandomProjection(n_components=2, random_state=rand_state) GRP_features = GRP.fit_transform(features) Visualize(GRP_features, labels, "GRP") #%% Auto Encoder import torch import torch.nn as nn import torch.nn.functional as F import torch.optim as optim
# In[ ]: def distance_correlation(X1, X2): assert X1.shape[0] == X2.shape[0] return np.corrcoef( pairwise_distances(X1).ravel(), pairwise_distances(X2).ravel())[0, 1] # In[ ]: tmp = defaultdict(dict) for i, dim in product(range(10), dimensions): rp = SparseRandomProjection(random_state=i, n_components=dim) tmp[dim][i] = distance_correlation(rp.fit_transform(X_train), X_train) tmp = pd.DataFrame(tmp).T tmp.to_csv('./P2/IncomeRP_DistanceCorrelation.csv') # In[ ]: # Run Neural Networks rp = SparseRandomProjection(random_state=5) nn_results, clf = run_ann(dimensions, rp, X_train, Y_train) nn_results.to_csv('./P2/IncomeRP_ANN.csv') ## test score test_score = clf.score(X_test, Y_test) print("Test Accuracy = ", test_score)
def DecomposedFeatures(train, test, val, total, addtrain, addtest,
                       use_pca=0.0, use_tsvd=0.0, use_ica=0.0, use_fa=0.0,
                       use_grp=0.0, use_srp=0.0, use_KPCA=0.0, kernal="rbf"):
    print("\nStart decomposition process...")
    train_decomposed = []
    test_decomposed = []
    val_decomposed = []
    if addtrain is not None:
        train_decomposed = [addtrain]
        val_decomposed = [val]
    if addtest is not None:
        test_decomposed = [addtest]

    if use_pca > 0.0:
        print("PCA")
        N_COMP = int(use_pca * train.shape[1]) + 1
        pca = PCA(n_components=N_COMP, whiten=True, svd_solver="full", random_state=42)
        pca.fit(total)
        pca_results_train = pca.transform(train)
        pca_results_test = pca.transform(test)
        pca_results_val = pca.transform(val)
        train_decomposed.append(pca_results_train)
        test_decomposed.append(pca_results_test)
        val_decomposed.append(pca_results_val)

    if use_tsvd > 0.0:
        print("tSVD")
        N_COMP = int(use_tsvd * train.shape[1]) + 1
        tsvd = TruncatedSVD(n_components=N_COMP, random_state=42)
        tsvd.fit(total)
        tsvd_results_train = tsvd.transform(train)
        tsvd_results_test = tsvd.transform(test)
        tsvd_results_val = tsvd.transform(val)
        train_decomposed.append(tsvd_results_train)
        test_decomposed.append(tsvd_results_test)
        val_decomposed.append(tsvd_results_val)

    if use_ica > 0.0:
        print("ICA")
        N_COMP = int(use_ica * train.shape[1]) + 1
        ica = FastICA(n_components=N_COMP, random_state=42)
        ica.fit(total)
        ica_results_train = ica.transform(train)
        ica_results_test = ica.transform(test)
        ica_results_val = ica.transform(val)
        train_decomposed.append(ica_results_train)
        test_decomposed.append(ica_results_test)
        val_decomposed.append(ica_results_val)

    if use_fa > 0.0:
        print("FA")
        N_COMP = int(use_fa * train.shape[1]) + 1
        fa = FactorAnalysis(n_components=N_COMP, random_state=42)
        fa.fit(total)
        fa_results_train = fa.transform(train)
        fa_results_test = fa.transform(test)
        fa_results_val = fa.transform(val)
        train_decomposed.append(fa_results_train)
        test_decomposed.append(fa_results_test)
        val_decomposed.append(fa_results_val)

    if use_grp > 0.0 or use_grp < 0.0:
        print("GRP")
        # positive values: fraction of the original dimensionality;
        # negative values: let the JL bound pick n_components for eps = |use_grp|
        if use_grp > 0.0:
            N_COMP = int(use_grp * train.shape[1]) + 1
            eps = 10
        if use_grp < 0.0:
            N_COMP = "auto"
            eps = abs(use_grp)
        grp = GaussianRandomProjection(n_components=N_COMP, eps=eps, random_state=42)
        grp.fit(total)
        grp_results_train = grp.transform(train)
        grp_results_test = grp.transform(test)
        grp_results_val = grp.transform(val)
        train_decomposed.append(grp_results_train)
        test_decomposed.append(grp_results_test)
        val_decomposed.append(grp_results_val)

    if use_srp > 0.0:
        print("SRP")
        N_COMP = int(use_srp * train.shape[1]) + 1
        srp = SparseRandomProjection(n_components=N_COMP, dense_output=True, random_state=42)
        srp.fit(total)
        srp_results_train = srp.transform(train)
        srp_results_test = srp.transform(test)
        srp_results_val = srp.transform(val)
        train_decomposed.append(srp_results_train)
        test_decomposed.append(srp_results_test)
        val_decomposed.append(srp_results_val)

    if use_KPCA > 0.0:
        print("KPCA")
        N_COMP = int(use_KPCA * train.shape[1]) + 1
        # N_COMP = None
        kpca = KernelPCA(n_components=N_COMP, kernel=kernal)
        kpca.fit(total)
        kpca_results_train = kpca.transform(train)
        kpca_results_test = kpca.transform(test)
        kpca_results_val = kpca.transform(val)
        train_decomposed.append(kpca_results_train)
        test_decomposed.append(kpca_results_test)
        val_decomposed.append(kpca_results_val)

    gc.collect()
    print("Append decomposition components together...")
    train_decomposed = np.concatenate(train_decomposed, axis=1)
    test_decomposed = np.concatenate(test_decomposed, axis=1)
    val_decomposed = np.concatenate(val_decomposed, axis=1)

    train_with_only_decomposed_features = pd.DataFrame(train_decomposed)
    test_with_only_decomposed_features = pd.DataFrame(test_decomposed)
    val_with_only_decomposed_features = pd.DataFrame(val_decomposed)

    # for agg_col in ['sum', 'var', 'mean', 'median', 'std', 'weight_count',
    #                 'count_non_0', 'num_different', 'max', 'min']:
    #     train_with_only_decomposed_features[agg_col] = train[agg_col]
    #     test_with_only_decomposed_features[agg_col] = test[agg_col]

    # Remove any NA
    train_with_only_decomposed_features = train_with_only_decomposed_features.fillna(0)
    test_with_only_decomposed_features = test_with_only_decomposed_features.fillna(0)
    val_with_only_decomposed_features = val_with_only_decomposed_features.fillna(0)

    return (train_with_only_decomposed_features,
            test_with_only_decomposed_features,
            val_with_only_decomposed_features)
def apply_band_selection(technique, dataset, predictions, mode, n_components, df_column_entry_dict): if df_column_entry_dict is None: df_column_entry_dict = { } # couldn't care less, this is a lazy way to make all accesses work print("Dataset current shape: " + str(dataset.shape)) print_memory_metrics("before applying band selection method " + technique, df_column_entry_dict) from DeepHyperX.batch import PARAMETER_JSON parameterFile = open(PARAMETER_JSON, "r") import json data = json.load(parameterFile) parameterFile.close() if technique in ["IncrementalPCA"]: # requires special method dataset, _ = applyIncrementalPCA(dataset, n_components) elif technique in data["image_compression"]["extraction"]["techniques"]: extraction_object = None if technique == "PCA": from sklearn.decomposition import PCA """ HybridSN: Exploring 3D-2D CNN Feature Hierarchy for Hyperspectral Image Classification Source code used: https://github.com/gokriznastic/HybridSN/blob/master/Hybrid-Spectral-Net.ipynb Paper: https://arxiv.org/abs/1902.06701 Good parameters: 30 components for Indian Pines, 15 for Salinas and Pavia University """ extraction_object = PCA(n_components=n_components, whiten=True) elif technique == "KernelPCA": from sklearn.decomposition import KernelPCA extraction_object = KernelPCA(kernel="rbf", n_components=n_components, gamma=None, fit_inverse_transform=True, n_jobs=1) elif technique == "SparsePCA": """Sparse PCA uses the links between the ACP and the SVD to extract the main components by solving a lower-order matrix approximation problem.""" from sklearn.decomposition import SparsePCA extraction_object = SparsePCA(n_components=n_components, alpha=0.0001, n_jobs=-1) elif technique == "LDA": # only supervised is supported, y is required if mode != "supervised": print( "warning: mode other than supervised detected for lda, setting it to supervised...\n" ) mode = "supervised" # maximally n_classes - 1 columns, https://stackoverflow.com/questions/26963454/lda-ignoring-n-components from sklearn.discriminant_analysis import LinearDiscriminantAnalysis extraction_object = LinearDiscriminantAnalysis( n_components=n_components) elif technique == "SVD": from sklearn.decomposition import TruncatedSVD extraction_object = TruncatedSVD(n_components=n_components, algorithm='randomized', n_iter=5) elif technique == "GRP": from sklearn.random_projection import GaussianRandomProjection extraction_object = GaussianRandomProjection( n_components=n_components, eps=0.5) elif technique == "SRP": from sklearn.random_projection import SparseRandomProjection extraction_object = SparseRandomProjection( n_components=n_components, density='auto', eps=0.5, dense_output=False) elif technique == "MDS": """O(n^3), uses lots of memory for distance matrix (doesn't fit in 48GB), doesn't fit in GPU memory either, so basically unusable""" from sklearn.manifold import MDS extraction_object = MDS(n_components=n_components, n_init=12, max_iter=200, metric=True, n_jobs=16) elif technique == "MiniBatch": """takes too long""" from sklearn.decomposition import MiniBatchDictionaryLearning extraction_object = MiniBatchDictionaryLearning( n_components=n_components, batch_size=200, alpha=1, n_iter=1) elif technique == "LLE": # modified LLE requires n_neighbors >= n_components """execution takes 20 minutes or so, but it does work, just takes a long time""" from sklearn.manifold import LocallyLinearEmbedding extraction_object = LocallyLinearEmbedding( n_components=n_components, n_neighbors=100, method='modified', n_jobs=4) elif technique == "ICA": 
from sklearn.decomposition import FastICA extraction_object = FastICA(n_components=n_components, algorithm='parallel', whiten=True, max_iter=100) elif technique == "FactorAnalysis": from sklearn.decomposition import FactorAnalysis extraction_object = FactorAnalysis(n_components=n_components) #75 elif technique == "ISOMAP": from sklearn import manifold extraction_object = manifold.Isomap(n_neighbors=5, n_components=n_components, n_jobs=-1) elif technique == "t-SNE": # like PCA, but non-linear (pca is linear) from sklearn.manifold import TSNE extraction_object = TSNE(n_components=n_components, learning_rate=300, perplexity=30, early_exaggeration=12, init='random') elif technique == "UMAP": # install umap-learn for this to work import umap extraction_object = umap.UMAP(n_neighbors=50, min_dist=0.3, n_components=n_components) elif technique == "NMF": # https://www.kaggle.com/remidi/dimensionality-reduction-techniques from sklearn.decomposition import NMF extraction_object = NMF(n_components=n_components, init='nndsvdar', random_state=420) elif technique == "F*G": # super fast and nice from sklearn.cluster import FeatureAgglomeration extraction_object = FeatureAgglomeration(n_clusters=n_components, linkage='ward') else: raise ValueError("Unknown feature extraction technique: " + technique) start_mem_measurement() start = time.time() dataset, _ = applyFeatureExtraction( dataset, predictions, extraction_object, mode, merged=(len(dataset.shape) == 4 and len(predictions.shape) == 3)) time_elapse = time.time() - start event = 'applying band selection method (EXTRACTION) ' + technique formatted_time = str(timedelta(seconds=time_elapse)) df_column_entry_dict['Time measurement at ' + event + ' [s]'] = time_elapse print("\n" + event + " took " + formatted_time + " seconds\n") event = "after applying band selection method " + technique stop_mem_measurement(event, df_column_entry_dict) print_memory_metrics(event, df_column_entry_dict) elif technique in data["image_compression"]["selection"]["techniques"]: selection_object = None if technique == "RandomForest": # Random forests or random decision forests are an ensemble learning method for classification, regression and other # tasks that operates by constructing a multitude of decision trees at training time and outputting the class that is the mode of the classes (classification) or mean prediction (regression) of the individual trees.[1][2] Random decision forests correct for decision trees' habit of overfitting to their training set.[3]:587–588 https://en.wikipedia.org/wiki/Random_forest from sklearn.ensemble import RandomForestClassifier selection_object = RandomForestClassifier() elif technique == "LogisticRegression": from sklearn.linear_model import LogisticRegression selection_object = LogisticRegression() elif technique == "LinearRegression": from sklearn.linear_model import LinearRegression selection_object = LinearRegression() elif technique == "LightGBM": from lightgbm import LGBMClassifier selection_object = LGBMClassifier() else: raise ValueError("Unknown feature selection technique: " + technique) start_mem_measurement() start = time.time() dataset, _ = applyFeatureSelection( dataset, predictions, selection_object, n_components, mode, merged=(len(dataset.shape) == 4 and len(predictions.shape) == 3)) time_elapse = time.time() - start event = 'applying band selection method (SELECTION) ' + technique formatted_time = str(timedelta(seconds=time_elapse)) df_column_entry_dict['Time measurement at ' + event + ' [s]'] = time_elapse print("\n" + 
event + " took " + formatted_time + " seconds\n") event = "after applying band selection method " + technique stop_mem_measurement(event, df_column_entry_dict) print_memory_metrics(event, df_column_entry_dict) print("Dataset new shape: " + str(dataset.shape)) return dataset
X, y = data.iloc[:, :-1], data.iloc[:, -1]
X.columns = colnames[:len(colnames) - 1]

# minimum dimensionality suggested by the Johnson-Lindenstrauss lemma
print(johnson_lindenstrauss_min_dim(4601, eps=0.1))

split = train_test_split(X, y, test_size=0.3, random_state=42)
(trainData, testData, trainTarget, testTarget) = split

accuracies = []
components = np.int32(np.linspace(2, 56, 14))

model = LinearSVC()
model.fit(trainData, trainTarget)
baseline = metrics.accuracy_score(model.predict(testData), testTarget)

# loop over the projection sizes
for comp in components:
    # create the random projection
    sp = SparseRandomProjection(n_components=comp)
    X_new = sp.fit_transform(trainData)  # avoid overwriting the original X

    # train a classifier on the sparse random projection
    model = LinearSVC()
    model.fit(X_new, trainTarget)

    # evaluate the model and update the list of accuracies
    test = sp.transform(testData)
    accuracies.append(metrics.accuracy_score(model.predict(test), testTarget))

# create the figure
plt.figure()
plt.suptitle("Accuracy of Sparse Projection on Spam")
plt.xlabel("# of Components")
plt.ylabel("Accuracy")
from sklearn.manifold import TSNE from sklearn.preprocessing import StandardScaler df = unpickle("../processed/v003/acsf_feat.pkl") SEED = 71 N_COMP = 5 num_clusters2 = 5 fa = FactorAnalysis(n_components=N_COMP, ) pca = PCA(n_components=N_COMP, random_state=SEED) tsvd = TruncatedSVD(n_components=N_COMP, random_state=SEED) ica = FastICA(n_components=N_COMP, random_state=SEED) grp = GaussianRandomProjection(n_components=N_COMP, eps=0.1, random_state=SEED) srp = SparseRandomProjection(n_components=N_COMP, dense_output=True, random_state=SEED) mbkm = MiniBatchKMeans(n_clusters=num_clusters2, random_state=SEED) tsne = TSNE(n_components=3, random_state=SEED) ss = StandardScaler() df_ss = pd.DataFrame(ss.fit_transform(df.iloc[:, 2:]), columns=df.columns[2:]) decomp_cols = [] comp_results = [] comp_names = ["fa", "pca", "tsvd", "ica", "grp", "srp", "mbkm"] #, "tsne"] # removing tsne for name, transform in zip(comp_names, [fa, pca, tsvd, ica, grp, srp, mbkm, tsne]): print(current_time(), "{} converting...".format(name), flush=True) n_components = N_COMP
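    # --- Hedged sketch: the rest of the loop body is not shown above. One
    # plausible continuation (an assumption, not the original code): transform
    # the standardised features and store each component as a named column.
    results = transform.fit_transform(df_ss)
    for i in range(results.shape[1]):
        col = "{}_{}".format(name, i + 1)
        df[col] = results[:, i]
        decomp_cols.append(col)
    comp_results.append(results)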
def main(): out = './BASE/' cmap = cm.get_cmap('Spectral') np.random.seed(0) letter = pd.read_hdf('./BASE/datasets.hdf', 'letter') letterX = letter.drop('Class', 1).copy().values letterY = letter['Class'].copy().values madelon = pd.read_hdf('./BASE/datasets.hdf', 'madelon') madelonX = madelon.drop('Class', 1).copy().values madelonY = madelon['Class'].copy().values madelonX = StandardScaler().fit_transform(madelonX) letterX = StandardScaler().fit_transform(letterX) clusters = [2, 5, 10, 15, 20, 25, 30, 35, 40] dims = [2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60] dims2 = [2, 4, 6, 8, 10, 12, 14, 16] #raise #%% data for 1 tmp = defaultdict(dict) for i, dim in product(range(10), dims): rp = SparseRandomProjection(random_state=i, n_components=dim) tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(madelonX), madelonX) tmp = pd.DataFrame(tmp).T tmp.to_csv(out + 'madelon scree1.csv') tmp = defaultdict(dict) for i, dim in product(range(10), dims2): rp = SparseRandomProjection(random_state=i, n_components=dim) tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(letterX), letterX) tmp = pd.DataFrame(tmp).T tmp.to_csv(out + 'letter scree1.csv') tmp = defaultdict(dict) for i, dim in product(range(10), dims): rp = SparseRandomProjection(random_state=i, n_components=dim) rp.fit(madelonX) tmp[dim][i] = reconstructionError(rp, madelonX) tmp = pd.DataFrame(tmp).T tmp.to_csv(out + 'madelon scree2.csv') tmp = defaultdict(dict) for i, dim in product(range(10), dims2): rp = SparseRandomProjection(random_state=i, n_components=dim) rp.fit(letterX) tmp[dim][i] = reconstructionError(rp, letterX) tmp = pd.DataFrame(tmp).T tmp.to_csv(out + 'letter scree2.csv') #%% Data for 2 grid = { 'rp__n_components': dims, 'NN__alpha': nn_reg, 'NN__hidden_layer_sizes': nn_arch } rp = SparseRandomProjection(random_state=5) mlp = MLPClassifier(activation='relu', max_iter=2000, early_stopping=True, random_state=5) pipe = Pipeline([('rp', rp), ('NN', mlp)]) gs = GridSearchCV(pipe, grid, verbose=10, cv=5) gs.fit(madelonX, madelonY) tmp = pd.DataFrame(gs.cv_results_) tmp.to_csv(out + 'Madelon dim red.csv') grid = { 'rp__n_components': dims2, 'NN__alpha': nn_reg, 'NN__hidden_layer_sizes': nn_arch } rp = SparseRandomProjection(random_state=5) mlp = MLPClassifier(activation='relu', max_iter=2000, early_stopping=True, random_state=5) pipe = Pipeline([('rp', rp), ('NN', mlp)]) gs = GridSearchCV(pipe, grid, verbose=10, cv=5) gs.fit(letterX, letterY) tmp = pd.DataFrame(gs.cv_results_) tmp.to_csv(out + 'letter dim red.csv') #raise #%% data for 3 # Set this from chart 2 and dump, use clustering script to finish up dim = 60 rp = SparseRandomProjection(n_components=dim, random_state=5) madelonX2 = rp.fit_transform(madelonX) madelon2 = pd.DataFrame(np.hstack((madelonX2, np.atleast_2d(madelonY).T))) cols = list(range(madelon2.shape[1])) cols[-1] = 'Class' madelon2.columns = cols madelon2.to_hdf(out + 'datasets.hdf', 'madelon', complib='blosc', complevel=9) # dim = 16 rp = SparseRandomProjection(n_components=dim, random_state=5) letterX2 = rp.fit_transform(letterX) letter2 = pd.DataFrame(np.hstack((letterX2, np.atleast_2d(letterY).T))) cols = list(range(letter2.shape[1])) cols[-1] = 'Class' letter2.columns = cols letter2.to_hdf(out + 'datasets.hdf', 'letter', complib='blosc', complevel=9)
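# The helpers pairwiseDistCorr and reconstructionError used above come from the
# assignment's shared helpers module, which is not shown here. The functions
# below are plausible implementations (sketches under that assumption, not the
# original code).
import numpy as np
import scipy.sparse as sps
from scipy.linalg import pinv
from sklearn.metrics.pairwise import pairwise_distances


def pairwiseDistCorr(X1, X2):
    # correlation between pairwise distances before and after projection
    assert X1.shape[0] == X2.shape[0]
    d1 = pairwise_distances(X1).ravel()
    d2 = pairwise_distances(X2).ravel()
    return np.corrcoef(d1, d2)[0, 1]


def reconstructionError(projections, X):
    # project down with the random matrix, map back with its pseudo-inverse,
    # and report the mean squared reconstruction error
    W = projections.components_
    if sps.issparse(W):
        W = np.asarray(W.todense())
    p = pinv(W)
    reconstructed = ((p @ W) @ X.T).T
    return np.nanmean(np.square(X - reconstructed))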
# In[4]: from sklearn.decomposition import FactorAnalysis fa = FactorAnalysis(n_components=100, random_state=42) X_fa = fa.fit_transform(X) # In[5]: from sklearn.random_projection import SparseRandomProjection srp = SparseRandomProjection(n_components=100, random_state=42) X_srp = srp.fit_transform(X) # In[6]: from sklearn.random_projection import GaussianRandomProjection grp = GaussianRandomProjection(n_components=100, random_state=42) X_grp = grp.fit_transform(X) # In[7]:
import matplotlib.pyplot as plt

# Load and clean our data
data = pd.read_csv('avocado.csv', index_col='Date')
data = data[data.columns[:-2]]
data['type'] = data['type'].replace({'conventional': 0, 'organic': 1})
print(data.head())

# Fit PCA and apply transformation
X = data.iloc[:, :-1].values
pca = PCA(n_components=2).fit(X)
X_t = pca.transform(X)
'''
# Plotting PCA transformed avocado dataset
plt.scatter(X_t[:, 0], X_t[:, 1])
plt.show()
'''

# We can also fit random projection!
rp = SparseRandomProjection()  # n_components and epsilon are chosen for us
# create a very large random matrix (our avocado dataset only has 10 features,
# so there is no need for RP on it); keep it separate from X
X2 = np.random.rand(100, 10000)
X_rp = rp.fit_transform(X2)
print(X_rp.shape)
'''
# Plotting RP transformed random dataset
plt.scatter(X_rp[:, 0], X_rp[:, 1])
plt.show()
'''
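# With the default n_components='auto' above, scikit-learn derives the output
# dimensionality from the Johnson-Lindenstrauss lemma using the number of
# samples and eps (0.1 by default). A small check of that relationship, using
# the fitted `rp` and `X2` from the snippet above:
from sklearn.random_projection import johnson_lindenstrauss_min_dim
print(johnson_lindenstrauss_min_dim(n_samples=X2.shape[0], eps=0.1))
print(rp.n_components_)  # dimensionality actually used by the fitted projection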
import numpy as np
# from skmultiflow.trees import HoeffdingTree as HT
from skmultiflow.lazy import SAMKNN
from sklearn.metrics import accuracy_score
import time, copy
from sklearn.random_projection import SparseRandomProjection
from sklearn.metrics import cohen_kappa_score
# from skmultiflow.bayes import NaiveBayes
from inc_pca import IncPCA
from rff_base import Base as RFF
from rrslvq import ReactiveRobustSoftLearningVectorQuantization as RRSLVQ
from rslvq import RSLVQ
from skmultiflow.meta import AdaptiveRandomForest as ARF

transformer = SparseRandomProjection(n_components=1000)
classes = np.arange(0, 15, 1)

res_file = 'res_pca_skipgram.txt'
f = open(res_file, 'a+')
f.write('SKIP-GRAM\n')
f.close()

data = np.load('../dataset/skip-gram-embed-w-label.npy')

# f = open('data/nasdaq_stream_wo_sentiment.csv')
# labels = []
# while 1:
#     line = f.readline()
#     if line == '': break
#     arr = np.array(line.split(','), dtype='float64')
#     labels.append(arr[1])