Example #1
 processor.latext_start_figure()
 X_train, X_test, y_train, y_test, _ = dataset.get_data(model='KMeans')
 n_clusters = len(dataset.label_encoder.classes_)
 pca = PCA(n_components=0.95)
 pca.fit(X_train)
 n_components = pca.components_.shape[0]
 print(f"n_components: {n_components}")
 dr_models = [
     PCA(n_components=n_components, random_state=0),
     FastICA(n_components=n_components, random_state=0),
     MiniBatchDictionaryLearning(n_components=n_components,
                                 alpha=1,
                                 batch_size=200,
                                 n_iter=10,
                                 random_state=0),
     SparseRandomProjection(random_state=0, n_components=n_components)
 ]
 clustering_models = [
     KMeans(n_clusters=n_clusters,
            init='k-means++',
            n_init=10,
            max_iter=600,
            random_state=0,
            tol=0.0001),
     GaussianMixture(n_components=n_clusters,
                     n_init=10,
                     max_iter=600,
                     random_state=0,
                     tol=0.0001)
 ]
 for pca in dr_models:
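     # Hedged sketch of the loop body: the original snippet is truncated at the loop
     # header above, so the lines below are an assumption, not the author's code.
     # Project the data with each reducer, then fit both clustering models and score them.
     from sklearn.metrics import adjusted_rand_score  # assumed import, placed here only because the snippet cuts off
     X_train_red = pca.fit_transform(X_train)
     X_test_red = pca.transform(X_test)
     for clusterer in clustering_models:
         clusterer.fit(X_train_red)
         pred = clusterer.predict(X_test_red)
         print(type(pca).__name__, type(clusterer).__name__,
               adjusted_rand_score(y_test, pred))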
Example #2
split = train_test_split(X, y, test_size = 0.33,
    random_state = 42)
#digits = datasets.load_digits()
#split = train_test_split(digits.data, digits.target, test_size = 0.3,
#    random_state = 42)
(trainData, testData, trainTarget, testTarget) = split

model = LinearSVC()
model.fit(trainData, trainTarget)
baseline = metrics.accuracy_score(model.predict(testData), testTarget)

# loop over the projection sizes
for comp in components:
    # create the random projection
    sp = SparseRandomProjection(n_components = comp)
    X_new = sp.fit_transform(trainData)
 
    # train a classifier on the sparse random projection
    model = LinearSVC()
    model.fit(X_new, trainTarget)
 
    # evaluate the model and update the list of accuracies
    test = sp.transform(testData)
    accuracies.append(metrics.accuracy_score(model.predict(test), testTarget))
    
# create the figure
plt.figure()
plt.suptitle("Accuracy of Sparse Projection on Digits")
plt.xlabel("# of Components")
plt.ylabel("Accuracy")
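# The snippet ends before the curves are drawn; a hedged completion of the plot,
# assuming `components` and `accuracies` were initialized in the elided lines above:
plt.plot(components, [baseline] * len(components), color="r", label="baseline (all features)")
plt.plot(components, accuracies, label="sparse random projection")
plt.legend(loc="lower right")
plt.show()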
Example #3
kpca2_results_train = kpca.fit_transform(train.drop(["y"], axis=1))
kpca2_results_test = kpca.transform(test)

# ICA
ica = FastICA(n_components=n_comp, random_state=420)
ica2_results_train = ica.fit_transform(train.drop(["y"], axis=1))
ica2_results_test = ica.transform(test)

# GRP
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
grp_results_train = grp.fit_transform(train.drop(["y"], axis=1))
grp_results_test = grp.transform(test)

# SRP
srp = SparseRandomProjection(n_components=n_comp,
                             dense_output=True,
                             random_state=420)
srp_results_train = srp.fit_transform(train.drop(["y"], axis=1))
srp_results_test = srp.transform(test)

# save columns list before adding the decomposition components

usable_columns = list(set(train.columns) - set(['y']))

# Append decomposition components to datasets
for i in range(1, n_comp + 1):
    # train['pca_' + str(i)] = pca2_results_train[:, i - 1]
    # test['pca_' + str(i)] = pca2_results_test[:, i - 1]

    train['spca_' + str(i)] = spca2_results_train[:, i - 1]
    test['spca_' + str(i)] = spca2_results_test[:, i - 1]
Example #4
madelon = pd.read_hdf('./BASE/datasets.hdf','madelon')        
madelonX = madelon.drop('Class',1).copy().values
madelonY = madelon['Class'].copy().values


madelonX = StandardScaler().fit_transform(madelonX)
digitsX= StandardScaler().fit_transform(digitsX)

clusters =  [2,5,10,15,20,25,30,35,40]
dims = [2,5,10,15,20,25,30,35,40,45,50,55,60]
#raise
#%% data for 1
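# pairwiseDistCorr is not defined in this snippet; a minimal stand-in (an assumption,
# mirroring the distance_correlation helper shown in a later example on this page):
# the correlation between the pairwise-distance vectors of the projected and original data.
import numpy as np                              # assumed import
from sklearn.metrics import pairwise_distances  # assumed import

def pairwiseDistCorr(X1, X2):
    assert X1.shape[0] == X2.shape[0]
    return np.corrcoef(pairwise_distances(X1).ravel(),
                       pairwise_distances(X2).ravel())[0, 1]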

tmp = defaultdict(dict)
for i,dim in product(range(10),dims):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(madelonX), madelonX)
tmp =pd.DataFrame(tmp).T
tmp.to_csv(out+'madelon scree1.csv')


tmp = defaultdict(dict)
for i,dim in product(range(10),dims):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(digitsX), digitsX)
tmp =pd.DataFrame(tmp).T
tmp.to_csv(out+'digits scree1.csv')


tmp = defaultdict(dict)
for i,dim in product(range(10),dims):
def johnson_lindenstrauss(data, data_name):
    # `normed` is being deprecated in favor of `density` in histograms
    if LooseVersion(matplotlib.__version__) >= '2.1':
        density_param = {'density': True}
    else:
        density_param = {'normed': True}

    # Part 1: plot the theoretical dependency between n_components_min and
    # n_samples

    # range of admissible distortions
    eps_range = np.linspace(0.1, 0.99, 5)
    colors = plt.cm.Blues(np.linspace(0.3, 1.0, len(eps_range)))

    # range of number of samples (observation) to embed
    n_samples_range = np.logspace(1, 9, 9)

    plt.figure()
    for eps, color in zip(eps_range, colors):
        min_n_components = johnson_lindenstrauss_min_dim(n_samples_range, eps=eps)
        plt.loglog(n_samples_range, min_n_components, color=color)

    plt.legend(["eps = %0.1f" % eps for eps in eps_range], loc="lower right")
    plt.xlabel("Number of observations to eps-embed")
    plt.ylabel("Minimum number of dimensions")
    plt.title("Johnson-Lindenstrauss bounds:\nn_samples vs n_components")
    plt.savefig('Figs/02b_rp_comp_samples')

    # range of admissible distortions
    eps_range = np.linspace(0.01, 0.99, 100)

    n_samples_range = np.logspace(2, 6, 5)
    colors = plt.cm.Blues(np.linspace(0.3, 1.0, len(n_samples_range)))

    plt.figure()
    for n_samples, color in zip(n_samples_range, colors):
        min_n_components = johnson_lindenstrauss_min_dim(n_samples, eps=eps_range)
        plt.semilogy(eps_range, min_n_components, color=color)

    plt.legend(["n_samples = %d" % n for n in n_samples_range], loc="upper right")
    plt.xlabel("Distortion eps")
    plt.ylabel("Minimum number of dimensions")
    plt.title("Johnson-Lindenstrauss bounds:\nn_components vs eps")
    plt.savefig('Figs/02b_rp_comp_eps')

    # Part 2: perform sparse random projection of some digits images which are
    # quite low dimensional and dense or documents of the 20 newsgroups dataset
    # which is both high dimensional and sparse

    n_samples, n_features = data.shape
    print("Embedding %d samples with dim %d using various random projections"
        % (n_samples, n_features))

    n_components_range = np.array([1,10,100,1000])
    dists = euclidean_distances(data, squared=True).ravel()

    # select only non-identical samples pairs
    nonzero = dists != 0
    dists = dists[nonzero]

    for n_components in n_components_range:
        t0 = time()
        rp = SparseRandomProjection(n_components=n_components)
        projected_data = rp.fit_transform(data)
        print("Projected %d samples from %d to %d in %0.3fs"
            % (n_samples, n_features, n_components, time() - t0))
        if hasattr(rp, 'components_'):
            n_bytes = rp.components_.data.nbytes
            n_bytes += rp.components_.indices.nbytes
            print("Random matrix with size: %0.3fMB" % (n_bytes / 1e6))

        projected_dists = euclidean_distances(
            projected_data, squared=True).ravel()[nonzero]

        plt.figure()
        plt.hexbin(dists, projected_dists, gridsize=100, cmap=plt.cm.PuBu)
        plt.xlabel("Pairwise squared distances in original space")
        plt.ylabel("Pairwise squared distances in projected space")
        plt.title("Pairwise distances distribution for n_components=%d" %
                n_components)
        cb = plt.colorbar()
        cb.set_label('Sample pairs counts')

        rates = projected_dists / dists
        print("Mean distances rate: %0.2f (%0.2f)"
            % (np.mean(rates), np.std(rates)))
        plt.savefig('Figs/02b_rp_pwdist_{}_{}'.format(data_name, n_components))

        plt.figure()
        plt.hist(rates, bins=50, range=(0., 2.), edgecolor='k', **density_param)
        plt.xlabel("Squared distances rate: projected / original")
        plt.ylabel("Distribution of samples pairs")
        plt.title("Histogram of pairwise distance rates for n_components=%d" %
                n_components)
        plt.savefig('Figs/02b_rp_histogram_{}_{}'.format(data_name, n_components))
        plt.clf()
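# A hedged usage sketch, not part of the original: drive the function with the
# scikit-learn digits data, matching the "digits images" case described in Part 2
# (the Figs/ output directory is assumed to exist).
from sklearn.datasets import load_digits  # assumed import

digits = load_digits()
johnson_lindenstrauss(digits.data, 'digits')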
 clustering_results = construct_iterative_run(
     clustering_method=clustering_method,
     dim_reduce=FastICA(**params.get('ica')),
     data=data)
 filename = '{dataset_name}_{step}_{clustering_method}_{reduce_method}.pkl'
 save_data(clustering_results,
           path='results/',
           filename=filename.format(dataset_name=dataset_name,
                                    step='clustering_on_reduced',
                                    clustering_method=clustering_method,
                                    reduce_method='ica'))
 # RCA
 print('start RCA:')
 clustering_results = construct_iterative_run(
     clustering_method=clustering_method,
     dim_reduce=SparseRandomProjection(**params.get('sparse_rca')),
     data=data)
 filename = '{dataset_name}_{step}_{clustering_method}_{reduce_method}.pkl'
 save_data(clustering_results,
           path='results/',
           filename=filename.format(dataset_name=dataset_name,
                                    step='clustering_on_reduced',
                                    clustering_method=clustering_method,
                                    reduce_method='rca'))
 # DT ft importance:
 print('start DT:')
 path = 'results/'
 filename = '{dataset_name}_{step}_{method}.pkl'
 rfc = load_data(path=path,
                 filename=filename.format(dataset_name=dataset_name,
                                          step='dim_reduction',
Example #7
scatterPlot(X_train_GRP, y_train, "Gaussian Random Projection")

# In[ ]:

# Sparse Random Projection
from sklearn.random_projection import SparseRandomProjection

n_components = 'auto'
density = 'auto'
eps = 0.5
dense_output = False
random_state = 2018

SRP = SparseRandomProjection(n_components=n_components,
                             density=density,
                             eps=eps,
                             dense_output=dense_output,
                             random_state=random_state)

X_train_SRP = SRP.fit_transform(X_train)
X_train_SRP = pd.DataFrame(data=X_train_SRP, index=train_index)

X_validation_SRP = SRP.transform(X_validation)
X_validation_SRP = pd.DataFrame(data=X_validation_SRP, index=validation_index)

scatterPlot(X_train_SRP, y_train, "Sparse Random Projection")

# In[ ]:

# Isomap
Example #8
def DecomposedFeatures(train,
                       test,
                       total,
                       addtrain,
                       addtest,
                       use_pca=0.0,
                       use_tsvd=0.0,
                       use_ica=0.0,
                       use_fa=0.0,
                       use_grp=0.0,
                       use_srp=0.0,
                       use_pls=0.0):
    print("\nStart decomposition process...")
    train_decomposed = [addtrain]
    test_decomposed = [addtest]
    if use_pca > 0.0:
        print("PCA")
        N_COMP = int(use_pca * train.shape[1]) + 1
        pca = PCA(n_components=N_COMP,
                  whiten=True,
                  svd_solver="full",
                  random_state=42)
        pca_results = pca.fit(total)
        pca_results_train = pca.transform(train)
        pca_results_test = pca.transform(test)
        # list.append mutates in place and returns None, so don't rebind the list
        train_decomposed.append(pca_results_train)
        test_decomposed.append(pca_results_test)

    if use_tsvd > 0.0:
        print("tSVD")
        N_COMP = int(use_tsvd * train.shape[1]) + 1
        tsvd = TruncatedSVD(n_components=N_COMP, random_state=42)
        tsvd_results = tsvd.fit(total)
        tsvd_results_train = tsvd.transform(train)
        tsvd_results_test = tsvd.transform(test)
        train_decomposed.append(tsvd_results_train)
        test_decomposed.append(tsvd_results_test)

    if use_ica > 0.0:
        print("ICA")
        N_COMP = int(use_ica * train.shape[1]) + 1
        ica = FastICA(n_components=N_COMP, random_state=42)
        ica_results = ica.fit(total)
        ica_results_train = ica.transform(train)
        ica_results_test = ica.transform(test)
        train_decomposed.append(ica_results_train)
        test_decomposed.append(ica_results_test)

    if use_fa > 0.0:
        print("FA")
        N_COMP = int(use_fa * train.shape[1]) + 1
        fa = FactorAnalysis(n_components=N_COMP, random_state=42)
        fa_results = fa.fit(total)
        fa_results_train = fa.transform(train)
        fa_results_test = fa.transform(test)
        train_decomposed.append(fa_results_train)
        test_decomposed.append(fa_results_test)

    if use_grp > 0.0:
        print("GRP")
        N_COMP = int(use_grp * train.shape[1]) + 1
        grp = GaussianRandomProjection(n_components=N_COMP,
                                       eps=0.1,
                                       random_state=42)
        grp_results = grp.fit(total)
        grp_results_train = grp.transform(train)
        grp_results_test = grp.transform(test)
        train_decomposed.append(grp_results_train)
        test_decomposed.append(grp_results_test)

    if use_srp > 0.0:
        print("SRP")
        N_COMP = int(use_srp * train.shape[1]) + 1
        srp = SparseRandomProjection(n_components=N_COMP,
                                     dense_output=True,
                                     random_state=42)
        srp_results = srp.fit(total)
        srp_results_train = srp.transform(train)
        srp_results_test = srp.transform(test)
        train_decomposed.append(srp_results_train)
        test_decomposed.append(srp_results_test)

    if use_pls > 0.0:
        print("PLS")
        #N_COMP = int(use_pls  * train.shape[1]) +1
        #pls = PLSCanonical(n_components = N_COMP)
        #pls_results = pls.fit(total)
        #pls_results_train = pls.transform(train)
        #pls_results_test = pls.transform(test)
        #train_decomposed = np.concatenate([pls_results_train,train_decomposed], axis=1)
        #test_decomposed = np.concatenate([pls_results_test, test_decomposed], axis=1)

    print("Append decomposition components together...")

    train_decomposed = np.concatenate(train_decomposed, axis=1)
    test_decomposed = np.concatenate(test_decomposed, axis=1)
    train_with_only_decomposed_features = pd.DataFrame(train_decomposed)
    test_with_only_decomposed_features = pd.DataFrame(test_decomposed)

    #for agg_col in ['sum', 'var', 'mean', 'median', 'std', 'weight_count', 'count_non_0', 'num_different', 'max', 'min']:
    #    train_with_only_decomposed_features[col] = train[col]
    #    test_with_only_decomposed_features[col] = test[col]

    # Remove any NA
    train_with_only_decomposed_features = train_with_only_decomposed_features.fillna(
        0)
    test_with_only_decomposed_features = test_with_only_decomposed_features.fillna(
        0)

    return train_with_only_decomposed_features, test_with_only_decomposed_features
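# A hedged usage sketch, not from the original: each use_* argument is a fraction of
# train.shape[1] that sets that method's component count, and 0.0 disables the method.
# The random frames below are placeholders that only illustrate the calling convention,
# assuming numpy and pandas are imported as np / pd (as the function body itself requires).
rng = np.random.RandomState(42)
demo_train = pd.DataFrame(rng.rand(100, 20))
demo_test = pd.DataFrame(rng.rand(50, 20))
demo_total = pd.concat([demo_train, demo_test], axis=0)

demo_train_feats, demo_test_feats = DecomposedFeatures(demo_train, demo_test, demo_total,
                                                       addtrain=demo_train.values,
                                                       addtest=demo_test.values,
                                                       use_pca=0.2, use_srp=0.1)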
Example #9
# # 4. Decomposition Feature
# So far I've only looked at PCA components, but most kernels look at several decomposition methods, so it may be interesting to look at t-SNE of these 10-50 components of each method instead of 1000 PCA components. Furthermore, it's interesting to see how well we can classify test/train based on this reduced feature space.
#
#

# In[ ]:

COMPONENTS = 20

# List of decomposition methods to use
methods = [
    TruncatedSVD(n_components=COMPONENTS),
    PCA(n_components=COMPONENTS),
    FastICA(n_components=COMPONENTS),
    GaussianRandomProjection(n_components=COMPONENTS, eps=0.1),
    SparseRandomProjection(n_components=COMPONENTS, dense_output=True)
]

# Run all the methods
embeddings = []
for method in methods:
    name = method.__class__.__name__
    embeddings.append(
        pd.DataFrame(method.fit_transform(total_df),
                     columns=[f"{name}_{i}" for i in range(COMPONENTS)]))
    print(f">> Ran {name}")

# Put all components into one dataframe
components_df = pd.concat(embeddings, axis=1)
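# A hedged sketch of the test/train classification mentioned in the markdown above.
# It assumes total_df stacks the train rows first and the test rows after them, and
# that train_df / test_df are available from earlier, elided cells; none of these
# names come from the original notebook.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

is_test = np.concatenate([np.zeros(len(train_df)), np.ones(len(test_df))])
clf = RandomForestClassifier(n_estimators=100, random_state=0)
print("train/test separability (ROC AUC):",
      cross_val_score(clf, components_df, is_test, cv=3, scoring="roc_auc").mean())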

# Prepare plot
from sklearn.neural_network import MLPClassifier
from dataTransformer import *
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from data import MNIST
from sklearn.metrics import accuracy_score
from time import time
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection


if __name__=="__main__":
    mnist = MNIST(10000)
    start = time()
    pipeline = Pipeline([('Scale', StandardScaler()), ('SRP', SparseRandomProjection(random_state=0, n_components=160)),
                         ('MLP', MLPClassifier(hidden_layer_sizes=(512, 256), alpha=0.01, verbose=1))])

    pipeline.fit(mnist.X_train, mnist.y_train)
    y_pred = pipeline.predict(mnist.X_test)
    end = time()

    print ("time used: {}s".format(end - start))
    print (accuracy_score(y_pred, mnist.y_test))
# MLPClassifier(hidden_layer_sizes=(512, 256), alpha=0.01)
Example #11
def use_decomposed_features_as_new_df(train,
                                      test,
                                      total,
                                      n_components,
                                      use_pca=False,
                                      use_tsvd=False,
                                      use_ica=False,
                                      use_fa=False,
                                      use_grp=False,
                                      use_srp=False):
    N_COMP = n_components
    ntrain = len(train)

    print("\nStart decomposition process...")

    if use_pca:
        print("PCA")
        pca = PCA(n_components=N_COMP, random_state=42)
        pca_results = pca.fit_transform(total)
        pca_results_train = pca_results[:ntrain]
        pca_results_test = pca_results[ntrain:]

    if use_tsvd:
        print("tSVD")
        tsvd = TruncatedSVD(n_components=N_COMP, random_state=42)
        tsvd_results = tsvd.fit_transform(total)
        tsvd_results_train = tsvd_results[:ntrain]
        tsvd_results_test = tsvd_results[ntrain:]

    if use_ica:
        print("ICA")
        ica = FastICA(n_components=N_COMP, random_state=42)
        ica_results = ica.fit_transform(total)
        ica_results_train = ica_results[:ntrain]
        ica_results_test = ica_results[ntrain:]

    if use_fa:
        print("FA")
        fa = FactorAnalysis(n_components=N_COMP, random_state=42)
        fa_results = fa.fit_transform(total)
        fa_results_train = fa_results[:ntrain]
        fa_results_test = fa_results[ntrain:]

    if use_grp:
        print("GRP")
        grp = GaussianRandomProjection(n_components=N_COMP,
                                       eps=0.1,
                                       random_state=42)
        grp_results = grp.fit_transform(total)
        grp_results_train = grp_results[:ntrain]
        grp_results_test = grp_results[ntrain:]

    if use_srp:
        print("SRP")
        srp = SparseRandomProjection(n_components=N_COMP,
                                     dense_output=True,
                                     random_state=42)
        srp_results = srp.fit_transform(total)
        srp_results_train = srp_results[:ntrain]
        srp_results_test = srp_results[ntrain:]

    print("Append decomposition components together...")
    train_decomposed = np.concatenate([
        srp_results_train, grp_results_train, ica_results_train,
        pca_results_train, tsvd_results_train
    ],
                                      axis=1)
    test_decomposed = np.concatenate([
        srp_results_test, grp_results_test, ica_results_test, pca_results_test,
        tsvd_results_test
    ],
                                     axis=1)

    train_with_only_decomposed_features = pd.DataFrame(train_decomposed)
    test_with_only_decomposed_features = pd.DataFrame(test_decomposed)

    for agg_col in [
            'sum', 'var', 'mean', 'median', 'std', 'weight_count',
            'count_non_0', 'num_different', 'max', 'min'
    ]:
        train_with_only_decomposed_features[agg_col] = train[agg_col]
        test_with_only_decomposed_features[agg_col] = test[agg_col]

    # Remove any NA
    train_with_only_decomposed_features = train_with_only_decomposed_features.fillna(
        0)
    test_with_only_decomposed_features = test_with_only_decomposed_features.fillna(
        0)

    return train_with_only_decomposed_features, test_with_only_decomposed_features
Example #12
def main():
    runit = 1
    if runit:
        run = assignment4()
        run.read_data_voice('voice.csv')
        run.dataSetName = 'Voice'
        run.split_data_to_train_test(testSize=0.3)
        dataX = StandardScaler().fit_transform(run.allFeatures)
        ''' 
    run.PCA()
    run.ICA()
    run.RP()
    '''
        run.TSVD()
        run.k_mean_cluster()
        run.expectation_maximization()
        pcaCom = 15
        icaCom = 15
        rpCom = 15
        tsvdCom = 15
        k = 2
        reducedDataPCA = PCA(n_components=pcaCom,
                             random_state=5).fit_transform(dataX)
        run.k_mean_cluster_reduced(k, reducedDataPCA, 'PCA')
        run.expectation_maximization_reduced(k, reducedDataPCA, 'PCA')

        reducedDataICA = FastICA(n_components=icaCom,
                                 random_state=5).fit_transform(dataX)
        run.k_mean_cluster_reduced(k, reducedDataICA, 'ICA')
        run.expectation_maximization_reduced(k, reducedDataICA, 'ICA')

        reducedDataRP = SparseRandomProjection(
            n_components=rpCom, random_state=5).fit_transform(dataX)
        run.k_mean_cluster_reduced(k, reducedDataRP, 'RP')
        run.expectation_maximization_reduced(k, reducedDataRP, 'RP')

        reducedDataTSVD = TruncatedSVD(
            random_state=5, n_components=tsvdCom).fit_transform(dataX)
        run.k_mean_cluster_reduced(k, reducedDataTSVD, 'TSVD')
        run.expectation_maximization_reduced(k, reducedDataTSVD, 'TSVD')

    run_hapt = assignment4()
    run_hapt.read_data_haptX('HAPT_X.csv')
    run_hapt.read_data_haptY('HAPT_Y.csv')
    run_hapt.dataSetName = 'HAPT'
    dataX = StandardScaler().fit_transform(run_hapt.allFeatures)

    run_hapt.kNum = range(1, 20, 5)
    run_hapt.pcaDims = range(1, 561, 25)
    run_hapt.icaDims = range(1, 561, 25)
    run_hapt.rpDims = range(1, 561, 25)
    run_hapt.tvsdDims = range(1, 561, 25)

    #run_hapt.k_mean_cluster()
    run_hapt.expectation_maximization()

    run_hapt.PCA()
    run_hapt.ICA()
    run_hapt.RP()
    run_hapt.TSVD()

    pcaCom = 15
    icaCom = 15
    rpCom = 15
    tsvdCom = 15
    k = 2
    reducedDataPCA = PCA(n_components=pcaCom,
                         random_state=5).fit_transform(dataX)
    run_hapt.k_mean_cluster_reduced(k, reducedDataPCA, 'PCA')
    run_hapt.expectation_maximization_reduced(k, reducedDataPCA, 'PCA')

    reducedDataICA = FastICA(n_components=icaCom,
                             random_state=5).fit_transform(dataX)
    run_hapt.k_mean_cluster_reduced(k, reducedDataICA, 'ICA')
    run_hapt.expectation_maximization_reduced(k, reducedDataICA, 'ICA')

    reducedDataRP = SparseRandomProjection(n_components=rpCom,
                                           random_state=5).fit_transform(dataX)
    run_hapt.k_mean_cluster_reduced(k, reducedDataRP, 'RP')
    run_hapt.expectation_maximization_reduced(k, reducedDataRP, 'RP')

    reducedDataTSVD = TruncatedSVD(random_state=5,
                                   n_components=tsvdCom).fit_transform(dataX)
    run_hapt.k_mean_cluster_reduced(k, reducedDataTSVD, 'TSVD')
    run_hapt.expectation_maximization_reduced(k, reducedDataTSVD, 'TSVD')

    print("All done")
    plt.show()
Example #13
kurtosis = collections.defaultdict(list)
for i in range(1, num_components + 1):
    kurtosis['num components'].append(i)
    ica = FastICA(n_components=i)
    ica_transformed_data = ica.fit_transform(X_default_train)
    kurtosis['avg kurtosis'].append(
        pd.DataFrame(data=ica_transformed_data).kurt(axis=0).abs().mean())
kurtosis_df = pd.DataFrame(data=kurtosis)
kurtosis_df.to_csv('default_avg_kurtosis.csv')

num_components = 16
rp_stats = collections.defaultdict(list)
for i in range(1, num_components):
    rp_stats['num components'].append(i)
    rp = SparseRandomProjection(n_components=i)
    nnm = MLPClassifier()
    rp_nnm = Pipeline([('rp', rp), ('nnm', nnm)])
    rp_nnm.fit(X_digits_train, y_digits_train)
    accuracy_score = metrics.accuracy_score(rp_nnm.predict(X_digits_test),
                                            y_digits_test)
    rp_stats['accuracy score'].append(accuracy_score)
rp_df = pd.DataFrame(data=rp_stats)
rp_df.to_csv('digits_rp_data.csv')

num_components = 23
rp_stats = collections.defaultdict(list)
for i in range(1, num_components):
    rp_stats['num components'].append(i)
    rp = SparseRandomProjection(n_components=i)
    nnm = MLPClassifier()
Example #14
##NEURAL NETWORK
from sklearn.neural_network import MLPClassifier
#LEARNING CURVE PLOT
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split( X, Y, test_size = 0.3, random_state = 100)

###PCA
##X_train = PCA(n_components=3).fit_transform(X_train)
##X_test = PCA(n_components=3).fit_transform(X_test)

####RP
sp = SparseRandomProjection(n_components=3)
X_train = sp.fit_transform(X_train)
X_test = sp.transform(X_test)  # apply the projection fitted on the training data

mlp = MLPClassifier(activation='logistic', solver='adam', max_iter=260)
mlp.fit(X_train, y_train)  
nn_pred = mlp.predict(X_test)


def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Generate a simple plot of the test and training learning curve.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
Example #15
target_names = dataset.target_names

print(target_names)

print(dataset.images.shape)
print(dataset.data.shape)
print(dataset.target.shape)

print(H * W)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

from sklearn.random_projection import SparseRandomProjection
n_components = 80
decomposer = SparseRandomProjection(n_components=n_components).fit(X_train)

X_train_d = decomposer.transform(X_train)
X_test_d = decomposer.transform(X_test)

from sklearn.neural_network import MLPClassifier
model = MLPClassifier(hidden_layer_sizes=(1024, ),
                      batch_size=256,
                      verbose=True,
                      early_stopping=True)
model.fit(X_train_d, y_train)

y_pred = model.predict(X_test_d)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, target_names=target_names))
def main():
    global global_gen_data
    global total_length
    with open('feature_select_list.pkl', 'r') as f:
        feature_select_list = pickle.load(f)
    #pdb.set_trace()
    cores = multiprocessing.cpu_count()
    #21
    for file_number in xrange(1):
        with open('../order_100_data/order_data_chunk_' + str(file_number),
                  'r') as f:
            file_list = f.readlines()
            print('read done:' + str(file_number))
            get_all_label(file_list)
#    cores = multiprocessing.cpu_count()
#    pool = multiprocessing.Pool(processes=(cores-2))

#pdb.set_trace()
#print('length: ',len(all_label_result['usercategories']))
    cut_num = 2000
    control_feature_length(cut_num)
    #save_pickle(all_label_result,'all_label.pkl')
    #pdb.set_trace()
    for feature in total_list:
        enc, one_hot = get_all_onehot(feature, list(all_label_result[feature]))
        all_label_encoder[feature].extend([enc, one_hot])
# rewards = []
# items_id = []
# uin = []
# for file_number in range(2,16):
#     with open('../order_100_event_data/order_data_id_label_chunk_' + str(file_number), 'r') as f:
#         file_list = f.readlines()
#         #pdb.set_trace()
#         for line in file_list:
#             line_list = line.split('\t')
#             #if len(line_list) < 3:
#                 #print(line_list)
#             rewards.append(line_list[1])
#             items_id.append(line_list[0])
#             uin.append(line_list[2].strip('\n'))

    for line in cross_lines:
        cross_feat = line.strip().split()
        feat_a = cross_feat[0]
        feat_b = cross_feat[1]
        total_length += (feature_length_result[feat_a] *
                         feature_length_result[feat_b])

    srp = SparseRandomProjection(n_components=1000)
    print('total_d_length', total_length)
    for file_number in xrange(0, 4):
        rewards = []
        items_id = []
        uin = []
        with open(
                '../order_new_pool_data/order_data_id_label_chunk_' +
                str(file_number), 'r') as f:
            file_list = f.readlines()
            #pdb.set_trace()
            for line in file_list:
                line_list = line.split('\t')
                #if len(line_list) < 3:
                #print(line_list)
                rewards.append(line_list[1])
                items_id.append(line_list[0])
                uin.append(line_list[2].strip('\n'))
        with open(
                '../order_new_pool_data/order_data_chunk_' + str(file_number),
                'r') as f:
            file_list = f.readlines()
            #pdb.set_trace()
            gen_data = generate_key_value_data(file_list)
        with open('../order_new_pool_data/length_chunk_' + str(file_number),
                  'r') as f:
            cut_pool_list = pickle.load(f)
        #gen_data = gen_data[0:100]
        print('start file: ' + str(file_number))
        print('number chunk', len(cut_pool_list) / 4000)
        chunk_file_number = len(cut_pool_list) / 4000
        pdb.set_trace()
        cut_start_flag = 0
        for block_num in range(chunk_file_number):
            print('-------------------------------')
            print('strat block: ' + str(block_num + 1))
            cut_pool = cut_pool_list[block_num * 4000:(block_num + 1) * 4000]
            cut_end = sum(cut_pool)
            print('chunk_range: ', cut_start_flag, cut_end + cut_start_flag)
            data_todeal = gen_data[cut_start_flag:(cut_end + cut_start_flag)]
            rewards_todeal = rewards[cut_start_flag:(cut_end + cut_start_flag)]
            items_todeal = items_id[cut_start_flag:(cut_end + cut_start_flag)]
            uin_todeal = uin[cut_start_flag:(cut_end + cut_start_flag)]
            cut_start_flag += cut_end
            pdb.set_trace()
Example #17
 def _projector(self):
     return SparseRandomProjection(
         n_components=self.num_components,
         density=self.density,
         eps=self.eps,
         dense_output=True)
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans

breast_cancer = pd.read_csv("./breast-cancer-wisconsin.csv")
li = list(breast_cancer)
breast_cancer = pd.DataFrame(breast_cancer.values, columns=li)

Class = li[-1]

arr = breast_cancer.values
y = arr[:, -1]
X = arr[:, 0:-1]
clusters = range(2, 15)

sp = SparseRandomProjection(n_components=4)
output = sp.fit_transform(X)

tester = em.ExpectationMaximizationTestCluster(output,
                                               y,
                                               clusters=range(2, 15),
                                               plot=False,
                                               stats=True)
silhouette_EM, vmeasure_scores = tester.run()

tester = kmtc.KMeansTestCluster(output,
                                y,
                                clusters=range(2, 15),
                                plot=False,
                                stats=True)
silhouette_kmeans, V_measure = tester.run()
Example #19
def perform_feature_engineering(train, test, config):

    for c in train.columns:
        if len(train[c].value_counts()) == 2:
            if train[c].mean() < config['SparseThreshold']:
                del train[c]
                del test[c]

    col = list(test.columns)
    if config['ID'] != True:
        col.remove('ID')

    # tSVD
    if config['tSVD'] == True:
        tsvd = TruncatedSVD(n_components=config['n_comp'])
        tsvd_results_train = tsvd.fit_transform(train[col])
        tsvd_results_test = tsvd.transform(test[col])
        for i in range(1, config['n_comp'] + 1):
            train['tsvd_' + str(i)] = tsvd_results_train[:, i - 1]
            test['tsvd_' + str(i)] = tsvd_results_test[:, i - 1]
    # PCA
    if config['PCA'] == True:
        pca = PCA(n_components=config['n_comp'])
        pca2_results_train = pca.fit_transform(train[col])
        pca2_results_test = pca.transform(test[col])
        for i in range(1, config['n_comp'] + 1):
            train['pca_' + str(i)] = pca2_results_train[:, i - 1]
            test['pca_' + str(i)] = pca2_results_test[:, i - 1]
    # ICA
    if config['ICA'] == True:
        ica = FastICA(n_components=config['n_comp'])
        ica2_results_train = ica.fit_transform(train[col])
        ica2_results_test = ica.transform(test[col])
        for i in range(1, config['n_comp'] + 1):
            train['ica_' + str(i)] = ica2_results_train[:, i - 1]
            test['ica_' + str(i)] = ica2_results_test[:, i - 1]

    # GRP
    if config['GRP'] == True:
        grp = GaussianRandomProjection(n_components=config['n_comp'], eps=0.1)
        grp_results_train = grp.fit_transform(train[col])
        grp_results_test = grp.transform(test[col])
        for i in range(1, config['n_comp'] + 1):
            train['grp_' + str(i)] = grp_results_train[:, i - 1]
            test['grp_' + str(i)] = grp_results_test[:, i - 1]

    # SRP
    if config['SRP'] == True:
        srp = SparseRandomProjection(n_components=config['n_comp'],
                                     dense_output=True,
                                     random_state=420)
        srp_results_train = srp.fit_transform(train[col])
        srp_results_test = srp.transform(test[col])
        for i in range(1, config['n_comp'] + 1):
            train['srp_' + str(i)] = srp_results_train[:, i - 1]
            test['srp_' + str(i)] = srp_results_test[:, i - 1]

    if config['magic'] == True:
        magic_mat = train[['ID', 'X0', 'y']]
        magic_mat = magic_mat.groupby(['X0'])['y'].mean()
        magic_mat = pd.DataFrame({
            'X0': magic_mat.index,
            'magic': list(magic_mat)
        })
        mean_magic = magic_mat['magic'].mean()
        train = train.merge(magic_mat, on='X0', how='left')
        test = test.merge(magic_mat, on='X0', how='left')
        test['magic'] = test['magic'].fillna(mean_magic)
    return train, test
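# A hedged usage sketch, not from the original: the config dict must contain every key
# the function reads; train_df / test_df below are placeholder names.
config = {
    'SparseThreshold': 0.01,  # binary columns with a mean below this are dropped
    'ID': True,               # when not True, the 'ID' column is excluded from the decompositions
    'n_comp': 12,
    'tSVD': True, 'PCA': True, 'ICA': False, 'GRP': False, 'SRP': True,
    'magic': False,           # requires 'ID', 'X0' and 'y' columns when True
}
# train_fe, test_fe = perform_feature_engineering(train_df.copy(), test_df.copy(), config)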
Example #20
def optimize_embedding(data_matrix,
                       known_targets=None,
                       min_feature_ratio=.1,
                       n_iter=30,
                       n_repetitions=1):
    # case for sparse data matrix: use random projection to transform to dense
    if sp.issparse(data_matrix):
        logger.info('Convert sparse to dense')
        logger.info('Data matrix: %d rows  %d cols' %
                    (data_matrix.shape[0], data_matrix.shape[1]))
        from sklearn.random_projection import SparseRandomProjection
        data_matrix = SparseRandomProjection().fit_transform(
            data_matrix).toarray()
        logger.info('Data matrix: %d rows  %d cols' %
                    (data_matrix.shape[0], data_matrix.shape[1]))

    if known_targets is not None:
        logger.info('Feature selection')
        logger.info('Data matrix: %d rows  %d cols' %
                    (data_matrix.shape[0], data_matrix.shape[1]))
        new_data_matrix = iterated_semi_supervised_feature_selection(
            data_matrix, known_targets, min_feature_ratio=min_feature_ratio)
        if new_data_matrix.shape[1] > 2:
            data_matrix = new_data_matrix
        logger.info('Data matrix: %d rows  %d cols' %
                    (data_matrix.shape[0], data_matrix.shape[1]))

    n_instances = data_matrix.shape[0]
    opts_list = make_opts_list(n_instances, n_iter)
    # iterate n_iter times to find best parameter configuration
    best_score = 0
    logger.debug('neqs = neighborhood embedding quality score')
    for i in range(n_iter):
        random.seed(i)
        # sample from the options
        embed_opts = make_embed_opts(opts_list, n_instances)
        basis_opts = make_basis_opts(opts_list, n_instances)
        general_opts = make_general_opts()
        try:
            # find options with max quality score
            score_list = []
            for it in range(n_repetitions):
                data_matrix_lowdim,\
                    link_ids,\
                    score,\
                    scores = embed_(data_matrix,
                                    embed_opts=embed_opts,
                                    basis_opts=basis_opts,
                                    change_of_basis=general_opts['change_of_basis'])
                score_list.append(score)
            mean_reduced_score = np.mean(score_list) - np.std(score_list)
            if best_score == 0 or mean_reduced_score > best_score:
                # best_embed_opts = embed_opts
                # best_basis_opts = basis_opts
                # best_change_of_basis = change_of_basis
                best_data_matrix_lowdim = data_matrix_lowdim
                best_link_ids = link_ids
                best_scores = scores
                best_score = mean_reduced_score
                mark = '*'
            else:
                mark = ''
            logger.debug('..%.2d/%d   neqs: %.3f (%.3f +- %.3f)  %s' %
                         (i + 1, n_iter, mean_reduced_score, np.mean(scores),
                          np.std(scores), mark))
        except Exception as e:
            logger.debug('Failed iteration: %s' % e)
    return best_data_matrix_lowdim, best_link_ids, best_score, best_scores
Example #21
num_of_features = 6
correlated_features = np.abs(corr).sort_values(by=['Correlations'])
descriptive_features = correlated_features.iloc[
    len(correlated_features) - num_of_features:len(correlated_features)]
new_features = data[descriptive_features.index.values]
#%%
# ------------------------------------------------------------------------------ #
# Additional Visualization Models
# ------------------------------------------------------------------------------ #
#%% Principal Component Analysis
# Since the data is already a PCA of an original dataset, we visualize the first two components.
Visualize(features.to_numpy(), labels, "PCA")
#%% Sparse Random Projection
from sklearn.random_projection import SparseRandomProjection
SRP = SparseRandomProjection(n_components=2,
                             dense_output=True,
                             random_state=rand_state)
SRP_features = SRP.fit_transform(features)
Visualize(SRP_features, labels, "SRP")

#%% Gaussian Random Projection
from sklearn.random_projection import GaussianRandomProjection
GRP = GaussianRandomProjection(n_components=2, random_state=rand_state)
GRP_features = GRP.fit_transform(features)
Visualize(GRP_features, labels, "GRP")

#%% Auto Encoder
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
Example #22
# In[ ]:


def distance_correlation(X1, X2):
    assert X1.shape[0] == X2.shape[0]
    return np.corrcoef(
        pairwise_distances(X1).ravel(),
        pairwise_distances(X2).ravel())[0, 1]


# In[ ]:

tmp = defaultdict(dict)
for i, dim in product(range(10), dimensions):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    tmp[dim][i] = distance_correlation(rp.fit_transform(X_train), X_train)

tmp = pd.DataFrame(tmp).T
tmp.to_csv('./P2/IncomeRP_DistanceCorrelation.csv')

# In[ ]:

# Run Neural Networks
rp = SparseRandomProjection(random_state=5)
nn_results, clf = run_ann(dimensions, rp, X_train, Y_train)
nn_results.to_csv('./P2/IncomeRP_ANN.csv')

## test score
test_score = clf.score(X_test, Y_test)
print("Test Accuracy = ", test_score)
Example #23
def DecomposedFeatures(train,  test, val,
                                total,
                                addtrain,
                                addtest,
                                use_pca = 0.0,
                                use_tsvd = 0.0,
                                use_ica = 0.0,
                                use_fa = 0.0,
                                use_grp = 0.0,
                                use_srp = 0.0,
                                use_KPCA = 0.0,
                      kernal="rbf"):
    print("\nStart decomposition process...")
    train_decomposed = []
    test_decomposed = []
    val_decomposed = []
    
    if addtrain is not None:
        train_decomposed = [addtrain]
        val_decomposed= [val]
    if addtest is not None:
        test_decomposed = [addtest]
    
    if use_pca>0.0:
        print("PCA")
        N_COMP = int(use_pca  * train.shape[1]) +1
        pca = PCA(n_components = N_COMP, whiten=True, svd_solver="full", random_state = 42)
        pca_results = pca.fit(total)
        pca_results_train = pca.transform(train)
        pca_results_test = pca.transform(test)
        pca_results_val = pca.transform(val)
        train_decomposed.append(pca_results_train)
        test_decomposed.append(pca_results_test)
        val_decomposed.append(pca_results_val)

    if use_tsvd>0.0:
        print("tSVD")
        N_COMP = int(use_tsvd  * train.shape[1]) +1
        tsvd = TruncatedSVD(n_components = N_COMP, random_state=42)
        tsvd_results = tsvd.fit(total)
        tsvd_results_train = tsvd.transform(train)
        tsvd_results_test = tsvd.transform(test)
        tsvd_results_val = tsvd.transform(val)
        
        train_decomposed.append(tsvd_results_train)
        test_decomposed.append(tsvd_results_test)
        val_decomposed.append(tsvd_results_val)

    if use_ica>0.0:
        print("ICA")
        N_COMP = int(use_ica  * train.shape[1]) +1
        ica = FastICA(n_components = N_COMP, random_state=42)
        ica_results = ica.fit(total)
        ica_results_train = ica.transform(train)
        ica_results_test = ica.transform(test)
        ica_results_val = ica.transform(val)

        train_decomposed.append(ica_results_train)
        test_decomposed.append(ica_results_test)
        val_decomposed.append(ica_results_val)

    if use_fa>0.0:
        print("FA")
        N_COMP = int(use_fa  * train.shape[1]) +1
        fa = FactorAnalysis(n_components = N_COMP, random_state=42)
        fa_results = fa.fit(total)
        fa_results_train = fa.transform(train)
        fa_results_test = fa.transform(test)
        fa_results_val = fa.transform(val)
        
        train_decomposed.append(fa_results_train)
        test_decomposed.append(fa_results_test)
        val_decomposed.append(fa_results_val)

    if use_grp>0.0 or use_grp<0.0:
        print("GRP")
        if use_grp>0.0:
            N_COMP = int(use_grp  * train.shape[1]) +1
            eps=10
        if use_grp<0.0:
            N_COMP = "auto"
            eps=abs(use_grp)
        grp = GaussianRandomProjection(n_components = N_COMP, eps=eps, random_state=42)
        grp_results = grp.fit(total)
        grp_results_train = grp.transform(train)
        grp_results_test = grp.transform(test)
        grp_results_val = grp.transform(val)
      
        train_decomposed.append(grp_results_train)
        test_decomposed.append(grp_results_test)
        val_decomposed.append(grp_results_val)
        

    if use_srp>0.0:
        print("SRP")
        N_COMP = int(use_srp  * train.shape[1]) +1
        srp = SparseRandomProjection(n_components = N_COMP, dense_output=True, random_state=42)
        srp_results = srp.fit(total)
        srp_results_train = srp.transform(train)
        srp_results_test = srp.transform(test)
        srp_results_val = srp.transform(val)

        train_decomposed.append(srp_results_train)
        test_decomposed.append(srp_results_test)
        val_decomposed.append(srp_results_val)

    if use_KPCA >0.0:
        print("KPCA")
        N_COMP = int(use_KPCA  * train.shape[1]) +1
        #N_COMP = None
        pls = KernelPCA(n_components = N_COMP,kernel=kernal)
        pls_results = pls.fit(total)
        pls_results_train = pls.transform(train)
        pls_results_test = pls.transform(test)
        pls_results_val = pls.transform(val)
        train_decomposed.append(pls_results_train)
        test_decomposed.append(pls_results_test)
        val_decomposed.append(pls_results_val)
        gc.collect()
        
    print("Append decomposition components together...")

    train_decomposed = np.concatenate(train_decomposed, axis=1)
    test_decomposed = np.concatenate( test_decomposed, axis=1)
    val_decomposed = np.concatenate( val_decomposed, axis=1)

    train_with_only_decomposed_features = pd.DataFrame(train_decomposed)
    test_with_only_decomposed_features = pd.DataFrame(test_decomposed)
    val_with_only_decomposed_features = pd.DataFrame(val_decomposed)

    #for agg_col in ['sum', 'var', 'mean', 'median', 'std', 'weight_count', 'count_non_0', 'num_different', 'max', 'min']:
    #    train_with_only_decomposed_features[col] = train[col]
    #    test_with_only_decomposed_features[col] = test[col]

    # Remove any NA
    train_with_only_decomposed_features = train_with_only_decomposed_features.fillna(0)
    test_with_only_decomposed_features = test_with_only_decomposed_features.fillna(0)
    val_with_only_decomposed_features  = val_with_only_decomposed_features.fillna(0)
    return train_with_only_decomposed_features, test_with_only_decomposed_features, val_with_only_decomposed_features
Example #24
def apply_band_selection(technique, dataset, predictions, mode, n_components,
                         df_column_entry_dict):
    if df_column_entry_dict is None:
        df_column_entry_dict = {
        }  # couldn't care less, this is a lazy way to make all accesses work

    print("Dataset current shape: " + str(dataset.shape))

    print_memory_metrics("before applying band selection method " + technique,
                         df_column_entry_dict)

    from DeepHyperX.batch import PARAMETER_JSON
    parameterFile = open(PARAMETER_JSON, "r")
    import json
    data = json.load(parameterFile)
    parameterFile.close()

    if technique in ["IncrementalPCA"]:  # requires special method
        dataset, _ = applyIncrementalPCA(dataset, n_components)

    elif technique in data["image_compression"]["extraction"]["techniques"]:

        extraction_object = None
        if technique == "PCA":
            from sklearn.decomposition import PCA
            """ HybridSN: Exploring 3D-2D CNN Feature Hierarchy for Hyperspectral Image Classification
            Source code used: https://github.com/gokriznastic/HybridSN/blob/master/Hybrid-Spectral-Net.ipynb
            Paper: https://arxiv.org/abs/1902.06701
            Good parameters: 30 components for Indian Pines, 15 for Salinas and Pavia University
            """
            extraction_object = PCA(n_components=n_components, whiten=True)
        elif technique == "KernelPCA":
            from sklearn.decomposition import KernelPCA
            extraction_object = KernelPCA(kernel="rbf",
                                          n_components=n_components,
                                          gamma=None,
                                          fit_inverse_transform=True,
                                          n_jobs=1)
        elif technique == "SparsePCA":
            """Sparse PCA uses the links between the ACP and the SVD to extract the main components by solving a lower-order matrix approximation problem."""
            from sklearn.decomposition import SparsePCA
            extraction_object = SparsePCA(n_components=n_components,
                                          alpha=0.0001,
                                          n_jobs=-1)
        elif technique == "LDA":  # only supervised is supported, y is required
            if mode != "supervised":
                print(
                    "warning: mode other than supervised detected for lda, setting it to supervised...\n"
                )
                mode = "supervised"
            # maximally n_classes - 1 columns, https://stackoverflow.com/questions/26963454/lda-ignoring-n-components
            from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
            extraction_object = LinearDiscriminantAnalysis(
                n_components=n_components)
        elif technique == "SVD":
            from sklearn.decomposition import TruncatedSVD
            extraction_object = TruncatedSVD(n_components=n_components,
                                             algorithm='randomized',
                                             n_iter=5)
        elif technique == "GRP":
            from sklearn.random_projection import GaussianRandomProjection
            extraction_object = GaussianRandomProjection(
                n_components=n_components, eps=0.5)
        elif technique == "SRP":
            from sklearn.random_projection import SparseRandomProjection
            extraction_object = SparseRandomProjection(
                n_components=n_components,
                density='auto',
                eps=0.5,
                dense_output=False)
        elif technique == "MDS":
            """O(n^3), uses lots of memory for distance matrix (doesn't fit in 48GB), doesn't fit in GPU memory either, so basically unusable"""
            from sklearn.manifold import MDS
            extraction_object = MDS(n_components=n_components,
                                    n_init=12,
                                    max_iter=200,
                                    metric=True,
                                    n_jobs=16)
        elif technique == "MiniBatch":
            """takes too long"""
            from sklearn.decomposition import MiniBatchDictionaryLearning
            extraction_object = MiniBatchDictionaryLearning(
                n_components=n_components, batch_size=200, alpha=1, n_iter=1)
        elif technique == "LLE":
            # modified LLE requires n_neighbors >= n_components
            """execution takes 20 minutes or so, but it does work, just takes a long time"""
            from sklearn.manifold import LocallyLinearEmbedding
            extraction_object = LocallyLinearEmbedding(
                n_components=n_components,
                n_neighbors=100,
                method='modified',
                n_jobs=4)
        elif technique == "ICA":
            from sklearn.decomposition import FastICA
            extraction_object = FastICA(n_components=n_components,
                                        algorithm='parallel',
                                        whiten=True,
                                        max_iter=100)
        elif technique == "FactorAnalysis":
            from sklearn.decomposition import FactorAnalysis
            extraction_object = FactorAnalysis(n_components=n_components)  #75
        elif technique == "ISOMAP":
            from sklearn import manifold
            extraction_object = manifold.Isomap(n_neighbors=5,
                                                n_components=n_components,
                                                n_jobs=-1)
        elif technique == "t-SNE":
            # like PCA, but non-linear (pca is linear)
            from sklearn.manifold import TSNE
            extraction_object = TSNE(n_components=n_components,
                                     learning_rate=300,
                                     perplexity=30,
                                     early_exaggeration=12,
                                     init='random')
        elif technique == "UMAP":
            # install umap-learn for this to work
            import umap
            extraction_object = umap.UMAP(n_neighbors=50,
                                          min_dist=0.3,
                                          n_components=n_components)
        elif technique == "NMF":
            # https://www.kaggle.com/remidi/dimensionality-reduction-techniques
            from sklearn.decomposition import NMF
            extraction_object = NMF(n_components=n_components,
                                    init='nndsvdar',
                                    random_state=420)
        elif technique == "F*G":
            # super fast and nice
            from sklearn.cluster import FeatureAgglomeration
            extraction_object = FeatureAgglomeration(n_clusters=n_components,
                                                     linkage='ward')
        else:
            raise ValueError("Unknown feature extraction technique: " +
                             technique)

        start_mem_measurement()
        start = time.time()

        dataset, _ = applyFeatureExtraction(
            dataset,
            predictions,
            extraction_object,
            mode,
            merged=(len(dataset.shape) == 4 and len(predictions.shape) == 3))

        time_elapse = time.time() - start

        event = 'applying band selection method (EXTRACTION) ' + technique
        formatted_time = str(timedelta(seconds=time_elapse))
        df_column_entry_dict['Time measurement at ' + event +
                             ' [s]'] = time_elapse

        print("\n" + event + " took " + formatted_time + " seconds\n")

        event = "after applying band selection method " + technique
        stop_mem_measurement(event, df_column_entry_dict)
        print_memory_metrics(event, df_column_entry_dict)

    elif technique in data["image_compression"]["selection"]["techniques"]:

        selection_object = None
        if technique == "RandomForest":
            # Random forests or random decision forests are an ensemble learning method for classification, regression and other
            # tasks that operates by constructing a multitude of decision trees at training time and outputting the class that is the mode of the classes (classification) or mean prediction (regression) of the individual trees.[1][2] Random decision forests correct for decision trees' habit of overfitting to their training set.[3]:587–588 https://en.wikipedia.org/wiki/Random_forest
            from sklearn.ensemble import RandomForestClassifier
            selection_object = RandomForestClassifier()
        elif technique == "LogisticRegression":
            from sklearn.linear_model import LogisticRegression
            selection_object = LogisticRegression()
        elif technique == "LinearRegression":
            from sklearn.linear_model import LinearRegression
            selection_object = LinearRegression()
        elif technique == "LightGBM":
            from lightgbm import LGBMClassifier
            selection_object = LGBMClassifier()
        else:
            raise ValueError("Unknown feature selection technique: " +
                             technique)

        start_mem_measurement()
        start = time.time()

        dataset, _ = applyFeatureSelection(
            dataset,
            predictions,
            selection_object,
            n_components,
            mode,
            merged=(len(dataset.shape) == 4 and len(predictions.shape) == 3))

        time_elapse = time.time() - start

        event = 'applying band selection method (SELECTION) ' + technique
        formatted_time = str(timedelta(seconds=time_elapse))
        df_column_entry_dict['Time measurement at ' + event +
                             ' [s]'] = time_elapse

        print("\n" + event + " took " + formatted_time + " seconds\n")

        event = "after applying band selection method " + technique
        stop_mem_measurement(event, df_column_entry_dict)
        print_memory_metrics(event, df_column_entry_dict)

    print("Dataset new shape: " + str(dataset.shape))

    return dataset
Example #25
X, y = data.iloc[:, :-1], data.iloc[:, -1]
X.columns = colnames[:len(colnames) - 1]

print(johnson_lindenstrauss_min_dim(4601, eps=0.1))

split = train_test_split(X, y, test_size=0.3, random_state=42)
(trainData, testData, trainTarget, testTarget) = split
accuracies = []
components = np.int32(np.linspace(2, 56, 14))
model = LinearSVC()
model.fit(trainData, trainTarget)
baseline = metrics.accuracy_score(model.predict(testData), testTarget)
# loop over the projection sizes
for comp in components:
    # create the random projection
    sp = SparseRandomProjection(n_components=comp)
    X_new = sp.fit_transform(trainData)

    # train a classifier on the sparse random projection
    model = LinearSVC()
    model.fit(X_new, trainTarget)

    # evaluate the model and update the list of accuracies
    test = sp.transform(testData)
    accuracies.append(metrics.accuracy_score(model.predict(test), testTarget))

# create the figure
plt.figure()
plt.suptitle("Accuracy of Sparse Projection on Spam")
plt.xlabel("# of Components")
plt.ylabel("Accuracy")
Example #26
import pandas as pd
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import FactorAnalysis, PCA, TruncatedSVD, FastICA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection

# unpickle() and current_time() are project-local helpers that are not shown here

df = unpickle("../processed/v003/acsf_feat.pkl")

SEED = 71
N_COMP = 5
num_clusters2 = 5

fa = FactorAnalysis(n_components=N_COMP)
pca = PCA(n_components=N_COMP, random_state=SEED)
tsvd = TruncatedSVD(n_components=N_COMP, random_state=SEED)
ica = FastICA(n_components=N_COMP, random_state=SEED)
grp = GaussianRandomProjection(n_components=N_COMP, eps=0.1, random_state=SEED)
srp = SparseRandomProjection(n_components=N_COMP,
                             dense_output=True,
                             random_state=SEED)
mbkm = MiniBatchKMeans(n_clusters=num_clusters2, random_state=SEED)
tsne = TSNE(n_components=3, random_state=SEED)

ss = StandardScaler()
df_ss = pd.DataFrame(ss.fit_transform(df.iloc[:, 2:]), columns=df.columns[2:])

decomp_cols = []
comp_results = []
comp_names = ["fa", "pca", "tsvd", "ica", "grp", "srp",
              "mbkm"]  #, "tsne"] # removing tsne
for name, transform in zip(comp_names,
                           [fa, pca, tsvd, ica, grp, srp, mbkm, tsne]):
    print(current_time(), "{} converting...".format(name), flush=True)
    n_components = N_COMP
Example #27

def main():

    out = './BASE/'
    cmap = cm.get_cmap('Spectral')

    np.random.seed(0)
    letter = pd.read_hdf('./BASE/datasets.hdf', 'letter')
    letterX = letter.drop('Class', 1).copy().values
    letterY = letter['Class'].copy().values

    madelon = pd.read_hdf('./BASE/datasets.hdf', 'madelon')
    madelonX = madelon.drop('Class', 1).copy().values
    madelonY = madelon['Class'].copy().values

    madelonX = StandardScaler().fit_transform(madelonX)
    letterX = StandardScaler().fit_transform(letterX)

    clusters = [2, 5, 10, 15, 20, 25, 30, 35, 40]
    dims = [2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60]
    dims2 = [2, 4, 6, 8, 10, 12, 14, 16]
    #raise
    #%% data for 1

    tmp = defaultdict(dict)
    for i, dim in product(range(10), dims):
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(madelonX), madelonX)
    tmp = pd.DataFrame(tmp).T
    tmp.to_csv(out + 'madelon scree1.csv')

    tmp = defaultdict(dict)
    for i, dim in product(range(10), dims2):
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(letterX), letterX)
    tmp = pd.DataFrame(tmp).T
    tmp.to_csv(out + 'letter scree1.csv')

    tmp = defaultdict(dict)
    for i, dim in product(range(10), dims):
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        rp.fit(madelonX)
        tmp[dim][i] = reconstructionError(rp, madelonX)
    tmp = pd.DataFrame(tmp).T
    tmp.to_csv(out + 'madelon scree2.csv')

    tmp = defaultdict(dict)
    for i, dim in product(range(10), dims2):
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        rp.fit(letterX)
        tmp[dim][i] = reconstructionError(rp, letterX)
    tmp = pd.DataFrame(tmp).T
    tmp.to_csv(out + 'letter scree2.csv')
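    # Note: pairwiseDistCorr and reconstructionError come from a project-local helpers
    # module that is not shown here. Assumed sketches of what they typically compute
    # (correlation of pairwise distances before/after projection, and the mean squared
    # error of reconstructing X from the projection via the pseudo-inverse of the
    # projection matrix):
    #
    #     from scipy import sparse
    #     from sklearn.metrics import pairwise_distances
    #
    #     def pairwiseDistCorr(X1, X2):
    #         d1 = pairwise_distances(X1).ravel()
    #         d2 = pairwise_distances(X2).ravel()
    #         return np.corrcoef(d1, d2)[0, 1]
    #
    #     def reconstructionError(projector, X):
    #         W = projector.components_
    #         W = W.toarray() if sparse.issparse(W) else W
    #         p = np.linalg.pinv(W)
    #         reconstructed = ((p @ W) @ X.T).T
    #         return np.nanmean(np.square(X - reconstructed))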

    #%% Data for 2

    grid = {
        'rp__n_components': dims,
        'NN__alpha': nn_reg,
        'NN__hidden_layer_sizes': nn_arch
    }
    rp = SparseRandomProjection(random_state=5)
    mlp = MLPClassifier(activation='relu',
                        max_iter=2000,
                        early_stopping=True,
                        random_state=5)
    pipe = Pipeline([('rp', rp), ('NN', mlp)])
    gs = GridSearchCV(pipe, grid, verbose=10, cv=5)

    gs.fit(madelonX, madelonY)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + 'Madelon dim red.csv')

    grid = {
        'rp__n_components': dims2,
        'NN__alpha': nn_reg,
        'NN__hidden_layer_sizes': nn_arch
    }
    rp = SparseRandomProjection(random_state=5)
    mlp = MLPClassifier(activation='relu',
                        max_iter=2000,
                        early_stopping=True,
                        random_state=5)
    pipe = Pipeline([('rp', rp), ('NN', mlp)])
    gs = GridSearchCV(pipe, grid, verbose=10, cv=5)

    gs.fit(letterX, letterY)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + 'letter dim red.csv')
    #raise
    #%% data for 3
    # Set this from chart 2 and dump, use clustering script to finish up
    dim = 60
    rp = SparseRandomProjection(n_components=dim, random_state=5)

    madelonX2 = rp.fit_transform(madelonX)
    madelon2 = pd.DataFrame(np.hstack((madelonX2, np.atleast_2d(madelonY).T)))
    cols = list(range(madelon2.shape[1]))
    cols[-1] = 'Class'
    madelon2.columns = cols
    madelon2.to_hdf(out + 'datasets.hdf',
                    'madelon',
                    complib='blosc',
                    complevel=9)
    #
    dim = 16
    rp = SparseRandomProjection(n_components=dim, random_state=5)
    letterX2 = rp.fit_transform(letterX)
    letter2 = pd.DataFrame(np.hstack((letterX2, np.atleast_2d(letterY).T)))
    cols = list(range(letter2.shape[1]))
    cols[-1] = 'Class'
    letter2.columns = cols
    letter2.to_hdf(out + 'datasets.hdf',
                   'letter',
                   complib='blosc',
                   complevel=9)
Example #28
# In[4]:


from sklearn.decomposition import FactorAnalysis

fa = FactorAnalysis(n_components=100, random_state=42)
X_fa = fa.fit_transform(X)


# In[5]:


from sklearn.random_projection import SparseRandomProjection

srp = SparseRandomProjection(n_components=100, random_state=42)
X_srp = srp.fit_transform(X)


# In[6]:


from sklearn.random_projection import GaussianRandomProjection

grp = GaussianRandomProjection(n_components=100, random_state=42)
X_grp = grp.fit_transform(X)
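# Note: GaussianRandomProjection draws a dense Gaussian projection matrix, whereas
# SparseRandomProjection above uses a sparse sign-based matrix that is cheaper in
# memory and compute for the same target dimensionality.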


# In[7]:

Example #29
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.random_projection import SparseRandomProjection

# Upload and clean our data
data = pd.read_csv('avocado.csv', index_col = 'Date')
data = data[data.columns[:-2]]
data['type'] = data['type'].replace({'conventional':0, 'organic':1})
print(data.head())

# Fit PCA and apply transformation
X = data.iloc[:,:-1].values
pca = PCA(n_components=2).fit(X)
X_t = pca.transform(X)

'''
# Plotting PCA transformed avocado dataset
plt.scatter(X_t[:,0],X_t[:,1])
plt.show()
'''

# We can also fit random projection!
rp = SparseRandomProjection() # n_components and epsilon are chosen for us
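# With the defaults n_components='auto' and eps=0.1, fit_transform picks the output
# dimensionality from the number of samples via johnson_lindenstrauss_min_dim.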
X2 = np.random.rand(100, 10000)  # a very large random matrix (the avocado data has only 10 features, so RP is not needed there)
X_rp = rp.fit_transform(X2)
print(X_rp.shape)

'''
# Plotting RP transformed random dataset
plt.scatter(X_rp[:,0],X_rp[:,1])
plt.show()
'''
Example #30
from numpy.lib.function_base import _interp_dispatcher  # (unused in this snippet)
import numpy as np
# from skmultiflow.trees import HoeffdingTree as HT
from skmultiflow.lazy import SAMKNN
from sklearn.metrics import accuracy_score
import time, copy
from sklearn.random_projection import SparseRandomProjection
from sklearn.metrics import cohen_kappa_score
# from skmultiflow.bayes import NaiveBayes
from inc_pca import IncPCA
from rff_base import Base as RFF
from rrslvq import ReactiveRobustSoftLearningVectorQuantization as RRSLVQ
from rslvq import RSLVQ

from skmultiflow.meta import AdaptiveRandomForest as ARF

transformer = SparseRandomProjection(n_components=1000)
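# The projection (presumably applied to each incoming chunk later in the original
# script) maps the high-dimensional skip-gram embeddings down to 1000 features
# before they reach the streaming classifiers imported above.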
classes = np.arange(0, 15, 1)

res_file = 'res_pca_skipgram.txt'
f = open(res_file, 'a+')
f.write('SKIP-GRAM\n')
f.close()
data = np.load('../dataset/skip-gram-embed-w-label.npy')

# f = open('data/nasdaq_stream_wo_sentiment.csv')
# labels = []
# while 1:
#    line = f.readline()
#    if line == '': break
#    arr = np.array(line.split(','), dtype='float64')
#    labels.append(arr[1])