Example #1
def part2():
    # Pairwise distance correlation between original and RP-projected data,
    # swept over 10 random seeds per target dimensionality
    tmp = defaultdict(dict)
    for i, dim in product(range(10), range(1, 31)):
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(cancer_x), cancer_x)
    tmp = pd.DataFrame(tmp).T
    tmp.to_csv(out + 'cancer part2.csv')

    tmp = defaultdict(dict)
    for i, dim in product(range(10), dims_big):
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(housing_x), housing_x)
    tmp = pd.DataFrame(tmp).T
    tmp.to_csv(out + 'housing part2.csv')

    # Reconstruction error for the same seeds and dimensionalities
    tmp = defaultdict(dict)
    for i, dim in product(range(10), range(1, 31)):
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        rp.fit(cancer_x)
        tmp[dim][i] = reconstructionError(rp, cancer_x)
    tmp = pd.DataFrame(tmp).T
    # Distinct file name so the distance-correlation CSV above is not overwritten
    tmp.to_csv(out + 'cancer part2 reconstruction.csv')

    tmp = defaultdict(dict)
    for i, dim in product(range(10), dims_big):
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        rp.fit(housing_x)
        tmp[dim][i] = reconstructionError(rp, housing_x)
    tmp = pd.DataFrame(tmp).T
    tmp.to_csv(out + 'housing part2 reconstruction.csv')
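pairwiseDistCorr and reconstructionError above are not scikit-learn functions but project helpers defined elsewhere in these assignments. A minimal sketch of what pairwiseDistCorr plausibly looks like, assuming it correlates pairwise distances before and after projection (hypothetical implementation, not the original helper):

import numpy as np
from sklearn.metrics.pairwise import pairwise_distances

def pairwiseDistCorr(X1, X2):
    # Hypothetical helper: correlation between the flattened pairwise-distance
    # matrices of the projected data (X1) and the original data (X2).
    assert X1.shape[0] == X2.shape[0]
    d1 = pairwise_distances(X1)
    d2 = pairwise_distances(X2)
    return np.corrcoef(d1.ravel(), d2.ravel())[0, 1]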
Example #2
nn_results = run_NN(dims_spam, ica, spamX, spamY)
nn_results.to_csv('./P4_Neural_Networks_Reduced/spam_ICA_nn_results.csv')

ica = FastICA(random_state=5)
nn_results = run_NN(dims_letter, ica, letterX, letterY)
nn_results.to_csv('./P4_Neural_Networks_Reduced/letter_ICA_nn_results.csv')

#%% Part 2C & 4C - Run Dimensionality Reduction Algorithm RP, Run NN with reduced dims

print(
    'Part 2C - Starting RP, pairwise distance correlation, for spam dataset...'
)
tmp = defaultdict(dict)
for i, dim in product(range(10), dims_spam):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(spamX), spamX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv('./P2_Dimensionality_Reduction/spam_RP_pairwise_distance_corr.csv')

print(
    'Part 2C - Starting RP, pairwise distance correlation, for letter dataset...'
)
tmp = defaultdict(dict)
for i, dim in product(range(1), dims_letter):  # note: only a single seed here, unlike the 10-seed spam sweep above
    print(dim)
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(letterX), letterX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(
    './P2_Dimensionality_Reduction/letter_RP_pairwise_distance_corr.csv')
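Each CSV written above has one row per n_components and one column per random seed. As a follow-up, the per-seed results can be summarized the way Example #14 below does, by taking the mean and standard deviation across seeds (a sketch only; the path is the spam file written above):

import pandas as pd

corr = pd.read_csv(
    './P2_Dimensionality_Reduction/spam_RP_pairwise_distance_corr.csv',
    index_col=0)
corr_mean = corr.mean(axis=1)  # average distance preservation per dimensionality
corr_std = corr.std(axis=1)    # spread across the random seeds
print(pd.concat([corr_mean.rename('mean'), corr_std.rename('std')], axis=1))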
Example #3
madelon = pd.read_hdf('./BASE/datasets.hdf', 'madelon')
madelonX = madelon.drop('Class', 1).copy().values
madelonY = madelon['Class'].copy().values

madelonX = StandardScaler().fit_transform(madelonX)
digitsX = StandardScaler().fit_transform(digitsX)

clusters = [2, 5, 10, 15, 20, 25, 30, 35, 40]
dims = [2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60]
#raise
#%% data for 1

tmp = defaultdict(dict)
for i, dim in product(range(10), dims):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(madelonX), madelonX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'madelon scree1.csv')

tmp = defaultdict(dict)
for i, dim in product(range(10), dims):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(digitsX), digitsX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'digits scree1.csv')

tmp = defaultdict(dict)
for i, dim in product(range(10), dims):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    rp.fit(madelonX)
    tmp[dim][i] = reconstructionError(rp, madelonX)
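Like pairwiseDistCorr, reconstructionError used above is a project helper rather than a library call. A hedged sketch of a typical implementation, which maps the data back through the pseudo-inverse of the projection matrix and reports the mean squared error (assumed implementation, shown only to make the snippets self-explanatory):

import numpy as np
import scipy.sparse as sps
from numpy.linalg import pinv

def reconstructionError(projections, X):
    # Hypothetical helper: assumes a fitted random projection whose
    # components_ matrix has shape (n_components, n_features).
    W = projections.components_
    if sps.issparse(W):
        W = np.asarray(W.todense())
    reconstructed = (pinv(W) @ (W @ X.T)).T  # project down, then back up
    return np.nanmean(np.square(X - reconstructed))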
Example #4
def main():

    out = './BASE/'
    cmap = cm.get_cmap('Spectral')

    np.random.seed(0)
    letter = pd.read_hdf('./BASE/datasets.hdf', 'letter')
    letterX = letter.drop('Class', 1).copy().values
    letterY = letter['Class'].copy().values

    madelon = pd.read_hdf('./BASE/datasets.hdf', 'madelon')
    madelonX = madelon.drop('Class', 1).copy().values
    madelonY = madelon['Class'].copy().values

    madelonX = StandardScaler().fit_transform(madelonX)
    letterX = StandardScaler().fit_transform(letterX)

    clusters = [2, 5, 10, 15, 20, 25, 30, 35, 40]
    dims = [2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60]
    dims2 = [2, 4, 6, 8, 10, 12, 14, 16]
    #raise
    #%% data for 1

    tmp = defaultdict(dict)
    for i, dim in product(range(10), dims):
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(madelonX), madelonX)
    tmp = pd.DataFrame(tmp).T
    tmp.to_csv(out + 'madelon scree1.csv')

    tmp = defaultdict(dict)
    for i, dim in product(range(10), dims2):
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(letterX), letterX)
    tmp = pd.DataFrame(tmp).T
    tmp.to_csv(out + 'letter scree1.csv')

    tmp = defaultdict(dict)
    for i, dim in product(range(10), dims):
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        rp.fit(madelonX)
        tmp[dim][i] = reconstructionError(rp, madelonX)
    tmp = pd.DataFrame(tmp).T
    tmp.to_csv(out + 'madelon scree2.csv')

    tmp = defaultdict(dict)
    for i, dim in product(range(10), dims2):
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        rp.fit(letterX)
        tmp[dim][i] = reconstructionError(rp, letterX)
    tmp = pd.DataFrame(tmp).T
    tmp.to_csv(out + 'letter scree2.csv')

    #%% Data for 2

    grid = {
        'rp__n_components': dims,
        'NN__alpha': nn_reg,
        'NN__hidden_layer_sizes': nn_arch
    }
    rp = SparseRandomProjection(random_state=5)
    mlp = MLPClassifier(activation='relu',
                        max_iter=2000,
                        early_stopping=True,
                        random_state=5)
    pipe = Pipeline([('rp', rp), ('NN', mlp)])
    gs = GridSearchCV(pipe, grid, verbose=10, cv=5)

    gs.fit(madelonX, madelonY)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + 'Madelon dim red.csv')

    grid = {
        'rp__n_components': dims2,
        'NN__alpha': nn_reg,
        'NN__hidden_layer_sizes': nn_arch
    }
    rp = SparseRandomProjection(random_state=5)
    mlp = MLPClassifier(activation='relu',
                        max_iter=2000,
                        early_stopping=True,
                        random_state=5)
    pipe = Pipeline([('rp', rp), ('NN', mlp)])
    gs = GridSearchCV(pipe, grid, verbose=10, cv=5)

    gs.fit(letterX, letterY)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + 'letter dim red.csv')
    #raise
    #%% data for 3
    # Set this from chart 2 and dump, use clustering script to finish up
    dim = 60
    rp = SparseRandomProjection(n_components=dim, random_state=5)

    madelonX2 = rp.fit_transform(madelonX)
    madelon2 = pd.DataFrame(np.hstack((madelonX2, np.atleast_2d(madelonY).T)))
    cols = list(range(madelon2.shape[1]))
    cols[-1] = 'Class'
    madelon2.columns = cols
    # Note: this writes the RP-reduced data back into the same ./BASE/datasets.hdf
    # read above, replacing the original 'madelon' table (and 'letter' below).
    madelon2.to_hdf(out + 'datasets.hdf',
                    'madelon',
                    complib='blosc',
                    complevel=9)
    #
    dim = 16
    rp = SparseRandomProjection(n_components=dim, random_state=5)
    letterX2 = rp.fit_transform(letterX)
    letter2 = pd.DataFrame(np.hstack((letterX2, np.atleast_2d(letterY).T)))
    cols = list(range(letter2.shape[1]))
    cols[-1] = 'Class'
    letter2.columns = cols
    letter2.to_hdf(out + 'datasets.hdf',
                   'letter',
                   complib='blosc',
                   complevel=9)
Example #5
wineX = StandardScaler().fit_transform(wineX)
cancerX = StandardScaler().fit_transform(cancerX)

clusters = range(2, 10)

dims_wine = range(1, 12)
dims_cancer = range(1, 10)

#raise
#%% data for 1

tmp = defaultdict(dict)
for i, dim in product(range(10), dims_wine):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(wineX), wineX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'wine scree1.csv')

tmp = defaultdict(dict)
for i, dim in product(range(10), dims_cancer):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(cancerX), cancerX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'cancer scree1.csv')

tmp = defaultdict(dict)
for i, dim in product(range(10), dims_wine):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    rp.fit(wineX)
    tmp[dim][i] = reconstructionError(rp, wineX)
Example #6
    ica.set_params(n_components=dim)
    tmp = ica.fit_transform(creditX)
    tmp = pd.DataFrame(tmp)
    tmp = tmp.kurt(axis=0)
    kurt[dim] = tmp.abs().mean()

kurt = pd.Series(kurt)
kurt.to_csv(out + 'ICA/credit_scree.csv', header=False)
#raise

# Randomized projections ========================

tmp = defaultdict(dict)
for i, dim in product(range(10), wine_dims):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(wineX), wineX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'RP/wine_scree1.csv')

tmp = defaultdict(dict)
for i, dim in product(range(10), credit_dims):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(creditX), creditX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'RP/credit_scree1.csv')

tmp = defaultdict(dict)
for i, dim in product(range(10), wine_dims):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    rp.fit(wineX)
    tmp[dim][i] = reconstructionError(rp, wineX)
Example #7
                                   sep=',',
                                   header=None)

wineX = StandardScaler().fit_transform(wineX)
digitX = StandardScaler().fit_transform(digitX)

clusters = [2, 5, 10, 15, 20, 25, 30, 35, 40]
dims = [2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60]
dims_wine = [i for i in range(2, 12)]

# data for 1

tmp = defaultdict(dict)
for i, dim in product(range(10), dims_wine):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(wineX), wineX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'wine scree1.csv')

tmp = defaultdict(dict)
for i, dim in product(range(10), dims):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(digitX), digitX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'digit scree1.csv')

tmp = defaultdict(dict)
for i, dim in product(range(10), dims_wine):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    rp.fit(wineX)
    tmp[dim][i] = reconstructionError(rp, wineX)
Example #8
biodegY = biodeg['Class'].copy().values


biodegX = StandardScaler().fit_transform(biodegX)
digitsX = StandardScaler().fit_transform(digitsX)

clusters = [2, 5, 10, 15, 20, 25, 30, 35, 40]
dims = [2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60]
dimsb = [2, 5, 7, 10, 15, 20, 25, 30, 35]
#raise
#%% data for 1

tmp = defaultdict(dict)
for i, dim in product(range(10), dimsb):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(biodegX), biodegX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'biodeg scree1.csv')

tmp = defaultdict(dict)
for i, dim in product(range(10), dims):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(digitsX), digitsX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'digits scree1.csv')

tmp = defaultdict(dict)
for i, dim in product(range(10), dimsb):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
Example #9
from matplotlib import cm
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.random_projection import SparseRandomProjection, GaussianRandomProjection
from itertools import product

out = './results/randomized_projections/'

perm_x, perm_y, housing_x, housing_y = load_data()  # perm, housing
raise Exception('Remove this line to run code')
#2
print(1)
tmp = defaultdict(dict)
for i, dim in product(range(10), dims):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(perm_x), perm_x)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'perm scree1.csv')

print(2)
tmp = defaultdict(dict)
for i, dim in product(range(10), dims_big):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(housing_x), housing_x)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'housing scree1.csv')

print(3)
tmp = defaultdict(dict)
for i, dim in product(range(10), dims):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
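The snippet above is cut off mid-sweep, and GaussianRandomProjection is imported but never used in the visible part. It is a drop-in alternative with the same fit/transform interface, using a dense Gaussian projection matrix instead of a sparse one; a self-contained sketch on stand-in data:

import numpy as np
from sklearn.random_projection import GaussianRandomProjection

rng = np.random.RandomState(0)
X_demo = rng.rand(100, 50)          # stand-in data, purely for illustration
grp = GaussianRandomProjection(n_components=10, random_state=0)
X_proj = grp.fit_transform(X_demo)  # same API as SparseRandomProjection
print(X_proj.shape)                 # (100, 10)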
Example #10
contraX = StandardScaler().fit_transform(contraX)
cancerX = StandardScaler().fit_transform(cancerX)

clusters = range(2, 10)

dims_contra = range(1, 12)
dims_cancer = range(1, 10)

#raise
#%% data for 1

tmp = defaultdict(dict)
for i, dim in product(range(10), dims_contra):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(contraX), contraX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'contra scree1.csv')

tmp = defaultdict(dict)
for i, dim in product(range(10), dims_cancer):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(cancerX), cancerX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'cancer scree1.csv')

tmp = defaultdict(dict)
for i, dim in product(range(10), dims_contra):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    rp.fit(contraX)
    tmp[dim][i] = reconstructionError(rp, contraX)
Example #11
bcY = bc['diagnosis'].copy().values


faultsX = StandardScaler().fit_transform(faultsX)
bcX = StandardScaler().fit_transform(bcX)

clusters = [2, 3, 4, 5, 6, 7, 8, 9, 10]
dims = [2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 15]
#raise

#%% data for 1
tmp = defaultdict(dict)
for i, dim in product(range(10), dims):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(faultsX), faultsX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out1 + 'faults scree1.csv')

tmp = defaultdict(dict)
for i, dim in product(range(10), dims):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(bcX), bcX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out1 + 'bc scree1.csv')

tmp = defaultdict(dict)
for i, dim in product(range(10), dims):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
Example #12
    return ratio


# Breast Cancer Dataset
br = pd.read_csv('./BASE/breast.csv')
brX = br.drop('Class', 1).copy().values
brY = br['Class'].copy().values
brX = StandardScaler().fit_transform(brX)

cluster_range = range(1, 11)
dims = range(1, 30)

tmp = defaultdict(dict)
for i, dim in product(range(10), dims):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(brX), brX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv('./RP/breast_scree.csv')

ratio = calculate_accuracy(brX, brY, range(1, 30))
barplot_breast(ratio, range(1, 30))

dim = 10
rp = SparseRandomProjection(n_components=dim, random_state=5)

brX2 = rp.fit_transform(brX)
br2 = pd.DataFrame(np.hstack((brX2, np.atleast_2d(brY).T)))
cols = list(range(br2.shape[1]))
cols[-1] = 'Class'
br2.columns = cols
br2.to_csv('./RP/breast.csv')
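When deciding which dims to sweep, scikit-learn's Johnson-Lindenstrauss bound gives a theoretical reference point for how many components keep pairwise-distance distortion within a chosen eps; a short aside not used in the examples above, with an illustrative sample count:

from sklearn.random_projection import johnson_lindenstrauss_min_dim

# Minimum n_components that bounds pairwise-distance distortion by eps
# for a dataset of the given size (1000 samples is just an example).
print(johnson_lindenstrauss_min_dim(n_samples=1000, eps=0.5))
print(johnson_lindenstrauss_min_dim(n_samples=1000, eps=0.1))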
Example #13
abaloneX = abalone.drop('Class', 1).copy().values
abaloneY = abalone['Class'].copy().values

abaloneX = StandardScaler().fit_transform(abaloneX)
digitsX = StandardScaler().fit_transform(digitsX)

clusters = [2, 5, 10, 15, 20, 25, 30, 35, 40]
dims = [2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60]
abalone_dims = range(1, 9)
#raise
#%% data for 1

tmp = defaultdict(dict)
for i, dim in product(range(10), dims):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(abaloneX), abaloneX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'abalone scree1.csv')

tmp = defaultdict(dict)
for i, dim in product(range(10), dims):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(digitsX), digitsX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'digits scree1.csv')

tmp = defaultdict(dict)
for i, dim in product(range(10), dims):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    rp.fit(abaloneX)
    tmp[dim][i] = reconstructionError(rp, abaloneX)
Example #14
blocks_Y = blocks_balanced['Class'].copy().values
blocks_X = StandardScaler().fit_transform(blocks_X)
print(blocks_X.shape)

# Run RP for Loans 10 times
#clusters =  [2,5,10,15,20,25,30,35,40]
dims = [2, 4, 6, 9, 12, 15, 18, 21, 26]

tmp_distcorr = defaultdict(dict)
tmp_recnstErr = defaultdict(dict)

for i, dim in product(range(10), dims):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    rp.fit(loans_X)
    tmp_recnstErr[dim][i] = reconstructionError(rp, loans_X)
    tmp_distcorr[dim][i] = pairwiseDistCorr(rp.transform(loans_X), loans_X)
tmp_distcorr = pd.DataFrame(tmp_distcorr).T
tmp_distcorr['mean'] = np.mean(tmp_distcorr.iloc[:, 0:10], axis=1)
tmp_distcorr['std'] = np.std(tmp_distcorr.iloc[:, 0:10], axis=1)

tmp_recnstErr = pd.DataFrame(tmp_recnstErr).T
tmp_recnstErr['mean'] = np.mean(tmp_recnstErr.iloc[:, 0:10], axis=1)
tmp_recnstErr['std'] = np.std(tmp_recnstErr.iloc[:, 0:10], axis=1)

tmp_distcorr.to_csv(out + 'loans_RP_distCorr.csv')
tmp_recnstErr.to_csv(out + 'loans_RP_reconstrErr.csv')


#%%
# Run RP for Pageblock 10 times
dims = [2, 3, 4, 5, 6, 7, 8, 9, 10]