Exemplo n.º 1
0
def _part2_sweep(data, dims, score):
    """Score a SparseRandomProjection for seeds 0-9 and every size in *dims*.

    *score* is called as ``score(projection, data)`` with a freshly built,
    not yet fitted projection and returns the value to record.

    Returns a DataFrame with one row per projection size and one column per
    seed.
    """
    results = defaultdict(dict)
    for seed, dim in product(range(10), dims):
        projection = SparseRandomProjection(random_state=seed,
                                            n_components=dim)
        results[dim][seed] = score(projection, data)
    return pd.DataFrame(results).T


def _part2_dist_corr(projection, data):
    """Return pairwiseDistCorr between the projected data and *data*."""
    return pairwiseDistCorr(projection.fit_transform(data), data)


def _part2_recon_error(projection, data):
    """Fit *projection* on *data* and return its reconstructionError."""
    projection.fit(data)
    return reconstructionError(projection, data)


def part2():
    """Write the random-projection sweeps for the cancer and housing data.

    For each dataset two tables are produced (rows: projection size,
    columns: seed 0-9):

    * the ``pairwiseDistCorr`` score, written to ``<name> part2 distcorr.csv``
    * the ``reconstructionError`` score, written to ``<name> part2.csv``

    Previously both tables of a dataset were written to ``<name> part2.csv``,
    so the first one was overwritten by the second. The reconstruction-error
    table keeps the original file name, so ``<name> part2.csv`` holds exactly
    what it held before; the pairwise table now survives in its own file.

    Uses the module-level names ``cancer_x``, ``housing_x``, ``dims_big`` and
    ``out``, which are not defined in this function.
    """
    cancer_dims = range(1, 31)

    _part2_sweep(cancer_x, cancer_dims, _part2_dist_corr).to_csv(
        out + 'cancer part2 distcorr.csv')
    _part2_sweep(housing_x, dims_big, _part2_dist_corr).to_csv(
        out + 'housing part2 distcorr.csv')

    _part2_sweep(cancer_x, cancer_dims, _part2_recon_error).to_csv(
        out + 'cancer part2.csv')
    _part2_sweep(housing_x, dims_big, _part2_recon_error).to_csv(
        out + 'housing part2.csv')
Exemplo n.º 2
0
def rp(X, problem):
    """Sweep SparseRandomProjection sizes on *X* and write the scores to CSV.

    Ten repetitions are run for every projection size. The sizes are
    ``range(2, len(X[0]))`` when ``'Blood'`` occurs in *problem*, otherwise a
    fixed list from 2 to 60. Each (repetition, size) pair is printed and its
    ``reconstructionError`` value stored; the resulting table (rows: size,
    columns: repetition) is written to ``out + problem + '_RP.csv'``.

    No ``random_state`` is passed, so the repetitions are not seeded.
    """
    if 'Blood' in problem:
        sizes = range(2, len(X[0]))
    else:
        sizes = [2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60]

    scores = defaultdict(dict)
    for run, size in product(range(10), sizes):
        projection = SparseRandomProjection(n_components=size)
        print(run, size)
        # NOTE(review): the projection is handed over without fit() being
        # called here; presumably reconstructionError fits it - confirm.
        scores[size][run] = reconstructionError(projection, X)

    table = pd.DataFrame(scores).T
    table.to_csv(out + problem + '_RP.csv')
Exemplo n.º 3
0
# Letter data: pairwiseDistCorr of every projection size (single seed, 0).
# Resulting table: one row per size, one column per seed.
tmp = defaultdict(dict)
for i, dim in product(range(1), dims_letter):
    print(dim)
    rp = SparseRandomProjection(n_components=dim, random_state=i)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(letterX), letterX)
tmp = pd.DataFrame(tmp).transpose()
tmp.to_csv(
    './P2_Dimensionality_Reduction/letter_RP_pairwise_distance_corr.csv')

# Spam data: reconstructionError of every fitted projection size.
print('Part 2C - Starting RP, reconstruction error, for spam dataset...')
tmp = defaultdict(dict)
for i, dim in product(range(1), dims_spam):
    print(dim)
    rp = SparseRandomProjection(n_components=dim, random_state=i)
    rp.fit(spamX)
    tmp[dim][i] = reconstructionError(rp, spamX)
tmp = pd.DataFrame(tmp).transpose()
tmp.to_csv('./P2_Dimensionality_Reduction/spam_RP_reconstruction_error.csv')

# Letter data: reconstructionError of every fitted projection size.
print('Part 2C - Starting RP, reconstruction error, for letter dataset...')
tmp = defaultdict(dict)
for i, dim in product(range(1), dims_letter):
    print(dim)
    rp = SparseRandomProjection(n_components=dim, random_state=i)
    rp.fit(letterX)
    tmp[dim][i] = reconstructionError(rp, letterX)
tmp = pd.DataFrame(tmp).transpose()
tmp.to_csv('./P2_Dimensionality_Reduction/letter_RP_reconstruction_error.csv')

# Run Neural Networks
# Fresh projection with a fixed seed; n_components is left at its default.
rp = SparseRandomProjection(random_state=5)
Exemplo n.º 4
0
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(madelonX), madelonX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'madelon scree1.csv')

# Every table below has one row per projection size and one column per seed
# (0-9).

# Digits: pairwiseDistCorr between the projected and the original data.
tmp = defaultdict(dict)
for i, dim in product(range(10), dims):
    rp = SparseRandomProjection(n_components=dim, random_state=i)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(digitsX), digitsX)
tmp = pd.DataFrame(tmp).transpose()
tmp.to_csv(out + 'digits scree1.csv')

# Madelon: reconstructionError of the fitted projection.
tmp = defaultdict(dict)
for i, dim in product(range(10), dims):
    rp = SparseRandomProjection(n_components=dim, random_state=i)
    rp.fit(madelonX)
    tmp[dim][i] = reconstructionError(rp, madelonX)
tmp = pd.DataFrame(tmp).transpose()
tmp.to_csv(out + 'madelon scree2.csv')

# Digits: reconstructionError of the fitted projection.
tmp = defaultdict(dict)
for i, dim in product(range(10), dims):
    rp = SparseRandomProjection(n_components=dim, random_state=i)
    rp.fit(digitsX)
    tmp[dim][i] = reconstructionError(rp, digitsX)
tmp = pd.DataFrame(tmp).transpose()
tmp.to_csv(out + 'digits scree2.csv')

#%% Data for 2

grid = {
    'rp__n_components': dims,
Exemplo n.º 5
0
def main():
    """Run the sparse random projection experiments for madelon and letter.

    Both datasets are read from ``./BASE/datasets.hdf`` and their features are
    rescaled with ``StandardScaler``. Then, in order:

    1. For seeds 0-9 and every projection size (``dims`` for madelon,
       ``dims2`` for letter) the ``pairwiseDistCorr`` score is written to
       ``<name> scree1.csv`` and the ``reconstructionError`` score to
       ``<name> scree2.csv`` (rows: size, columns: seed).
    2. A projection + ``MLPClassifier`` pipeline is grid-searched (5-fold CV)
       per dataset and ``cv_results_`` is written to ``<name> dim red.csv``.
    3. Each dataset is projected to a fixed size (60 / 16) and stored, with
       its ``Class`` column appended, under its key in
       ``out + 'datasets.hdf'``.

    ``nn_reg`` and ``nn_arch`` are not defined in this function and must be
    available at module level.

    NOTE(review): ``out`` is ``'./BASE/'``, so step 3 writes to the same
    ``./BASE/datasets.hdf`` file and keys that were read at the start -
    confirm that replacing the source data is intended.
    """
    out = './BASE/'

    np.random.seed(0)
    # ``axis`` is passed by keyword: pandas 2.0 no longer accepts it
    # positionally in DataFrame.drop.
    letter = pd.read_hdf('./BASE/datasets.hdf', 'letter')
    letterX = letter.drop('Class', axis=1).copy().values
    letterY = letter['Class'].copy().values

    madelon = pd.read_hdf('./BASE/datasets.hdf', 'madelon')
    madelonX = madelon.drop('Class', axis=1).copy().values
    madelonY = madelon['Class'].copy().values

    madelonX = StandardScaler().fit_transform(madelonX)
    letterX = StandardScaler().fit_transform(letterX)

    dims = [2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60]
    dims2 = [2, 4, 6, 8, 10, 12, 14, 16]
    #%% data for 1

    # pairwiseDistCorr between projected and original data.
    tmp = defaultdict(dict)
    for i, dim in product(range(10), dims):
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(madelonX), madelonX)
    tmp = pd.DataFrame(tmp).T
    tmp.to_csv(out + 'madelon scree1.csv')

    tmp = defaultdict(dict)
    for i, dim in product(range(10), dims2):
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(letterX), letterX)
    tmp = pd.DataFrame(tmp).T
    tmp.to_csv(out + 'letter scree1.csv')

    # reconstructionError of the fitted projection.
    tmp = defaultdict(dict)
    for i, dim in product(range(10), dims):
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        rp.fit(madelonX)
        tmp[dim][i] = reconstructionError(rp, madelonX)
    tmp = pd.DataFrame(tmp).T
    tmp.to_csv(out + 'madelon scree2.csv')

    tmp = defaultdict(dict)
    for i, dim in product(range(10), dims2):
        rp = SparseRandomProjection(random_state=i, n_components=dim)
        rp.fit(letterX)
        tmp[dim][i] = reconstructionError(rp, letterX)
    tmp = pd.DataFrame(tmp).T
    tmp.to_csv(out + 'letter scree2.csv')

    #%% Data for 2

    # Grid keys follow the Pipeline step names ('rp', 'NN') defined below.
    grid = {
        'rp__n_components': dims,
        'NN__alpha': nn_reg,
        'NN__hidden_layer_sizes': nn_arch
    }
    rp = SparseRandomProjection(random_state=5)
    mlp = MLPClassifier(activation='relu',
                        max_iter=2000,
                        early_stopping=True,
                        random_state=5)
    pipe = Pipeline([('rp', rp), ('NN', mlp)])
    gs = GridSearchCV(pipe, grid, verbose=10, cv=5)

    gs.fit(madelonX, madelonY)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + 'Madelon dim red.csv')

    grid = {
        'rp__n_components': dims2,
        'NN__alpha': nn_reg,
        'NN__hidden_layer_sizes': nn_arch
    }
    rp = SparseRandomProjection(random_state=5)
    mlp = MLPClassifier(activation='relu',
                        max_iter=2000,
                        early_stopping=True,
                        random_state=5)
    pipe = Pipeline([('rp', rp), ('NN', mlp)])
    gs = GridSearchCV(pipe, grid, verbose=10, cv=5)

    gs.fit(letterX, letterY)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + 'letter dim red.csv')
    #%% data for 3
    # Set this from chart 2 and dump, use clustering script to finish up
    dim = 60
    rp = SparseRandomProjection(n_components=dim, random_state=5)

    madelonX2 = rp.fit_transform(madelonX)
    # Append the labels as the last column and name that column 'Class'.
    madelon2 = pd.DataFrame(np.hstack((madelonX2, np.atleast_2d(madelonY).T)))
    cols = list(range(madelon2.shape[1]))
    cols[-1] = 'Class'
    madelon2.columns = cols
    # ``key`` is passed by keyword: newer pandas deprecates passing it
    # positionally to to_hdf.
    madelon2.to_hdf(out + 'datasets.hdf',
                    key='madelon',
                    complib='blosc',
                    complevel=9)

    dim = 16
    rp = SparseRandomProjection(n_components=dim, random_state=5)
    letterX2 = rp.fit_transform(letterX)
    letter2 = pd.DataFrame(np.hstack((letterX2, np.atleast_2d(letterY).T)))
    cols = list(range(letter2.shape[1]))
    cols[-1] = 'Class'
    letter2.columns = cols
    letter2.to_hdf(out + 'datasets.hdf',
                   key='letter',
                   complib='blosc',
                   complevel=9)
Exemplo n.º 6
0
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(wineX), wineX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'wine scree1.csv')

# Every table below has one row per projection size and one column per seed
# (0-9).

# Cancer: pairwiseDistCorr between the projected and the original data.
tmp = defaultdict(dict)
for i, dim in product(range(10), dims_cancer):
    rp = SparseRandomProjection(n_components=dim, random_state=i)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(cancerX), cancerX)
tmp = pd.DataFrame(tmp).transpose()
tmp.to_csv(out + 'cancer scree1.csv')

# Wine: reconstructionError of the fitted projection.
tmp = defaultdict(dict)
for i, dim in product(range(10), dims_wine):
    rp = SparseRandomProjection(n_components=dim, random_state=i)
    rp.fit(wineX)
    tmp[dim][i] = reconstructionError(rp, wineX)
tmp = pd.DataFrame(tmp).transpose()
tmp.to_csv(out + 'wine scree2.csv')

# Cancer: reconstructionError of the fitted projection.
tmp = defaultdict(dict)
for i, dim in product(range(10), dims_cancer):
    rp = SparseRandomProjection(n_components=dim, random_state=i)
    rp.fit(cancerX)
    tmp[dim][i] = reconstructionError(rp, cancerX)
tmp = pd.DataFrame(tmp).transpose()
tmp.to_csv(out + 'cancer scree2.csv')

#%% Data for 2

grid = {
    'rp__n_components': dims_wine,
Exemplo n.º 7
0
                                           diamondsX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'diamonds scree1.csv')

# Every table below has one row per projection size and one column per seed
# (0-9).

# Digits: pairwiseDistCorr between the projected and the original data.
tmp = defaultdict(dict)
for i, dim in product(range(10), dims2):
    rp = SparseRandomProjection(n_components=dim, random_state=i)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(digitsX), digitsX)
tmp = pd.DataFrame(tmp).transpose()
tmp.to_csv(out + 'digits scree1.csv')

# Diamonds: reconstructionError of the fitted projection.
tmp = defaultdict(dict)
for i, dim in product(range(10), dims1):
    rp = SparseRandomProjection(n_components=dim, random_state=i)
    rp.fit(diamondsX)
    tmp[dim][i] = reconstructionError(rp, diamondsX)
tmp = pd.DataFrame(tmp).transpose()
tmp.to_csv(out + 'diamonds scree2.csv')

# Digits: reconstructionError of the fitted projection.
tmp = defaultdict(dict)
for i, dim in product(range(10), dims2):
    rp = SparseRandomProjection(n_components=dim, random_state=i)
    rp.fit(digitsX)
    tmp[dim][i] = reconstructionError(rp, digitsX)
tmp = pd.DataFrame(tmp).transpose()
tmp.to_csv(out + 'digits scree2.csv')

#%% task 4

grid = {
    'rp__n_components': dims1,
Exemplo n.º 8
0
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(wineX), wineX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'wine scree1.csv')

# Every table below has one row per projection size and one column per seed
# (0-9).

# Digit: pairwiseDistCorr between the projected and the original data.
tmp = defaultdict(dict)
for i, dim in product(range(10), dims):
    rp = SparseRandomProjection(n_components=dim, random_state=i)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(digitX), digitX)
tmp = pd.DataFrame(tmp).transpose()
tmp.to_csv(out + 'digit scree1.csv')

# Wine: reconstructionError of the fitted projection.
tmp = defaultdict(dict)
for i, dim in product(range(10), dims_wine):
    rp = SparseRandomProjection(n_components=dim, random_state=i)
    rp.fit(wineX)
    tmp[dim][i] = reconstructionError(rp, wineX)
tmp = pd.DataFrame(tmp).transpose()
tmp.to_csv(out + 'wine scree2.csv')

# Digit: reconstructionError of the fitted projection.
tmp = defaultdict(dict)
for i, dim in product(range(10), dims):
    rp = SparseRandomProjection(n_components=dim, random_state=i)
    rp.fit(digitX)
    tmp[dim][i] = reconstructionError(rp, digitX)
tmp = pd.DataFrame(tmp).transpose()
tmp.to_csv(out + 'digit scree2.csv')

# Data for 2

grid = {
    'rp__n_components': dims_wine,
Exemplo n.º 9
0
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(wineX), wineX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'RP/wine_scree1.csv')

# Every table below has one row per projection size and one column per seed
# (0-9).

# Credit: pairwiseDistCorr between the projected and the original data.
tmp = defaultdict(dict)
for i, dim in product(range(10), credit_dims):
    rp = SparseRandomProjection(n_components=dim, random_state=i)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(creditX), creditX)
tmp = pd.DataFrame(tmp).transpose()
tmp.to_csv(out + 'RP/credit_scree1.csv')

# Wine: reconstructionError of the fitted projection.
tmp = defaultdict(dict)
for i, dim in product(range(10), wine_dims):
    rp = SparseRandomProjection(n_components=dim, random_state=i)
    rp.fit(wineX)
    tmp[dim][i] = reconstructionError(rp, wineX)
tmp = pd.DataFrame(tmp).transpose()
tmp.to_csv(out + 'RP/wine_scree2.csv')

# Credit: reconstructionError of the fitted projection.
tmp = defaultdict(dict)
for i, dim in product(range(10), credit_dims):
    rp = SparseRandomProjection(n_components=dim, random_state=i)
    rp.fit(creditX)
    tmp[dim][i] = reconstructionError(rp, creditX)
tmp = pd.DataFrame(tmp).transpose()
tmp.to_csv(out + 'RP/credit_scree2.csv')

# RF ====================================

rfc = RandomForestClassifier(n_estimators=100,
                             class_weight='balanced',
Exemplo n.º 10
0
tmp.to_csv(out + 'perm scree1.csv')

# Every table below has one row per projection size and one column per seed
# (0-9). The printed numbers mark which sweep is starting.

# Housing: pairwiseDistCorr between the projected and the original data.
print(2)
tmp = defaultdict(dict)
for i, dim in product(range(10), dims_big):
    rp = SparseRandomProjection(n_components=dim, random_state=i)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(housing_x), housing_x)
tmp = pd.DataFrame(tmp).transpose()
tmp.to_csv(out + 'housing scree1.csv')

# Perm: reconstructionError of the fitted projection.
print(3)
tmp = defaultdict(dict)
for i, dim in product(range(10), dims):
    rp = SparseRandomProjection(n_components=dim, random_state=i)
    rp.fit(perm_x)
    tmp[dim][i] = reconstructionError(rp, perm_x)
tmp = pd.DataFrame(tmp).transpose()
tmp.to_csv(out + 'perm scree2.csv')

# Housing: reconstructionError of the fitted projection.
print(4)
tmp = defaultdict(dict)
for i, dim in product(range(10), dims_big):
    rp = SparseRandomProjection(n_components=dim, random_state=i)
    rp.fit(housing_x)
    tmp[dim][i] = reconstructionError(rp, housing_x)
tmp = pd.DataFrame(tmp).transpose()
tmp.to_csv(out + 'housing scree2.csv')

#4
grid = {
    'rp__n_components': dims,
Exemplo n.º 11
0
tmp.to_csv(out+'biodeg scree1.csv')


# Every table below has one row per projection size and one column per seed
# (0-9).

# Digits: pairwiseDistCorr between the projected and the original data.
tmp = defaultdict(dict)
for i, dim in product(range(10), dims):
    rp = SparseRandomProjection(n_components=dim, random_state=i)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(digitsX), digitsX)
tmp = pd.DataFrame(tmp).transpose()
tmp.to_csv(out + 'digits scree1.csv')

# Biodeg: reconstructionError of the fitted projection.
tmp = defaultdict(dict)
for i, dim in product(range(10), dimsb):
    rp = SparseRandomProjection(n_components=dim, random_state=i)
    rp.fit(biodegX)
    tmp[dim][i] = reconstructionError(rp, biodegX)
tmp = pd.DataFrame(tmp).transpose()
tmp.to_csv(out + 'biodeg scree2.csv')

# Digits: reconstructionError of the fitted projection.
tmp = defaultdict(dict)
for i, dim in product(range(10), dims):
    rp = SparseRandomProjection(n_components=dim, random_state=i)
    rp.fit(digitsX)
    tmp[dim][i] = reconstructionError(rp, digitsX)
tmp = pd.DataFrame(tmp).transpose()
tmp.to_csv(out + 'digits scree2.csv')

#%% Data for 2

# Parameter grid; the 'rp__' / 'NN__' prefixes presumably address pipeline
# steps defined further down the script - confirm.
grid = {
    'rp__n_components': dimsb,
    'NN__alpha': nn_reg,
    'NN__hidden_layer_sizes': nn_arch,
}
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(contraX), contraX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'contra scree1.csv')

# Every table below has one row per projection size and one column per seed
# (0-9).

# Cancer: pairwiseDistCorr between the projected and the original data.
tmp = defaultdict(dict)
for i, dim in product(range(10), dims_cancer):
    rp = SparseRandomProjection(n_components=dim, random_state=i)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(cancerX), cancerX)
tmp = pd.DataFrame(tmp).transpose()
tmp.to_csv(out + 'cancer scree1.csv')

# Contra: reconstructionError of the fitted projection.
tmp = defaultdict(dict)
for i, dim in product(range(10), dims_contra):
    rp = SparseRandomProjection(n_components=dim, random_state=i)
    rp.fit(contraX)
    tmp[dim][i] = reconstructionError(rp, contraX)
tmp = pd.DataFrame(tmp).transpose()
tmp.to_csv(out + 'contra scree2.csv')

# Cancer: reconstructionError of the fitted projection.
tmp = defaultdict(dict)
for i, dim in product(range(10), dims_cancer):
    rp = SparseRandomProjection(n_components=dim, random_state=i)
    rp.fit(cancerX)
    tmp[dim][i] = reconstructionError(rp, cancerX)
tmp = pd.DataFrame(tmp).transpose()
tmp.to_csv(out + 'cancer scree2.csv')

#%% Data for 2

grid = {
    'rp__n_components': dims_contra,
Exemplo n.º 13
0
tmp.to_csv(out1+'faults scree1.csv')


# Every table below has one row per projection size and one column per seed
# (0-9).

# BC: pairwiseDistCorr between the projected and the original data.
tmp = defaultdict(dict)
for i, dim in product(range(10), dims):
    rp = SparseRandomProjection(n_components=dim, random_state=i)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(bcX), bcX)
tmp = pd.DataFrame(tmp).transpose()
tmp.to_csv(out1 + 'bc scree1.csv')

# Faults: reconstructionError of the fitted projection.
tmp = defaultdict(dict)
for i, dim in product(range(10), dims):
    rp = SparseRandomProjection(n_components=dim, random_state=i)
    rp.fit(faultsX)
    tmp[dim][i] = reconstructionError(rp, faultsX)
tmp = pd.DataFrame(tmp).transpose()
tmp.to_csv(out1 + 'faults scree2.csv')

# BC: reconstructionError of the fitted projection.
tmp = defaultdict(dict)
for i, dim in product(range(10), dims):
    rp = SparseRandomProjection(n_components=dim, random_state=i)
    rp.fit(bcX)
    tmp[dim][i] = reconstructionError(rp, bcX)
tmp = pd.DataFrame(tmp).transpose()
tmp.to_csv(out1 + 'bc scree2.csv')

#%% Data for 2
# Parameter grid; the 'rp__' / 'NN__' prefixes presumably address pipeline
# steps defined further down the script - confirm.
grid = {
    'rp__n_components': dims,
    'NN__alpha': nn_reg,
    'NN__hidden_layer_sizes': nn_arch,
}
Exemplo n.º 14
0
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(abaloneX), abaloneX)
tmp = pd.DataFrame(tmp).T
tmp.to_csv(out + 'abalone scree1.csv')

# Every table below has one row per projection size and one column per seed
# (0-9).

# Digits: pairwiseDistCorr between the projected and the original data.
tmp = defaultdict(dict)
for i, dim in product(range(10), dims):
    rp = SparseRandomProjection(n_components=dim, random_state=i)
    tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(digitsX), digitsX)
tmp = pd.DataFrame(tmp).transpose()
tmp.to_csv(out + 'digits scree1.csv')

# Abalone: reconstructionError of the fitted projection.
tmp = defaultdict(dict)
for i, dim in product(range(10), dims):
    rp = SparseRandomProjection(n_components=dim, random_state=i)
    rp.fit(abaloneX)
    tmp[dim][i] = reconstructionError(rp, abaloneX)
tmp = pd.DataFrame(tmp).transpose()
tmp.to_csv(out + 'abalone scree2.csv')

# Digits: reconstructionError of the fitted projection.
tmp = defaultdict(dict)
for i, dim in product(range(10), dims):
    rp = SparseRandomProjection(n_components=dim, random_state=i)
    rp.fit(digitsX)
    tmp[dim][i] = reconstructionError(rp, digitsX)
tmp = pd.DataFrame(tmp).transpose()
tmp.to_csv(out + 'digits scree2.csv')

#%% Data for 2

grid = {
    'rp__n_components': abalone_dims,
Exemplo n.º 15
0
# Split the balanced blocks data into features / labels and rescale the
# features. ``axis`` is passed by keyword because pandas 2.0 no longer accepts
# it positionally in DataFrame.drop.
blocks_X = blocks_balanced.drop('Class', axis=1).copy().values
blocks_Y = blocks_balanced['Class'].copy().values
blocks_X = StandardScaler().fit_transform(blocks_X)
# Single-argument call form: prints the same text on Python 2 and is valid
# syntax on Python 3.
print(blocks_X.shape)

# Run RP for Loans 10 times (seeds 0-9) at every projection size in dims.
dims = [2, 4, 6, 9, 12, 15, 18, 21, 26]

tmp_distcorr = defaultdict(dict)
tmp_recnstErr = defaultdict(dict)

for i, dim in product(range(10), dims):
    rp = SparseRandomProjection(random_state=i, n_components=dim)
    rp.fit(loans_X)
    # Both scores are taken from the same fitted projection.
    tmp_recnstErr[dim][i] = reconstructionError(rp, loans_X)
    tmp_distcorr[dim][i] = pairwiseDistCorr(rp.transform(loans_X), loans_X)

# Rows: projection size; columns 0-9: seed. The summary columns are computed
# over the first ten columns only, so 'std' is not affected by 'mean'.
tmp_distcorr = pd.DataFrame(tmp_distcorr).T
tmp_distcorr['mean'] = np.mean(tmp_distcorr.iloc[:, 0:10], axis=1)
tmp_distcorr['std'] = np.std(tmp_distcorr.iloc[:, 0:10], axis=1)

tmp_recnstErr = pd.DataFrame(tmp_recnstErr).T
tmp_recnstErr['mean'] = np.mean(tmp_recnstErr.iloc[:, 0:10], axis=1)
tmp_recnstErr['std'] = np.std(tmp_recnstErr.iloc[:, 0:10], axis=1)

tmp_distcorr.to_csv(out + 'loans_RP_distCorr.csv')
tmp_recnstErr.to_csv(out + 'loans_RP_reconstrErr.csv')


#%%
# Run RP for Pageblock 10 times