示例#1
0
tmp.to_csv('./P5_Neural_Networks_Reduced_With_Clusters/letter_original.csv')

algo_name.append('original')

# Run NN on dimensionally reduced and original datasets with addition cluster dimension
for i in range(len(algo_name)):
    #for i in range(4,5):
    # load datasets
    letter = pd.read_hdf('datasets.hdf', 'letter_' + algo_name[i])
    letterX = letter.drop('Y', 1).copy().values
    letterY = letter['Y'].copy().values
    le = preprocessing.LabelEncoder()
    letterY = le.fit_transform(letterY)

    km = kmeans(random_state=5)
    gmm = myGMM(random_state=5)

    grid = {
        'addClustKM__n_clusters': clusters,
        'NN__learning_rate_init': nn_lr,
        'NN__hidden_layer_sizes': nn_arch
    }
    mlp = MLPClassifier(max_iter=2000, early_stopping=True, random_state=5)
    pipe = Pipeline([('addClustKM', appendClusterDimKM(cluster_algo=km)),
                     ('NN', mlp)])
    gs = GridSearchCV(pipe, grid, verbose=10, cv=5)

    gs.fit(letterX, letterY)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv('./P5_Neural_Networks_Reduced_With_Clusters/letter_km_' +
               algo_name[i] + '.csv')
示例#2
0
def main_logic():
    out = './BASE/'
    # change the below value based on the readme.txt file instructions
    base = './BASE/'
    np.random.seed(0)

    madelon = pd.read_hdf(base + 'datasets.hdf', 'madelon')
    madelon_X = madelon.drop('Class', 1).copy().values
    madelon_Y = madelon['Class'].copy().values

    character = pd.read_hdf(base + 'datasets.hdf', 'character')
    character_X = character.drop('Class', 1).copy().values
    character_Y = character['Class'].copy().values

    np.random.seed(0)
    # clusters = [2]
    clusters = [2, 5, 10, 15, 20, 25, 30, 35, 40]
    madelon_X = StandardScaler().fit_transform(madelon_X)
    character_X = StandardScaler().fit_transform(character_X)

    # Data for 1-3
    SSE = defaultdict(dict)
    ll = defaultdict(dict)
    Silhouette_dict = defaultdict(dict)
    acc = defaultdict(lambda: defaultdict(dict))
    adjMI = defaultdict(lambda: defaultdict(dict))
    km = kmeans(random_state=5)
    gmm = GMM(random_state=5)

    for j in clusters:
        st = clock()
        km.set_params(n_clusters=j)
        gmm.set_params(n_components=j)
        km.fit(madelon_X)
        gmm.fit(madelon_X)

        SSE[j]['Madelon'] = km.score(madelon_X)
        ll[j]['Madelon'] = gmm.score(madelon_X)
        test = km.predict(madelon_X)

        acc[j]['Madelon']['Kmeans'] = cluster_acc(madelon_Y,
                                                  km.predict(madelon_X), j)
        acc[j]['Madelon']['GMM'] = cluster_acc(madelon_Y,
                                               gmm.predict(madelon_X))

        adjMI[j]['Madelon']['Kmeans'] = ami(madelon_Y, km.predict(madelon_X))
        adjMI[j]['Madelon']['GMM'] = ami(madelon_Y, gmm.predict(madelon_X))
        print("Homogenity Score ,{}, Kmeans,".format(j),
              hs(madelon_Y, km.labels_))
        print("Completeness Score ,{} ,Kmeans,".format(j),
              cs(madelon_Y, km.labels_))

        label = km.labels_
        gmmm = gmm.predict_proba(madelon_X)
        sil_coeff = silhouette_score(madelon_X, label, metric='euclidean')
        Silhouette_dict[j]['Madelon'] = sil_coeff
        print("For n_clusters={}, The Silhouette Coefficient is {}".format(
            j, sil_coeff))

        km.fit(character_X)
        gmm.fit(character_X)
        SSE[j]['character'] = km.score(character_X)
        ll[j]['character'] = gmm.score(character_X)
        best = km.predict(character_X)
        acc[j]['character']['Kmeans'] = cluster_acc(character_Y,
                                                    km.predict(character_X), j)
        acc[j]['character']['GMM'] = cluster_acc(character_Y,
                                                 gmm.predict(character_X))
        adjMI[j]['character']['Kmeans'] = ami(character_Y,
                                              km.predict(character_X))
        adjMI[j]['character']['GMM'] = ami(character_Y,
                                           gmm.predict(character_X))
        label = km.labels_
        sil_coeff = silhouette_score(character_X, label, metric='euclidean')
        Silhouette_dict[j]['character'] = sil_coeff
        print(j, clock() - st)
        print("Homogenity Score ,{}, Kmeans,".format(j),
              hs(character_Y, km.labels_))
        print("Completeness Score ,{} ,Kmeans,".format(j),
              cs(character_Y, km.labels_))
        print("For n_clusters={}, The Silhouette Coefficient is {}".format(
            j, sil_coeff))

    Silhouette_dict = pd.DataFrame(Silhouette_dict).to_csv(out +
                                                           'Silhouette.csv')
    SSE = (-pd.DataFrame(SSE)).T
    SSE.rename(columns=lambda x: x + ' SSE (left)', inplace=True)
    ll = pd.DataFrame(ll).T
    ll.rename(columns=lambda x: x + ' log-likelihood', inplace=True)
    acc = pd.Panel(acc)
    adjMI = pd.Panel(adjMI)

    SSE.to_csv(out + 'SSE.csv')
    ll.to_csv(out + 'logliklihood.csv')
    acc.ix[:, :, 'character'].to_csv(out + 'character_acc.csv')
    acc.ix[:, :, 'Madelon'].to_csv(out + 'Madelon acc.csv')
    adjMI.ix[:, :, 'character'].to_csv(out + 'character_adjMI.csv')
    adjMI.ix[:, :, 'Madelon'].to_csv(out + 'Madelon adjMI.csv')

    # %% NN fit data (2,3)
    grid = {
        'km__n_clusters': clusters,
        'NN__alpha': nn_reg,
        'NN__hidden_layer_sizes': nn_arch
    }
    madelon = pd.read_hdf(base + 'datasets.hdf', 'madelon')
    madelon_X = madelon.drop('Class', 1).copy().values
    madelon_Y = madelon['Class'].copy().values
    X_train, X_test, y_train, y_test = train_test_split(madelon_X,
                                                        madelon_Y,
                                                        test_size=0.3,
                                                        random_state=42)

    np.random.seed(0)

    for k in clusters:
        mlp = MLPClassifier(activation='relu',
                            max_iter=2000,
                            early_stopping=True,
                            random_state=5,
                            alpha=10**-5,
                            hidden_layer_sizes=(62, 62),
                            verbose=0)
        km = kmeans(random_state=5, n_clusters=k)
        pipe = Pipeline([('km', km), ('NN', mlp)])
        # gs = GridSearchCV(pipe, grid, verbose=10)
        tick = time.clock()
        pipe.fit(X_train, y_train)
        tock = time.clock() - tick

        print("Traning time , {}, k means dataset".format(k), ',', tock)
        tick = time.clock()
        y_pred = pipe.predict(X_test)
        tock = time.clock() - tick
        print("Testing time , {}, k means component".format(k), ',', tock)
        print("Accuracy Score ,  {}, kmeans Madelon".format(k), ',',
              accuracy_score(y_test, y_pred))

        grid = {'gmm__n_components': clusters}
        mlp = MLPClassifier(activation='relu',
                            max_iter=2000,
                            early_stopping=True,
                            random_state=5,
                            verbose=0,
                            alpha=10**-5,
                            hidden_layer_sizes=(62, 62))
        gmm = myGMM(random_state=43, n_components=k)
        pipe = Pipeline([('gmm', gmm), ('NN', mlp)])
        # gs = GridSearchCV(pipe, grid, verbose=10, cv=5)
        tick = time.clock()
        pipe.fit(X_train, y_train)
        tock = time.clock() - tick
        print("Traning time , {}, gmm dataset".format(k), ',', tock)
        tick = time.clock()
        y_pred = pipe.predict(X_test)
        tock = time.clock() - tick
        print("Testing time , {}, gmm means component".format(k), ',', tock)
        print("Accuracy Score , {}, gmm means Madelon".format(k), ',',
              accuracy_score(y_test, y_pred))

    grid = {
        'km__n_clusters': clusters,
        'NN__alpha': nn_reg,
        'NN__hidden_layer_sizes': nn_arch
    }
    mlp = MLPClassifier(activation='relu',
                        max_iter=2000,
                        early_stopping=True,
                        random_state=5)
    km = kmeans(random_state=5)
    pipe = Pipeline([('km', km), ('NN', mlp)])
    gs = GridSearchCV(pipe, grid, verbose=10)

    gs.fit(madelon_X, madelon_Y)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + 'Madelon cluster Kmeans.csv')

    grid = {
        'gmm__n_components': clusters,
        'NN__alpha': nn_reg,
        'NN__hidden_layer_sizes': nn_arch
    }
    mlp = MLPClassifier(activation='relu',
                        max_iter=2000,
                        early_stopping=True,
                        random_state=5)
    gmm = myGMM(random_state=5)
    pipe = Pipeline([('gmm', gmm), ('NN', mlp)])
    gs = GridSearchCV(pipe, grid, verbose=10, cv=5)

    gs.fit(madelon_X, madelon_Y)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + 'Madelon cluster GMM.csv')

    grid = {
        'km__n_clusters': clusters,
        'NN__alpha': nn_reg,
        'NN__hidden_layer_sizes': nn_arch
    }
    mlp = MLPClassifier(activation='relu',
                        max_iter=2000,
                        early_stopping=True,
                        random_state=5)
    km = kmeans(random_state=5)
    pipe = Pipeline([('km', km), ('NN', mlp)])
    gs = GridSearchCV(pipe, grid, verbose=10, cv=5)

    gs.fit(character_X, character_Y)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + 'character_cluster_Kmeans.csv')

    grid = {
        'gmm__n_components': clusters,
        'NN__alpha': nn_reg,
        'NN__hidden_layer_sizes': nn_arch
    }
    mlp = MLPClassifier(activation='relu',
                        max_iter=2000,
                        early_stopping=True,
                        random_state=5)
    gmm = myGMM(random_state=5)
    pipe = Pipeline([('gmm', gmm), ('NN', mlp)])
    gs = GridSearchCV(pipe, grid, verbose=10, cv=5)

    gs.fit(character_X, character_Y)
    tmp = pd.DataFrame(gs.cv_results_)
    tmp.to_csv(out + 'character_cluster_GMM.csv')

    # %% For chart 4/5
    madelonX2D = TSNE(verbose=10, random_state=5).fit_transform(madelon_X)
    character_X2D = TSNE(verbose=10, random_state=5).fit_transform(character_X)

    madelon2D = pd.DataFrame(np.hstack(
        (madelonX2D, np.atleast_2d(madelon_Y).T)),
                             columns=['x', 'y', 'target'])
    character2D = pd.DataFrame(np.hstack(
        (character_X2D, np.atleast_2d(character_Y).T)),
                               columns=['x', 'y', 'target'])

    madelon2D.to_csv(out + 'madelon2D.csv')
    character2D.to_csv(out + 'character2D.csv')