Exemplo n.º 1
0
def part5_nn(clustering):
    """Run the neural network on every reduced, clustered credit-card file.

    For each of the algorithms PCA, ICA, RP and RFE the file
    ``<alg>/creditcard_<clustering>_part5.csv`` is loaded and handed to
    ``nn`` (``act='relu'``, ``epoch=100``); ``nn`` also receives the output
    file name ``creditcard_<clustering>_part5_<alg>_cm.png`` and the
    algorithm name as title.

    Args:
        clustering: Text inserted into the input and output file names
            (presumably 'Kmeans' or 'EM' -- verify against callers).

    Returns:
        tuple: ``(df, history_dict)``. ``df`` holds one row per algorithm
        with the columns ``alg``, ``loss``, ``acc``, ``auc`` and
        ``duration`` (the first three metric entries returned by ``nn``
        are stored as loss/acc/auc), rounded to 3 decimals.
        ``history_dict`` maps each algorithm name to the history object
        returned by ``nn``.
    """
    results = pd.DataFrame()
    histories = {}

    for row, alg in enumerate(['PCA', 'ICA', 'RP', 'RFE']):
        confusion_png = 'creditcard_{}_part5_{}_cm.png'.format(clustering, alg)
        source_csv = alg + os.sep + 'creditcard_{}_part5.csv'.format(clustering)
        features, target = load_data.load_creditcard_data(source_csv)

        # The trained model itself is not needed here, only its results.
        _model, history, metrics, duration = nn(features,
                                                target,
                                                act='relu',
                                                epoch=100,
                                                output_file=confusion_png,
                                                title=alg)

        # Fill the row cell by cell, in a fixed column order.
        cells = (
            ('alg', alg),
            ('loss', metrics[0]),
            ('acc', metrics[1]),
            ('auc', metrics[2]),
            ('duration', duration),
        )
        for column, value in cells:
            results.loc[row, column] = value

        histories[alg] = history

    return results.round(3), histories
Exemplo n.º 2
0
def part4():
    """Run the neural network on each reduced credit-card dataset.

    The algorithms PCA, ICA, RP and RFE are loaded from
    ``<alg>/creditcard.csv``; the extra entry ``'original'`` calls the
    loader without a path. Every dataset is passed to ``nn`` with
    ``act='relu'`` and ``epoch=100``.

    Returns:
        tuple: ``(df, history_dict)``. ``df`` holds one row per entry with
        the columns ``alg``, ``loss``, ``acc``, ``auc`` and ``duration``
        (the first three metric entries returned by ``nn`` are stored as
        loss/acc/auc), rounded to 3 decimals. ``history_dict`` maps each
        entry name to the history object returned by ``nn``.
    """
    results = pd.DataFrame()
    histories = {}

    for row, alg in enumerate(['PCA', 'ICA', 'RP', 'RFE', 'original']):
        # 'original' means: let the loader use its own default file.
        if alg != 'original':
            loader_args = (alg + os.sep + 'creditcard.csv',)
        else:
            loader_args = ()
        features, target = load_data.load_creditcard_data(*loader_args)

        # The trained model itself is not needed here, only its results.
        _model, history, metrics, duration = nn(features,
                                                target,
                                                act='relu',
                                                epoch=100)

        # Fill the row cell by cell, in a fixed column order.
        cells = (
            ('alg', alg),
            ('loss', metrics[0]),
            ('acc', metrics[1]),
            ('auc', metrics[2]),
            ('duration', duration),
        )
        for column, value in cells:
            results.loc[row, column] = value

        histories[alg] = history

    return results.round(3), histories
Exemplo n.º 3
0
    def kmeans(path, n_clusters=10):
        """Cluster a dataset with K-means and save one-hot cluster features.

        The dataset at ``path`` is loaded (credit-card loader when the path
        contains 'creditcard', cancer loader otherwise), clustered, and the
        cluster assignment of every sample is one-hot encoded. The encoded
        columns plus a ``label`` column are written to
        ``<path without .csv>_Kmeans_part5.csv``.

        Args:
            path: Path of the input CSV file; also used to pick the loader
                and to derive the output file name.
            n_clusters: Number of K-means clusters (default 10).

        Returns:
            None. The result is written to disk.
        """
        if 'creditcard' in path:
            X, y = load_data.load_creditcard_data(path)
        else:
            X, y = load_data.load_cancer_data(path)

        # fit_predict() fits the model itself; the separate fit() call the
        # code used to make beforehand only repeated the same work.
        model = KMeans(n_clusters=n_clusters, random_state=SEED)
        cluster_labels = model.fit_predict(X).reshape(-1, 1)

        # One column per cluster; the encoder output is densified with
        # toarray() before it is wrapped in a DataFrame.
        encoded = OneHotEncoder().fit_transform(cluster_labels)
        X_predict = pd.DataFrame(encoded.toarray())

        # NOTE(review): if y is a pandas Series this assignment aligns on
        # the index -- confirm y carries a default 0..n-1 index.
        X_predict['label'] = y

        output_file = path.split('.csv')[0] + '_Kmeans_part5.csv'
        X_predict.to_csv(output_file)
Exemplo n.º 4
0
def rp(dataset, n_components, save_to_file=False, seed=1):
    """Apply a Gaussian random projection and report summary statistics.

    Args:
        dataset: ``'creditcard'`` selects the credit-card loader; any other
            value selects the cancer loader.
        n_components: Number of components of the projection.
        save_to_file: When true, the projected data plus a ``label`` column
            is written to ``OUTPUT_DIR/<dataset>.csv``.
        seed: Random state passed to the projection (default 1).

    Returns:
        tuple: ``(kurt, reconstruction_error)`` where ``kurt`` is the mean
        absolute kurtosis over the projected columns and
        ``reconstruction_error`` is the value returned by
        ``util.reconstructionError`` for the fitted model and ``X``.
    """
    loader = (load_data.load_creditcard_data
              if dataset == 'creditcard' else load_data.load_cancer_data)
    X, y = loader()

    projector = GaussianRandomProjection(n_components=n_components,
                                         random_state=seed)
    projector.fit(X)
    projected = projector.transform(X)

    # Kurtosis per projected column, then the mean of the absolute values.
    per_column_kurtosis = pd.DataFrame(projected).kurt(axis=0)
    mean_abs_kurtosis = per_column_kurtosis.abs().mean()

    reconstruction_error = util.reconstructionError(projector, X)

    if save_to_file:
        target_path = OUTPUT_DIR + os.sep + dataset + '.csv'
        frame = pd.DataFrame(projected)
        frame['label'] = y.values
        frame.to_csv(target_path)

    return mean_abs_kurtosis, reconstruction_error
Exemplo n.º 5
0
def ica(dataset, n_components, save_to_file=False):
    """Apply FastICA and report reconstruction distance and kurtosis.

    Args:
        dataset: ``'creditcard'`` selects the credit-card loader; any other
            value selects the cancer loader.
        n_components: Number of independent components to extract.
        save_to_file: When true, the transformed data plus a ``label``
            column is written to ``OUTPUT_DIR/<dataset>.csv``.

    Returns:
        tuple: ``(dist, kurt)`` where ``dist`` is the norm of
        ``X - inverse_transform(transform(X))`` divided by the number of
        samples, and ``kurt`` is the mean absolute kurtosis over the
        transformed columns.
    """
    loader = (load_data.load_creditcard_data
              if dataset == 'creditcard' else load_data.load_cancer_data)
    X, y = loader()

    transformer = FastICA(n_components=n_components, random_state=SEED).fit(X)
    components = transformer.transform(X)

    # Reconstruction distance, normalised by the sample count.
    restored = transformer.inverse_transform(components)
    sample_count = X.shape[0]
    dist = np.linalg.norm(X - restored) / sample_count

    # Kurtosis per component, then the mean of the absolute values.
    per_column_kurtosis = pd.DataFrame(components).kurt(axis=0)
    mean_abs_kurtosis = per_column_kurtosis.abs().mean()

    if save_to_file:
        target_path = OUTPUT_DIR + os.sep + dataset + '.csv'
        frame = pd.DataFrame(components)
        frame['label'] = y.values
        frame.to_csv(target_path)

    return dist, mean_abs_kurtosis
Exemplo n.º 6
0
    def GMM(path, n_clusters):
        """Fit a Gaussian mixture on a dataset file and report its quality.

        Args:
            path: Path of the input file; the credit-card loader is used
                when it contains 'creditcard', the cancer loader otherwise.
            n_clusters: Number of mixture components.

        Returns:
            dict: ``'silh'`` (silhouette score rounded to 3 decimals),
            ``'cluster_acc'`` (value of ``util.cluster_acc`` for the true
            labels and the predicted components), ``'aic'`` and ``'bic'``
            (the model's AIC and BIC on ``X``).
        """
        if 'creditcard' in path:
            X, y = load_data.load_creditcard_data(path)
        else:
            X, y = load_data.load_cancer_data(path)

        # fit_predict() fits the model itself; the separate fit() call the
        # code used to make beforehand only repeated the same work.
        model = mixture.GaussianMixture(n_components=n_clusters,
                                        random_state=SEED)
        cluster_labels = model.fit_predict(X)

        silhouette_avg = silhouette_score(X, cluster_labels)
        y_pred = model.predict(X)
        cluster_acc = util.cluster_acc(y, y_pred)

        return {
            'silh': round(silhouette_avg, 3),
            'cluster_acc': cluster_acc,
            'aic': model.aic(X),
            'bic': model.bic(X)
        }
Exemplo n.º 7
0
def pca(dataset, n_components, save_to_file=False):
    """Apply PCA and report score, variance ratios and reconstruction distance.

    Args:
        dataset: ``'creditcard'`` selects the credit-card loader; any other
            value selects the cancer loader.
        n_components: Number of principal components to keep.
        save_to_file: When true, the transformed data plus a ``label``
            column is written to ``OUTPUT_DIR/<dataset>.csv``.

    Returns:
        tuple: ``(score, variance, cumsum, dist)`` where ``score`` is
        ``model.score(X)`` (the average log-likelihood of all samples),
        ``variance`` is the explained variance ratio per component,
        ``cumsum`` is its cumulative sum, and ``dist`` is the norm of
        ``X - inverse_transform(transform(X))`` divided by the number of
        samples.
    """
    loader = (load_data.load_creditcard_data
              if dataset == 'creditcard' else load_data.load_cancer_data)
    X, y = loader()

    reducer = PCA(n_components=n_components, random_state=SEED).fit(X)

    score = reducer.score(X)
    variance = reducer.explained_variance_ratio_
    cumsum = np.cumsum(variance)

    # Reconstruction distance, normalised by the sample count.
    reduced = reducer.transform(X)
    restored = reducer.inverse_transform(reduced)
    sample_count = X.shape[0]
    dist = np.linalg.norm(X - restored) / sample_count

    if save_to_file:
        target_path = OUTPUT_DIR + os.sep + dataset + '.csv'
        frame = pd.DataFrame(reduced)
        frame['label'] = y.values
        frame.to_csv(target_path)

    return score, variance, cumsum, dist
Exemplo n.º 8
0
    def kmeans(path, n_clusters):
        """Run K-means on a dataset file and report clustering quality.

        Args:
            path: Path of the input file; the credit-card loader is used
                when it contains 'creditcard', the cancer loader otherwise.
            n_clusters: Number of clusters.

        Returns:
            dict: ``'inertia'`` (model inertia divided by the sample count,
            rounded to 1 decimal), ``'silh'`` (silhouette score rounded to
            3 decimals), ``'cluster_acc'`` (value of ``util.cluster_acc``
            for the true labels and the predicted clusters) and ``'score'``
            (``model.score(X)``).
        """
        if 'creditcard' in path:
            X, y = load_data.load_creditcard_data(path)
        else:
            X, y = load_data.load_cancer_data(path)

        # fit_predict() fits the model itself; the separate fit() call the
        # code used to make beforehand only repeated the same work. The
        # local is named `model` so it no longer shadows this function.
        model = KMeans(n_clusters=n_clusters, random_state=SEED)
        cluster_labels = model.fit_predict(X)

        n_samples = X.shape[0]
        silhouette_avg = silhouette_score(X, cluster_labels)
        y_pred = model.predict(X)
        cluster_acc = util.cluster_acc(y, y_pred)
        score = model.score(X)

        return {
            'inertia': round(model.inertia_ / n_samples, 1),
            'silh': round(silhouette_avg, 3),
            'cluster_acc': cluster_acc,
            'score': score
        }
Exemplo n.º 9
0
def rfe(dataset, n_components, save_to_file=False):
    """Select features with RFE (linear SVR) and report summary statistics.

    Args:
        dataset: ``'creditcard'`` selects the credit-card loader; any other
            value selects the cancer loader.
        n_components: Number of features to select.
        save_to_file: When true, the selected features plus a ``label``
            column are written to ``OUTPUT_DIR/<dataset>.csv``.

    Returns:
        tuple: ``(kurt, reconstruction_error)`` where ``kurt`` is the mean
        absolute kurtosis over the selected columns and
        ``reconstruction_error`` is the norm of
        ``X - inverse_transform(transform(X))`` divided by the number of
        samples.
    """
    loader = (load_data.load_creditcard_data
              if dataset == 'creditcard' else load_data.load_cancer_data)
    X, y = loader()

    # Features are eliminated one at a time (step=1).
    selector = RFE(SVR(kernel="linear"),
                   n_features_to_select=n_components,
                   step=1).fit(X, y)
    selected = selector.transform(X)

    # Kurtosis per selected column, then the mean of the absolute values.
    per_column_kurtosis = pd.DataFrame(selected).kurt(axis=0)
    mean_abs_kurtosis = per_column_kurtosis.abs().mean()

    # Reconstruction distance, normalised by the sample count.
    restored = selector.inverse_transform(selected)
    sample_count = X.shape[0]
    reconstruction_error = np.linalg.norm(X - restored) / sample_count

    if save_to_file:
        target_path = OUTPUT_DIR + os.sep + dataset + '.csv'
        frame = pd.DataFrame(selected)
        frame['label'] = y.values
        frame.to_csv(target_path)

    return mean_abs_kurtosis, reconstruction_error
Exemplo n.º 10
0
def gmm(dataset, n_components, n_init, max_iter, metric):
    """Fit a Gaussian mixture on a dataset and report its quality.

    Args:
        dataset: ``'creditcard'`` selects the credit-card loader; any other
            value selects the cancer loader.
        n_components: Number of mixture components.
        n_init: Number of initialisations passed to the mixture model.
        max_iter: Maximum number of iterations passed to the mixture model.
        metric: Distance metric passed to ``silhouette_score``.

    Returns:
        dict: ``'silh'`` (silhouette score rounded to 3 decimals),
        ``'cluster_acc'`` (value of ``util.cluster_acc`` for the true
        labels and the predicted components), ``'aic'`` and ``'bic'``
        (the model's AIC and BIC on ``X``).
    """
    if dataset == 'creditcard':
        X, y = load_data.load_creditcard_data()
    else:
        X, y = load_data.load_cancer_data()

    # fit_predict() fits the model itself; the separate fit() call the code
    # used to make beforehand repeated all n_init restarts a second time.
    model = mixture.GaussianMixture(n_components=n_components,
                                    n_init=n_init,
                                    max_iter=max_iter,
                                    random_state=SEED)
    cluster_labels = model.fit_predict(X)

    silhouette_avg = silhouette_score(X, cluster_labels, metric=metric)
    y_pred = model.predict(X)
    cluster_acc = util.cluster_acc(y, y_pred)

    return {
        'silh': round(silhouette_avg, 3),
        'cluster_acc': cluster_acc,
        'aic': model.aic(X),
        'bic': model.bic(X)
    }
Exemplo n.º 11
0
def kmeans(dataset, n_clusters, n_init, max_iter, metric):
    """Run K-means on a dataset and report clustering quality.

    Args:
        dataset: ``'creditcard'`` selects the credit-card loader; any other
            value selects the cancer loader.
        n_clusters: Number of clusters.
        n_init: Number of initialisations passed to ``KMeans``.
        max_iter: Maximum number of iterations passed to ``KMeans``.
        metric: Distance metric passed to ``silhouette_score``.

    Returns:
        dict: ``'inertia'`` (model inertia divided by the sample count,
        rounded to 1 decimal), ``'silh'`` (silhouette score rounded to 3
        decimals), ``'cluster_acc'`` (value of ``util.cluster_acc`` for the
        true labels and the predicted clusters) and ``'score'``
        (``model.score(X)``).
    """
    if dataset == 'creditcard':
        X, y = load_data.load_creditcard_data()
    else:
        X, y = load_data.load_cancer_data()

    # fit_predict() fits the model itself; the separate fit() call the code
    # used to make beforehand repeated all n_init restarts a second time.
    # The local is named `model` so it no longer shadows this function.
    model = KMeans(n_clusters=n_clusters,
                   n_init=n_init,
                   max_iter=max_iter,
                   random_state=SEED)
    cluster_labels = model.fit_predict(X)

    n_samples = X.shape[0]
    silhouette_avg = silhouette_score(X, cluster_labels, metric=metric)
    y_pred = model.predict(X)
    cluster_acc = util.cluster_acc(y, y_pred)
    score = model.score(X)

    return {
        'inertia': round(model.inertia_ / n_samples, 1),
        'silh': round(silhouette_avg, 3),
        'cluster_acc': cluster_acc,
        'score': score
    }