예제 #1
0
    def kmeans(path, n_clusters=10):
        """Cluster a dataset with KMeans and save one-hot cluster ids to CSV.

        The dataset is loaded from ``path``: the credit-card loader is used
        when ``'creditcard'`` appears in the path, the cancer loader
        otherwise. Each sample's cluster assignment is one-hot encoded, the
        loaded labels are stored in a ``label`` column, and the table is
        written to the part of ``path`` before its first ``'.csv'`` with
        ``'_Kmeans_part5.csv'`` appended.

        Args:
            path: Path of the input CSV file.
            n_clusters: Number of clusters to fit.
        """
        if 'creditcard' in path:
            X, y = load_data.load_creditcard_data(path)
        else:
            X, y = load_data.load_cancer_data(path)

        # Fit exactly once. The previous version called .fit(X) and then
        # refitted the same estimator through fit_predict(X), doubling the
        # training cost. Presumably SEED is a fixed integer seed, in which
        # case the second fit merely reproduced the first.
        model = KMeans(n_clusters=n_clusters, random_state=SEED)
        cluster_labels = model.fit_predict(X).reshape(-1, 1)

        # One indicator column per cluster id; the encoder output is
        # converted with toarray() before building the frame.
        enc = OneHotEncoder()
        enc.fit(cluster_labels)
        X_predict = pd.DataFrame(enc.transform(cluster_labels).toarray())

        output_file = path.split('.csv')[0] + '_Kmeans_part5.csv'
        X_predict['label'] = y
        X_predict.to_csv(output_file)
예제 #2
0
def rp(dataset, n_components, save_to_file=False, seed=1):
    """Project a dataset with Gaussian random projection and score the result.

    Args:
        dataset: ``'creditcard'`` selects the credit-card loader; any other
            value selects the cancer loader.
        n_components: Number of components of the projection.
        save_to_file: When true, write the projected data plus a ``label``
            column to ``OUTPUT_DIR + os.sep + dataset + '.csv'``.
        seed: Random state passed to the projection.

    Returns:
        Tuple ``(kurt, reconstruction_error)``: the mean of the absolute
        per-column kurtosis of the projected data, and the value returned by
        ``util.reconstructionError(model, X)``.
    """
    loader = (load_data.load_creditcard_data if dataset == 'creditcard'
              else load_data.load_cancer_data)
    X, y = loader()

    projector = GaussianRandomProjection(n_components=n_components,
                                         random_state=seed)
    projector.fit(X)
    projected = projector.transform(X)

    # Mean absolute kurtosis across the projected columns.
    mean_abs_kurtosis = pd.DataFrame(projected).kurt(axis=0).abs().mean()
    reconstruction_error = util.reconstructionError(projector, X)

    if save_to_file:
        out_frame = pd.DataFrame(projected)
        out_frame['label'] = y.values
        out_frame.to_csv(OUTPUT_DIR + os.sep + dataset + '.csv')

    return mean_abs_kurtosis, reconstruction_error
예제 #3
0
def ica(dataset, n_components, save_to_file=False):
    """Run FastICA on a dataset and report reconstruction error and kurtosis.

    Args:
        dataset: ``'creditcard'`` selects the credit-card loader; any other
            value selects the cancer loader.
        n_components: Number of independent components to extract.
        save_to_file: When true, write the transformed data plus a ``label``
            column to ``OUTPUT_DIR + os.sep + dataset + '.csv'``.

    Returns:
        Tuple ``(dist, kurt)``: the norm of ``X`` minus its reconstruction
        divided by the number of rows of ``X``, and the mean of the absolute
        per-column kurtosis of the transformed data.
    """
    loader = (load_data.load_creditcard_data if dataset == 'creditcard'
              else load_data.load_cancer_data)
    X, y = loader()

    transformer = FastICA(n_components=n_components, random_state=SEED)
    transformer.fit(X)

    components = transformer.transform(X)
    reconstructed = transformer.inverse_transform(components)

    # Reconstruction residual norm, scaled by the sample count.
    row_count = X.shape[0]
    dist = np.linalg.norm(X - reconstructed) / row_count

    # Mean absolute kurtosis across the extracted components.
    mean_abs_kurtosis = pd.DataFrame(components).kurt(axis=0).abs().mean()

    if save_to_file:
        out_frame = pd.DataFrame(components)
        out_frame['label'] = y.values
        out_frame.to_csv(OUTPUT_DIR + os.sep + dataset + '.csv')

    return dist, mean_abs_kurtosis
예제 #4
0
    def GMM(path, n_clusters):
        """Fit a Gaussian mixture to a dataset and return clustering metrics.

        The dataset is loaded from ``path``: the credit-card loader is used
        when ``'creditcard'`` appears in the path, the cancer loader
        otherwise.

        Args:
            path: Path of the input data file.
            n_clusters: Number of mixture components.

        Returns:
            dict with keys ``'silh'`` (silhouette score rounded to 3
            decimals), ``'cluster_acc'`` (result of ``util.cluster_acc`` on
            the true labels and the predicted components), ``'aic'`` and
            ``'bic'`` (the model's AIC and BIC on ``X``).
        """
        if 'creditcard' in path:
            X, y = load_data.load_creditcard_data(path)
        else:
            X, y = load_data.load_cancer_data(path)

        # Fit exactly once. The previous version called .fit(X) and then
        # refitted the same estimator through fit_predict(X), doubling the
        # EM cost. Presumably SEED is a fixed integer seed, in which case the
        # second fit merely reproduced the first.
        model = mixture.GaussianMixture(n_components=n_clusters,
                                        random_state=SEED)
        cluster_labels = model.fit_predict(X)

        silhouette_avg = silhouette_score(X, cluster_labels)
        # Kept as a separate predict() call so the accuracy is computed from
        # the same source of labels as before.
        y_pred = model.predict(X)
        cluster_acc = util.cluster_acc(y, y_pred)

        return {
            'silh': round(silhouette_avg, 3),
            'cluster_acc': cluster_acc,
            'aic': model.aic(X),
            'bic': model.bic(X)
        }
예제 #5
0
def pca(dataset, n_components, save_to_file=False):
    """Run PCA on a dataset and report fit statistics.

    Args:
        dataset: ``'creditcard'`` selects the credit-card loader; any other
            value selects the cancer loader.
        n_components: Number of principal components to keep.
        save_to_file: When true, write the transformed data plus a ``label``
            column to ``OUTPUT_DIR + os.sep + dataset + '.csv'``.

    Returns:
        Tuple ``(score, variance, cumsum, dist)``: the value of
        ``model.score(X)``, the explained variance ratio per component, its
        cumulative sum, and the norm of ``X`` minus its reconstruction
        divided by the number of rows of ``X``.
    """
    loader = (load_data.load_creditcard_data if dataset == 'creditcard'
              else load_data.load_cancer_data)
    X, y = loader()

    reducer = PCA(n_components=n_components, random_state=SEED)
    reducer.fit(X)

    # PCA.score gives the average log-likelihood of the samples.
    score = reducer.score(X)
    variance = reducer.explained_variance_ratio_
    cumsum = np.cumsum(reducer.explained_variance_ratio_)

    projected = reducer.transform(X)
    reconstructed = reducer.inverse_transform(projected)

    # Reconstruction residual norm, scaled by the sample count.
    row_count = X.shape[0]
    dist = np.linalg.norm(X - reconstructed) / row_count

    if save_to_file:
        out_frame = pd.DataFrame(projected)
        out_frame['label'] = y.values
        out_frame.to_csv(OUTPUT_DIR + os.sep + dataset + '.csv')

    return score, variance, cumsum, dist
예제 #6
0
    def kmeans(path, n_clusters):
        """Fit KMeans to a dataset and return clustering metrics.

        The dataset is loaded from ``path``: the credit-card loader is used
        when ``'creditcard'`` appears in the path, the cancer loader
        otherwise.

        Args:
            path: Path of the input data file.
            n_clusters: Number of clusters to fit.

        Returns:
            dict with keys ``'inertia'`` (model inertia divided by the number
            of rows of ``X``, rounded to 1 decimal), ``'silh'`` (silhouette
            score rounded to 3 decimals), ``'cluster_acc'`` (result of
            ``util.cluster_acc`` on the true labels and the predicted
            clusters) and ``'score'`` (the value of ``model.score(X)``).
        """
        if 'creditcard' in path:
            X, y = load_data.load_creditcard_data(path)
        else:
            X, y = load_data.load_cancer_data(path)

        # Fit exactly once. The previous version called .fit(X) and then
        # refitted the same estimator through fit_predict(X), doubling the
        # training cost. Presumably SEED is a fixed integer seed, in which
        # case the second fit merely reproduced the first. The estimator is
        # also no longer bound to a local that shadows this function's name.
        model = KMeans(n_clusters=n_clusters, random_state=SEED)
        cluster_labels = model.fit_predict(X)
        n_samples = X.shape[0]

        silhouette_avg = silhouette_score(X, cluster_labels)
        # Kept as a separate predict() call so the accuracy is computed from
        # the same source of labels as before.
        y_pred = model.predict(X)
        cluster_acc = util.cluster_acc(y, y_pred)
        score = model.score(X)

        return {
            'inertia': round(model.inertia_ / n_samples, 1),
            'silh': round(silhouette_avg, 3),
            'cluster_acc': cluster_acc,
            'score': score
        }
예제 #7
0
def rfe(dataset, n_components, save_to_file=False):
    """Select features with RFE over a linear SVR and score the selection.

    Args:
        dataset: ``'creditcard'`` selects the credit-card loader; any other
            value selects the cancer loader.
        n_components: Number of features to keep
            (passed as ``n_features_to_select``).
        save_to_file: When true, write the selected features plus a ``label``
            column to ``OUTPUT_DIR + os.sep + dataset + '.csv'``.

    Returns:
        Tuple ``(kurt, reconstruction_error)``: the mean of the absolute
        per-column kurtosis of the selected features, and the norm of ``X``
        minus its inverse-transformed reconstruction divided by the number
        of rows of ``X``.
    """
    loader = (load_data.load_creditcard_data if dataset == 'creditcard'
              else load_data.load_cancer_data)
    X, y = loader()

    # Recursive feature elimination, dropping one feature per step.
    selector = RFE(SVR(kernel="linear"),
                   n_features_to_select=n_components,
                   step=1)
    selector = selector.fit(X, y)

    row_count = X.shape[0]
    selected = selector.transform(X)

    # Mean absolute kurtosis across the retained feature columns.
    mean_abs_kurtosis = pd.DataFrame(selected).kurt(axis=0).abs().mean()

    restored = selector.inverse_transform(selected)
    reconstruction_error = np.linalg.norm(X - restored) / row_count

    if save_to_file:
        out_frame = pd.DataFrame(selected)
        out_frame['label'] = y.values
        out_frame.to_csv(OUTPUT_DIR + os.sep + dataset + '.csv')

    return mean_abs_kurtosis, reconstruction_error
예제 #8
0
def gmm(dataset, n_components, n_init, max_iter, metric):
    """Fit a Gaussian mixture to a dataset and return clustering metrics.

    Args:
        dataset: ``'creditcard'`` selects the credit-card loader; any other
            value selects the cancer loader.
        n_components: Number of mixture components.
        n_init: Number of initializations passed to the mixture model.
        max_iter: Maximum number of iterations passed to the mixture model.
        metric: Distance metric passed to ``silhouette_score``.

    Returns:
        dict with keys ``'silh'`` (silhouette score rounded to 3 decimals),
        ``'cluster_acc'`` (result of ``util.cluster_acc`` on the true labels
        and the predicted components), ``'aic'`` and ``'bic'`` (the model's
        AIC and BIC on ``X``).
    """
    if dataset == 'creditcard':
        X, y = load_data.load_creditcard_data()
    else:
        X, y = load_data.load_cancer_data()

    # Fit exactly once. The previous version called .fit(X) and then refitted
    # the same estimator through fit_predict(X), doubling the EM cost (each
    # fit runs n_init initializations). Presumably SEED is a fixed integer
    # seed, in which case the second fit merely reproduced the first.
    model = mixture.GaussianMixture(n_components=n_components,
                                    n_init=n_init,
                                    max_iter=max_iter,
                                    random_state=SEED)
    cluster_labels = model.fit_predict(X)

    silhouette_avg = silhouette_score(X, cluster_labels, metric=metric)
    # Kept as a separate predict() call so the accuracy is computed from the
    # same source of labels as before.
    y_pred = model.predict(X)
    cluster_acc = util.cluster_acc(y, y_pred)

    return {
        'silh': round(silhouette_avg, 3),
        'cluster_acc': cluster_acc,
        'aic': model.aic(X),
        'bic': model.bic(X)
    }
예제 #9
0
def kmeans(dataset, n_clusters, n_init, max_iter, metric):
    """Fit KMeans to a dataset and return clustering metrics.

    Args:
        dataset: ``'creditcard'`` selects the credit-card loader; any other
            value selects the cancer loader.
        n_clusters: Number of clusters to fit.
        n_init: Number of initializations passed to KMeans.
        max_iter: Maximum number of iterations passed to KMeans.
        metric: Distance metric passed to ``silhouette_score``.

    Returns:
        dict with keys ``'inertia'`` (model inertia divided by the number of
        rows of ``X``, rounded to 1 decimal), ``'silh'`` (silhouette score
        rounded to 3 decimals), ``'cluster_acc'`` (result of
        ``util.cluster_acc`` on the true labels and the predicted clusters)
        and ``'score'`` (the value of ``model.score(X)``).
    """
    if dataset == 'creditcard':
        X, y = load_data.load_creditcard_data()
    else:
        X, y = load_data.load_cancer_data()

    # Fit exactly once. The previous version called .fit(X) and then refitted
    # the same estimator through fit_predict(X), doubling the training cost
    # (each fit runs n_init initializations). Presumably SEED is a fixed
    # integer seed, in which case the second fit merely reproduced the first.
    # The estimator is also no longer bound to a local that shadows this
    # function's name.
    model = KMeans(n_clusters=n_clusters,
                   n_init=n_init,
                   max_iter=max_iter,
                   random_state=SEED)
    cluster_labels = model.fit_predict(X)
    n_samples = X.shape[0]

    silhouette_avg = silhouette_score(X, cluster_labels, metric=metric)
    # Kept as a separate predict() call so the accuracy is computed from the
    # same source of labels as before.
    y_pred = model.predict(X)
    cluster_acc = util.cluster_acc(y, y_pred)
    score = model.score(X)

    return {
        'inertia': round(model.inertia_ / n_samples, 1),
        'silh': round(silhouette_avg, 3),
        'cluster_acc': cluster_acc,
        'score': score
    }