def kmeans(path, n_clusters=10):
    """Cluster a dataset with k-means and save one-hot cluster ids to CSV.

    The data is loaded with ``load_data.load_creditcard_data`` when
    ``'creditcard'`` occurs in ``path`` and with
    ``load_data.load_cancer_data`` otherwise. Each sample's cluster id is
    one-hot encoded, the loader's second return value is attached as a
    ``'label'`` column, and the frame is written to
    ``path.split('.csv')[0] + '_Kmeans_part5.csv'``.

    Args:
        path: Path of the input CSV file; also used to derive the output
            file name.
        n_clusters: Number of clusters to fit.

    Returns:
        None. The result is written to disk.
    """
    if 'creditcard' in path:
        X, y = load_data.load_creditcard_data(path)
    else:
        X, y = load_data.load_cancer_data(path)

    # fit_predict trains the model itself; calling fit() beforehand (as the
    # previous version did) only ran the clustering twice.
    model = KMeans(n_clusters=n_clusters, random_state=SEED)
    # The encoder needs a 2-D column, hence the reshape.
    cluster_labels = model.fit_predict(X).reshape(-1, 1)

    enc = OneHotEncoder()
    X_predict = pd.DataFrame(enc.fit_transform(cluster_labels).toarray())

    output_file = path.split('.csv')[0] + '_Kmeans_part5.csv'
    X_predict['label'] = y
    X_predict.to_csv(output_file)
def rp(dataset, n_components, save_to_file=False, seed=1):
    """Apply Gaussian random projection to a dataset and report metrics.

    Args:
        dataset: ``'creditcard'`` selects the credit-card loader; any other
            value selects the cancer loader.
        n_components: Number of components of the projection.
        save_to_file: When true, the projected data plus a ``'label'``
            column is written to ``OUTPUT_DIR + os.sep + dataset + '.csv'``.
        seed: ``random_state`` passed to ``GaussianRandomProjection``.

    Returns:
        Tuple ``(kurt, reconstruction_error)``: the mean of the absolute
        per-column kurtosis of the projected data, and the value returned
        by ``util.reconstructionError(model, X)``.
    """
    loader = (
        load_data.load_creditcard_data
        if dataset == 'creditcard'
        else load_data.load_cancer_data
    )
    X, y = loader()

    projector = GaussianRandomProjection(
        n_components=n_components,
        random_state=seed,
    )
    projector.fit(X)
    projected = pd.DataFrame(projector.transform(X))

    # Mean absolute kurtosis across the projected columns.
    kurt = projected.kurt(axis=0).abs().mean()
    reconstruction_error = util.reconstructionError(projector, X)

    if save_to_file:
        # The label column is added only after the kurtosis was computed.
        projected['label'] = y.values
        projected.to_csv(OUTPUT_DIR + os.sep + dataset + '.csv')

    return kurt, reconstruction_error
def ica(dataset, n_components, save_to_file=False):
    """Run FastICA on a dataset and report reconstruction error and kurtosis.

    Args:
        dataset: ``'creditcard'`` selects the credit-card loader; any other
            value selects the cancer loader.
        n_components: Number of independent components to extract.
        save_to_file: When true, the transformed data plus a ``'label'``
            column is written to ``OUTPUT_DIR + os.sep + dataset + '.csv'``.

    Returns:
        Tuple ``(dist, kurt)``: the norm of ``X - inverse_transform(...)``
        divided by the number of samples, and the mean of the absolute
        per-column kurtosis of the transformed data.
    """
    loader = (
        load_data.load_creditcard_data
        if dataset == 'creditcard'
        else load_data.load_cancer_data
    )
    X, y = loader()

    transformer = FastICA(n_components=n_components, random_state=SEED)
    transformer.fit(X)

    components = transformer.transform(X)
    restored = transformer.inverse_transform(components)
    # Reconstruction error, normalised by the sample count.
    dist = np.linalg.norm(X - restored) / X.shape[0]

    frame = pd.DataFrame(components)
    kurt = frame.kurt(axis=0).abs().mean()

    if save_to_file:
        # The label column is added only after the kurtosis was computed.
        frame['label'] = y.values
        frame.to_csv(OUTPUT_DIR + os.sep + dataset + '.csv')

    return dist, kurt
def GMM(path, n_clusters):
    """Fit a Gaussian mixture to a dataset and return clustering metrics.

    The data is loaded with ``load_data.load_creditcard_data`` when
    ``'creditcard'`` occurs in ``path`` and with
    ``load_data.load_cancer_data`` otherwise.

    Args:
        path: Path of the input file, forwarded to the loader.
        n_clusters: Number of mixture components.

    Returns:
        Dict with keys ``'silh'`` (silhouette score rounded to 3 decimals),
        ``'cluster_acc'`` (result of ``util.cluster_acc(y, y_pred)``),
        ``'aic'`` and ``'bic'`` (``model.aic(X)`` / ``model.bic(X)``).
    """
    if 'creditcard' in path:
        X, y = load_data.load_creditcard_data(path)
    else:
        X, y = load_data.load_cancer_data(path)

    # fit_predict trains the model itself; the extra fit() the previous
    # version ran first only repeated the training.
    model = mixture.GaussianMixture(n_components=n_clusters, random_state=SEED)
    cluster_labels = model.fit_predict(X)
    silhouette_avg = silhouette_score(X, cluster_labels)

    y_pred = model.predict(X)
    cluster_acc = util.cluster_acc(y, y_pred)

    return {
        'silh': round(silhouette_avg, 3),
        'cluster_acc': cluster_acc,
        'aic': model.aic(X),
        'bic': model.bic(X),
    }
def pca(dataset, n_components, save_to_file=False):
    """Run PCA on a dataset and report fit and reconstruction metrics.

    Args:
        dataset: ``'creditcard'`` selects the credit-card loader; any other
            value selects the cancer loader.
        n_components: ``n_components`` passed to ``PCA``.
        save_to_file: When true, the transformed data plus a ``'label'``
            column is written to ``OUTPUT_DIR + os.sep + dataset + '.csv'``.

    Returns:
        Tuple ``(score, variance, cumsum, dist)``: ``model.score(X)`` (the
        average log-likelihood of all samples), the explained variance
        ratio per component, its cumulative sum, and the norm of
        ``X - inverse_transform(...)`` divided by the number of samples.
    """
    loader = (
        load_data.load_creditcard_data
        if dataset == 'creditcard'
        else load_data.load_cancer_data
    )
    X, y = loader()

    reducer = PCA(n_components=n_components, random_state=SEED)
    reducer.fit(X)

    # Average log-likelihood of all samples.
    score = reducer.score(X)
    variance = reducer.explained_variance_ratio_
    cumsum = np.cumsum(reducer.explained_variance_ratio_)

    reduced = reducer.transform(X)
    restored = reducer.inverse_transform(reduced)
    # Reconstruction error, normalised by the sample count.
    dist = np.linalg.norm(X - restored) / X.shape[0]

    if save_to_file:
        frame = pd.DataFrame(reduced)
        frame['label'] = y.values
        frame.to_csv(OUTPUT_DIR + os.sep + dataset + '.csv')

    return score, variance, cumsum, dist
def kmeans(path, n_clusters):
    """Fit k-means to a dataset and return clustering metrics.

    The data is loaded with ``load_data.load_creditcard_data`` when
    ``'creditcard'`` occurs in ``path`` and with
    ``load_data.load_cancer_data`` otherwise.

    Args:
        path: Path of the input file, forwarded to the loader.
        n_clusters: Number of clusters to fit.

    Returns:
        Dict with keys ``'inertia'`` (inertia divided by the number of
        samples, rounded to 1 decimal), ``'silh'`` (silhouette score rounded
        to 3 decimals), ``'cluster_acc'`` (result of
        ``util.cluster_acc(y, y_pred)``) and ``'score'``
        (``model.score(X)``).
    """
    if 'creditcard' in path:
        X, y = load_data.load_creditcard_data(path)
    else:
        X, y = load_data.load_cancer_data(path)

    n_samples = X.shape[0]

    # fit_predict trains the model itself; the extra fit() the previous
    # version ran first only repeated the clustering. The estimator is named
    # ``model`` so it no longer shadows this function's own name.
    model = KMeans(n_clusters=n_clusters, random_state=SEED)
    cluster_labels = model.fit_predict(X)
    silhouette_avg = silhouette_score(X, cluster_labels)

    y_pred = model.predict(X)
    cluster_acc = util.cluster_acc(y, y_pred)
    score = model.score(X)

    return {
        'inertia': round(model.inertia_ / n_samples, 1),
        'silh': round(silhouette_avg, 3),
        'cluster_acc': cluster_acc,
        'score': score,
    }
def rfe(dataset, n_components, save_to_file=False):
    """Select features with RFE over a linear SVR and report metrics.

    Args:
        dataset: ``'creditcard'`` selects the credit-card loader; any other
            value selects the cancer loader.
        n_components: Number of features to keep
            (``n_features_to_select``).
        save_to_file: When true, the selected features plus a ``'label'``
            column are written to ``OUTPUT_DIR + os.sep + dataset + '.csv'``.

    Returns:
        Tuple ``(kurt, reconstruction_error)``: the mean of the absolute
        per-column kurtosis of the selected features, and the norm of
        ``X - inverse_transform(...)`` divided by the number of samples.
    """
    loader = (
        load_data.load_creditcard_data
        if dataset == 'creditcard'
        else load_data.load_cancer_data
    )
    X, y = loader()

    selector = RFE(
        SVR(kernel="linear"),
        n_features_to_select=n_components,
        step=1,
    )
    selector = selector.fit(X, y)

    selected = selector.transform(X)
    frame = pd.DataFrame(selected)
    kurt = frame.kurt(axis=0).abs().mean()

    restored = selector.inverse_transform(selected)
    # Reconstruction error, normalised by the sample count.
    reconstruction_error = np.linalg.norm(X - restored) / X.shape[0]

    if save_to_file:
        # The label column is added only after the kurtosis was computed.
        frame['label'] = y.values
        frame.to_csv(OUTPUT_DIR + os.sep + dataset + '.csv')

    return kurt, reconstruction_error
def gmm(dataset, n_components, n_init, max_iter, metric):
    """Fit a Gaussian mixture to a dataset and return clustering metrics.

    Args:
        dataset: ``'creditcard'`` selects the credit-card loader; any other
            value selects the cancer loader.
        n_components: Number of mixture components.
        n_init: ``n_init`` passed to ``GaussianMixture``.
        max_iter: ``max_iter`` passed to ``GaussianMixture``.
        metric: ``metric`` passed to ``silhouette_score``.

    Returns:
        Dict with keys ``'silh'`` (silhouette score rounded to 3 decimals),
        ``'cluster_acc'`` (result of ``util.cluster_acc(y, y_pred)``),
        ``'aic'`` and ``'bic'`` (``model.aic(X)`` / ``model.bic(X)``).
    """
    if dataset == 'creditcard':
        X, y = load_data.load_creditcard_data()
    else:
        X, y = load_data.load_cancer_data()

    # fit_predict trains the model itself; the extra fit() the previous
    # version ran first only repeated the training.
    model = mixture.GaussianMixture(
        n_components=n_components,
        n_init=n_init,
        max_iter=max_iter,
        random_state=SEED,
    )
    cluster_labels = model.fit_predict(X)
    silhouette_avg = silhouette_score(X, cluster_labels, metric=metric)

    y_pred = model.predict(X)
    cluster_acc = util.cluster_acc(y, y_pred)

    return {
        'silh': round(silhouette_avg, 3),
        'cluster_acc': cluster_acc,
        'aic': model.aic(X),
        'bic': model.bic(X),
    }
def kmeans(dataset, n_clusters, n_init, max_iter, metric):
    """Fit k-means to a dataset and return clustering metrics.

    Args:
        dataset: ``'creditcard'`` selects the credit-card loader; any other
            value selects the cancer loader.
        n_clusters: Number of clusters to fit.
        n_init: ``n_init`` passed to ``KMeans``.
        max_iter: ``max_iter`` passed to ``KMeans``.
        metric: ``metric`` passed to ``silhouette_score``.

    Returns:
        Dict with keys ``'inertia'`` (inertia divided by the number of
        samples, rounded to 1 decimal), ``'silh'`` (silhouette score rounded
        to 3 decimals), ``'cluster_acc'`` (result of
        ``util.cluster_acc(y, y_pred)``) and ``'score'``
        (``model.score(X)``).
    """
    if dataset == 'creditcard':
        X, y = load_data.load_creditcard_data()
    else:
        X, y = load_data.load_cancer_data()

    n_samples = X.shape[0]

    # fit_predict trains the model itself; the extra fit() the previous
    # version ran first only repeated the clustering. The estimator is named
    # ``model`` so it no longer shadows this function's own name.
    model = KMeans(
        n_clusters=n_clusters,
        n_init=n_init,
        max_iter=max_iter,
        random_state=SEED,
    )
    cluster_labels = model.fit_predict(X)
    silhouette_avg = silhouette_score(X, cluster_labels, metric=metric)

    y_pred = model.predict(X)
    cluster_acc = util.cluster_acc(y, y_pred)
    score = model.score(X)

    return {
        'inertia': round(model.inertia_ / n_samples, 1),
        'silh': round(silhouette_avg, 3),
        'cluster_acc': cluster_acc,
        'score': score,
    }