Example #1
    def visualize_clusters(self, x, y, dataset):
        """Visualize clusters with PCA and TSNE.

            Args:
               x (ndarray): data.
               y (ndarray): true labels.
               dataset (string): dataset, WDBC or MNIST.

            Returns:
              None.
            """

        # Declare PCA and reduce data
        pca = PCA(n_components=2, random_state=self.random_seed)
        x_pca = pca.fit_transform(x)

        # Declare TSNE and reduce data
        tsne = TSNE(n_components=2, random_state=self.random_seed)
        x_tsne = tsne.fit_transform(x)

        n_classes = len(np.unique(y))  # compute number of classes
        print('\nBenchmark Model with k = n classes = {}'.format(n_classes))

        # Benchmark the model with number of clusters (k) = number of classes
        model = clone(self.model)
        model_params = self.model.get_params()
        model_params[self.name_param] = n_classes
        model.set_params(**model_params)
        clusters = model.fit_predict(x)
        self.benchmark(x, y, clusters)

        # Create dataframe for visualization
        df = pd.DataFrame(x_tsne, columns=['tsne1', 'tsne2'])
        df['pca1'] = x_pca[:, 0]
        df['pca2'] = x_pca[:, 1]
        df['y'] = y
        df['c'] = self.clusters

        # Create subplot and plot clusters with PCA and TSNE
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 8))
        utils.plot_clusters(ax1, 'pca1', 'pca2', df, self.name)
        utils.plot_clusters(ax2, 'tsne1', 'tsne2', df, self.name)

        # Save figure
        utils.save_figure_tight('{}_{}_clusters'.format(dataset, self.name))
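The utils.plot_clusters helper called above is not shown. A minimal sketch of a compatible implementation, assuming it scatters the given dataframe columns colored by the 'c' cluster column built above (only the signature is taken from the call sites; the body is an assumption):

def plot_clusters(ax, xcol, ycol, df, name):
    # Scatter each cluster in its own color and label the axes.
    for cluster_id, group in df.groupby('c'):
        ax.scatter(group[xcol], group[ycol], s=10,
                   label='cluster {}'.format(cluster_id))
    ax.set_xlabel(xcol)
    ax.set_ylabel(ycol)
    ax.set_title(name)
    ax.legend()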
Example #2
def main():
    VECTORIZER_TYPE = "tf-idf"
    MAX_FEATURES = 50000
    SVM_TYPE = "linear"
    C = 1.1
    X, y = utils.read_corpus(c.FAKE_CORPUS, c.TRUTH_CORPUS)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=42,
                                                        shuffle=True,
                                                        stratify=y)
    model1, vectorizer = train_svm(X_train,
                                   y_train,
                                   VECTORIZER_TYPE,
                                   max_features=MAX_FEATURES,
                                   type=SVM_TYPE,
                                   c=C,
                                   max_iter=10000)
    evaluation.evaluate_linear(model1, vectorizer, X_test, y_test)
    utils.plot_clusters(X_test, vectorizer)
Example #3
def kmeans_run_all():
    pd.set_option('display.expand_frame_repr', True)
    pd.set_option('display.max_rows', 100)
    np.set_printoptions(precision=3, floatmode='fixed')
    for fn in c.ALL:
        k = c.ks[fn]
        t = 1
        df, class_id = parse_csv(fn)
        clusters, centroids = kmeans(df, k, t)
        results = evaluate_clusters(clusters, centroids, verbose=False)
        totals = results.sum()
        totals.name = c.TOTALS
        results = pd.concat([results, totals.to_frame().T])
        sfn = strip_file_path(fn)
        print(f'\nSummary - {sfn}')
        print(results)
        for idx, (cluster, centroid) in enumerate(zip(clusters, centroids)):
            print(f'\nCluster {idx + 1}')
            print(f'Centroid: {centroid}')
            print(cluster)
        if 2 <= clusters[0].shape[1] <= 3:
            plot_clusters([df], np.array([df.mean().values]), f'kmeans {sfn}')
            plot_clusters(clusters, centroids, f'kmeans clustered {sfn}')
Example #4
    def _run_unsupervised_clustering(self, visualize: bool = False):
        num_clusters = self._detect_num_clusters()

        _, _, _, training_messages = self._generate_funcs_contexts_messages(1000)

        k_means = cluster.KMeans(n_clusters=num_clusters)
        training_labels = k_means.fit_predict(training_messages)

        if visualize:
            utils.plot_clusters(training_messages, training_labels, "Messages clusters")

        # Align cluster ids with function/message ids:
        # generate messages for each function, then pair each cluster id
        # with the function most common in it.
        (
            alignment_func_selectors,
            _,
            _,
            alignment_messages,
        ) = self._generate_funcs_contexts_messages(1000)
        alignment_func_idxs = alignment_func_selectors.argmax(dim=1)
        alignment_labels = k_means.predict(alignment_messages)

        func_counts_per_cluster = collections.defaultdict(collections.Counter)
        for i, cluster_label in enumerate(alignment_labels):
            function_idx = alignment_func_idxs[i]
            func_counts_per_cluster[cluster_label][function_idx] += 1

        cluster_label_to_func_idx = {
            cluster_label: func_counts.most_common(1)[0][0]
            for cluster_label, func_counts in func_counts_per_cluster.items()
        }

        assert len(cluster_label_to_func_idx) == num_clusters

        self.clustering_model = k_means
        self.cluster_label_to_func_idx = cluster_label_to_func_idx
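Once clustering_model and cluster_label_to_func_idx are stored, downstream code can turn new messages into function indices. A hypothetical usage sketch (new_messages and the surrounding object are assumptions, not shown in the snippet):

labels = self.clustering_model.predict(new_messages)
func_idxs = [self.cluster_label_to_func_idx[label] for label in labels]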
Example #5
try:
    opts, args = getopt.getopt(sys.argv[1:], "f:k:t:", ["file=", "threshold="])
except getopt.GetoptError:
    print(error_msg)
    sys.exit(2)

input_filename = None
K = 0
threshold = 0.01

for opt, arg in opts:
    if opt in ('-f', '--file'):
        input_filename = arg
    elif opt == '-k':
        K = int(arg)
    elif opt in ('-t', '--threshold'):
        threshold = float(arg)

if input_filename is None or K == 0:
    print(error_msg)
    sys.exit(2)

input_points = utils.read_points(input_filename)
clusterization = lloyd_kmeans(input_points, K, threshold)

centroids = clusterization[0]
clusters = clusterization[1]

print "centroids:\n {}".format(centroids)

utils.plot_clusters(centroids, clusters)
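Assuming the snippet lives in a script such as kmeans_lloyd.py (the file name is hypothetical; error_msg and the imports are defined in the part of the file not shown), a typical invocation would be:

python kmeans_lloyd.py -f points.txt -k 3 -t 0.01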
Example #6
# Task 1 Generate
easy_y = scatter_clusters([[0.5, 0.7], [1.5, 0.7], [1, 1.7]], [0.2, 0.2],
                          N_POINTS)

medium_y = scatter_clusters([[0.5, 0.7], [1.5, 0.7], [1, 1.7]], [0.55, 0.55],
                            N_POINTS)

hard_y = scatter_clusters([[0.5, 0.7], [1.5, 0.7], [1, 1.7]], [0.75, 0.75],
                          N_POINTS)

y_true = {
    i: [idx for idx in range(i * N_POINTS, N_POINTS + i * N_POINTS)]
    for i in range(3)
}

# Task 1 Plot
fig = plt.figure(figsize=FIG_SIZE)
easy_plot = plot_clusters(fig, easy_y, y_true)
easy_plot.savefig("plots/easy_true.pdf", bbox_inches='tight')

fig = plt.figure(figsize=FIG_SIZE)
medium_plot = plot_clusters(fig, medium_y, y_true)
medium_plot.savefig("plots/medium_true.pdf", bbox_inches='tight')

fig = plt.figure(figsize=FIG_SIZE)
hard_plot = plot_clusters(fig, hard_y, y_true)
hard_plot.savefig("plots/hard_true.pdf", bbox_inches='tight')

# plt.show()  # plots only the last problem; move up to see the others
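scatter_clusters itself is not shown. A minimal sketch consistent with the call sites, assuming the second argument is the per-dimension standard deviation and cluster i occupies rows [i * N_POINTS, (i + 1) * N_POINTS) as the y_true index ranges above expect (the implementation is a guess):

import numpy as np

def scatter_clusters(centers, stds, n_points):
    # One Gaussian blob of n_points 2-D samples per center,
    # stacked so cluster i occupies a contiguous block of rows.
    return np.vstack([np.random.normal(loc=center, scale=stds,
                                       size=(n_points, 2))
                      for center in centers])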
Example #7
importlib.reload(utils)

# Load the data
path = os.getcwd() + "/data/"
data_train = pd.read_table(path + "EMGaussian.data", header=None, sep=" ")
data_test = pd.read_table(path + "EMGaussian.test", header=None, sep=" ")
x = data_train.values.T
xtest = data_test.values.T
xall = np.concatenate((x, xtest), axis=1)

# Run k-means
k = 4
mus, z = kmeans.iterate_kmeans(x, k, nits=100, epsilon=0.001)
# Plot clusters and centers
fig1, ax1 = plt.subplots()
utils.plot_clusters(x, mus, z, ax1)
plt.title("K-means clustering on training data")

# Compare several runs of k-means with different random initializations
centers, objectives = kmeans.compare_several_runs(x,
                                                  k,
                                                  nsims=100,
                                                  nits=100,
                                                  epsilon=0.001)
# Plot the different centers obtained
kmeans.plot_centroids(centers, k)
# Plot histogram of distortion values
plt.hist(objectives)

# Run EM with covariance matrices proportional to the identity matrix
# Initialization with kmeans
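The snippet is cut off before the EM step it announces. A rough sketch of the isotropic initialization the last two comments describe, assuming x has shape (2, n), mus has shape (2, k), and z holds one integer cluster assignment per point (all of these shapes are inferences, not confirmed by the snippet):

pis = np.array([(z == j).mean() for j in range(k)])     # mixing weights from cluster sizes
sigma2 = np.array([((x[:, z == j] - mus[:, [j]]) ** 2).mean()
                   for j in range(k)])                  # one isotropic variance per component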
Example #8
def main():
    args = parse_arguments(sys.argv[1:])

    print("Parameters:")
    for arg_ in args.sys_args:
        print(arg_)
    print()

    # read data
    # =========

    hapt_data = data.HAPT()
    hapt_data.load_all_data()
    hapt_data.aggregate_groups()

    exp_data = hapt_data.get_train_data()
    exp_labs = hapt_data.get_train_labels()
    exp_labels_map = hapt_data.get_labels_map()
    exp_centroids_num = len(hapt_data.get_labels_map())

    if args.data == "test":
        exp_data = hapt_data.get_test_data()
        exp_labs = hapt_data.get_test_labels()
        exp_centroids_num = len(hapt_data.get_labels_map())

    if args.aggregate:
        exp_labs = hapt_data.get_aggregated_train_labels()
        exp_labels_map = hapt_data.get_aggregated_labels_map()
        exp_centroids_num = len(hapt_data.get_aggregated_labels_map())
        if args.data == "test":
            exp_labs = hapt_data.get_aggregated_test_labels()

    # Show experiment data
    # ====================

    if args.showdata:
        utils.plot_clusters(exp_data, exp_labs, exp_labels_map, True)
        return

    # evolution
    # =========

    iterations_list, scores_list, populations_list = [], [], []
    total_time_list, log_dir_list, best_indiv_idx_list = [], [], []
    # best_overall holds (score, experiment, generation (iteration), individual)
    best_overall = (-1, 0, 0, 0)

    for exp_i in range(args.repeat):
        iterations, scores, populations, total_time, log_dir, best_indiv_idx = evolution.run_SGA(
            args.iter_num,
            exp_data,
            exp_labs,
            args.pop_num,
            args.prob_cross,
            args.prob_mutation,
            exp_centroids_num,
            args.adapt_function,
            args.dist_measure,
            log_dir="logs",
            loggin_pref="exp {}/{}: ".format(exp_i + 1, args.repeat))
        cur_best_score = scores[best_indiv_idx[0], best_indiv_idx[1]]
        if best_overall[0] < cur_best_score:
            best_overall = (cur_best_score, exp_i, best_indiv_idx[0],
                            best_indiv_idx[1])

        iterations_list.append(iterations)
        scores_list.append(scores)
        populations_list.append(populations)
        total_time_list.append(total_time)
        log_dir_list.append(log_dir)
        best_indiv_idx_list.append(best_indiv_idx)

        # save plot
        plot_tuple = ("pop:" + str(args.pop_num), "p_c:" +
                      str(args.prob_cross), "p_m:" + str(args.prob_mutation),
                      "data size:" + str(len(exp_labs)), args.adapt_function,
                      args.dist_measure)
        utils.plot_scores(iterations,
                          scores,
                          args.adapt_function,
                          plot_tuple,
                          to_file=True,
                          out_dir=log_dir)

    # visualize
    # =========
    if 1 < args.repeat:
        plot_tuple = ("pop:" + str(args.pop_num), "p_c:" +
                      str(args.prob_cross), "p_m:" + str(args.prob_mutation),
                      "data size:" + str(len(exp_labs)), args.adapt_function,
                      args.dist_measure)
        utils.plot_avg_scores(iterations_list,
                              scores_list,
                              args.adapt_function,
                              best_indiv_idx_list,
                              plot_tuple,
                              to_file=True,
                              out_dirs=log_dir_list)
Example #9

# --------------------------------
# Visualizing the data
# --------------------------------

if __name__ == '__main__':
    from utils import plot_clusters

    blobs_data, blobs_clusters = blobs(600, n_blobs=4, surplus=500)
    moons_data, moons_clusters = two_moons(600)
    point_circle_data, point_circle_clusters = point_and_circle(600)
    worst_blobs_data, worst_blobs_clusters = worst_case_blob(600, 5.0)

    print(blobs_data.shape)
    # print((blobs_clusters == 0).sum())
    # print((blobs_clusters == 1).sum())
    # print((blobs_clusters == 2).sum())
    # print((blobs_clusters == 3).sum())

    plot_clusters(blobs_data, blobs_clusters, 'blobs', show=True)
    plot_clusters(moons_data, moons_clusters, 'moons', show=False)
    plot_clusters(point_circle_data,
                  point_circle_clusters,
                  'point and circle',
                  show=False)
    plot_clusters(worst_blobs_data,
                  worst_blobs_clusters,
                  'worst case blob',
                  show=True)
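For reference, a minimal plot_clusters compatible with the (data, clusters, title, show) call pattern used here; the actual utils implementation is not shown, so this body is an assumption:

import matplotlib.pyplot as plt

def plot_clusters(data, clusters, title, show=True):
    # 2-D scatter colored by integer cluster label.
    plt.figure()
    plt.scatter(data[:, 0], data[:, 1], c=clusters, s=10, cmap='tab10')
    plt.title(title)
    if show:
        plt.show()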
Example #10
import numpy as np
from sklearn import metrics
from sklearn.cross_validation import StratifiedKFold  # pre-0.18 scikit-learn API, matching the call below
from sklearn.decomposition import KernelPCA
from utils import (plot_feature_corr, plot_pca, plot_clusters,
                   plot_feature_importance, pr_curve,
                   print_classfication_report, read_data, write_data,
                   score_model)

batch_size = 128
epochs = 50

X_train, y_train, X_test = read_data()

# Feature Diagnostic

plot_feature_corr(X_train)
plot_feature_corr(np.vstack(X_test), stem='test')
plot_pca(X_train, y_train)
plot_clusters(X_train, y_train)
indices_ci = plot_feature_importance(X_train, y_train)

skf = StratifiedKFold(y_train, n_folds=4)
train_index, dev_index = next(iter(skf))

X_dev = X_train[dev_index]
y_dev = y_train[dev_index]

X_train = X_train[train_index]
y_train = y_train[train_index]

# Since GMM works well, transforming the data to alternate space
kpca = KernelPCA(kernel="rbf", fit_inverse_transform=True)
kp = kpca.fit(X_train)
X_train = kp.transform(X_train)
Example #11
def main():
    args = parse_arguments(sys.argv[1:])

    # read params
    # ===========
    # possible params:
    # iter_num, pop_num, centers_num, prob_cross, prob_mutation, data shape, labs shape,
    # adapt_function, dist_measure, log_dir, best score, best score (index), total_time

    exp_params = {}
    text_file = [f for f in os.listdir(args.path) if f.endswith(".txt")][0]
    with open(os.path.join(args.path, text_file), "r") as text_f:
        for line in text_f:
            line = line.replace("\t", "").strip().split(":")
            if len(line) == 2 and line[0] != "" and line[1] != "":
                if line[0] in ("iter_num", "pop_num", "centers_num"):
                    exp_params[line[0].replace(" ", "_")] = int(line[1])
                elif line[0] in ("prob_cross", "prob_mutation", "best score"):
                    exp_params[line[0].replace(" ", "_")] = float(line[1])
                elif line[0] in ("data shape", "labs shape"):
                    exp_params[line[0].replace(" ", "_")] = make_tuple(line[1])
                elif line[0] == "best score (index)":
                    # e.g. "best score (index): generation 95, individual 99"
                    line[1] = line[1].strip().split(",")
                    exp_params["best_index"] = (
                        int(line[1][0].strip().split(" ")[1]),
                        int(line[1][1].strip().split(" ")[1]))
                else:
                    exp_params[line[0].replace(" ", "_")] = line[1]

    print("\nexperiment parameters were:")
    for k, v in exp_params.items():
        print("{:20}: {}".format(k, v))

    # read results
    # ============

    generations = np.load(os.path.join(args.path, "generations.npy"))
    iterations = np.load(os.path.join(args.path, "iterations.npy"))
    scores = np.load(os.path.join(args.path, "scores.npy"))

    best_centers = generations[exp_params["best_index"][0],
                               exp_params["best_index"][1]]

    print("\nobtained results are:")
    print("generations (total num, pop size, centrs num, feats num): {}".format(generations.shape))
    print("iterations (iterations num, ):                            {}".format(iterations.shape))
    print("scores (total num, pop size):                             {}".format(scores.shape))
    print("generations total num, iterations num and scores total num must be equal!")
    print("generations pop size and scores pop size must be equal too!")

    plot_tuple = ("pop:" + str(exp_params["pop_num"]),
                  "p_c:" + str(exp_params["prob_cross"]),
                  "p_m:" + str(exp_params["prob_mutation"]),
                  "data size:" + str(len(exp_params["data_shape"])),
                  exp_params["adapt_function"], exp_params["dist_measure"],
                  "best score:" + str(exp_params["best_score"])[:9] + " at " +
                  str(exp_params["best_index"]))
    utils.plot_scores(iterations,
                      scores,
                      exp_params["adapt_function"],
                      plot_tuple,
                      not args.nooutput,
                      out_dir=args.outdir)

    # read data
    # =========
    print("reading data...")
    hapt_data = data.HAPT()
    hapt_data.load_all_data()
    hapt_data.aggregate_groups()

    test_data = hapt_data.get_test_data()
    test_labs = hapt_data.get_test_labels()
    train_data = hapt_data.get_train_data()
    train_labs = hapt_data.get_train_labels()
    labs_map = hapt_data.get_labels_map()
    if exp_params["centers_num"] == 3:
        test_labs = hapt_data.get_aggregated_test_labels()
        train_labs = hapt_data.get_aggregated_train_labels()
        labs_map = hapt_data.get_aggregated_labels_map()
    centroids_num = len(labs_map)

    assert exp_params["centers_num"] == centroids_num

    # run clustering
    # ==============
    print("clustering...")
    labels_names = list(labs_map.values())
    # train data
    train_clust_labs = cluster.Centroids.cluster(
        train_data, best_centers, dist_func=exp_params["dist_measure"])
    train_clust_labs = cluster.Utils.adjust_labels(train_clust_labs,
                                                   train_labs)
    train_silh = cluster.Evaluate.silhouette(train_data, train_clust_labs,
                                             exp_params["dist_measure"])
    train_silh_normalized = (train_silh + 1) / 2
    train_info_gain = cluster.Evaluate.information_gain(
        train_labs, train_clust_labs)
    mapped_train_clust_labs = [labs_map[l] for l in train_clust_labs]
    mapped_train_labs = [labs_map[l] for l in train_labs]
    train_conf_mtx = confusion_matrix(mapped_train_labs,
                                      mapped_train_clust_labs,
                                      labels=labels_names)
    print("train set\tsilh: {:.6}, silh normalized: {:.6}, info gain: {:.6}".
          format(train_silh, train_silh_normalized, train_info_gain))
    # test data
    test_clust_labs = cluster.Centroids.cluster(
        test_data, best_centers, dist_func=exp_params["dist_measure"])
    test_clust_labs = cluster.Utils.adjust_labels(test_clust_labs, test_labs)
    test_silh = cluster.Evaluate.silhouette(test_data, test_clust_labs,
                                            exp_params["dist_measure"])
    test_silh_normalized = (test_silh + 1) / 2
    test_info_gain = cluster.Evaluate.information_gain(test_labs,
                                                       test_clust_labs)
    mapped_test_clust_labs = [labs_map[l] for l in test_clust_labs]
    mapped_test_labs = [labs_map[l] for l in test_labs]
    test_conf_mtx = confusion_matrix(mapped_test_labs,
                                     mapped_test_clust_labs,
                                     labels=labels_names)
    print("test set\tsilh: {:.6}, silh normalized: {:.6}, info gain: {:.6}".
          format(test_silh, test_silh_normalized, test_info_gain))

    # Show data
    # =========
    print("creating plots...")
    # clusters
    utils.plot_clusters(train_data,
                        train_labs,
                        labs_map,
                        True,
                        out_dir=args.outdir,
                        filename="train_orig_clusters")
    utils.plot_clusters(train_data,
                        train_clust_labs,
                        labs_map,
                        True,
                        out_dir=args.outdir,
                        filename="train_obtained_clusters")
    utils.plot_clusters(test_data,
                        test_labs,
                        labs_map,
                        True,
                        out_dir=args.outdir,
                        filename="test_orig_clusters")
    utils.plot_clusters(test_data,
                        test_clust_labs,
                        labs_map,
                        True,
                        out_dir=args.outdir,
                        filename="test_obtained_clusters")

    # confusion matrices
    utils.plot_confusion_matrix(
        train_conf_mtx,
        labels_names,
        normalize=False,
        title=
        'Confusion matrix\ntrain set\n(silh: {:.6}, silh normalized: {:.6}, info gain: {:.6})'
        .format(train_silh, train_silh_normalized, train_info_gain),
        cmap=plt.cm.Blues,
        out_dir=args.outdir,
        filename="train_conf_matr_silh_info_gain")
    utils.plot_confusion_matrix(
        test_conf_mtx,
        labels_names,
        normalize=False,
        title=
        'Confusion matrix\ntest set\n(silh: {:.6}, silh normalized: {:.6}, info gain: {:.6})'
        .format(test_silh, test_silh_normalized, test_info_gain),
        cmap=plt.cm.Blues,
        out_dir=args.outdir,
        filename="test_conf_matr_silh_info_gain")
    print("inference ended")