Example #1
    def _init_params(self, x):
        """Initialize GMM parameters by K-means.

        :param x: (n_samples, n_features) features.
        :param n_components: the number of components.
        :return: Initialized GMM parameters:
            pi: (n_components,) mixing coefficients
            mean: (n_components, n_features) means
            cov: (n_components, n_features, n_features) covariances
        """
        n_samples, n_features = x.shape

        k_means = KMeans(self.n_components)
        assigned_indices = k_means.fit_predict(x)
        mean_init = k_means.centers

        pi_init = np.zeros(self.n_components)
        cov_init = np.zeros((self.n_components, n_features, n_features))
        for k in range(self.n_components):
            cond = assigned_indices == k
            d_k = x[cond] - mean_init[k]
            pi_init[k] = np.sum(cond) / n_samples
            cov_init[k] = np.dot(d_k.T, d_k) / np.sum(cond)

        return pi_init, mean_init, cov_init
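These K-means-derived parameters are typically used to seed an EM loop. Below is a minimal sketch of the matching E-step, assuming scipy is available and that pi, mean, and cov are the arrays returned above:

import numpy as np
from scipy.stats import multivariate_normal

def e_step(x, pi, mean, cov):
    # Weighted component densities, shape (n_samples, n_components).
    weighted = np.stack([
        pi[k] * multivariate_normal(mean[k], cov[k]).pdf(x)
        for k in range(len(pi))
    ], axis=1)
    # Normalize each row so it sums to one, giving the responsibilities.
    return weighted / weighted.sum(axis=1, keepdims=True)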
Example #2
def post_kmeantrain(array: str, featurename: str, orderfeature: str):
    data = pd.read_json(array)
    columnnames = featurename.split(',')
    # columnnames = ['DFA', 'violmax', 'maxpeaksqt']
    num_examples = data.shape[0]
    # Get features.
    x_train = data[columnnames].values.reshape(
        (num_examples, len(columnnames)))
    # print(x_train)
    # Set K-Means parameters.
    num_clusters = 4  # Number of clusters into which we want to split our training dataset.
    max_iterations = 50  # maximum number of training iterations.

    # Init K-Means instance.
    k_means = KMeans(x_train, num_clusters)
    # Train K-Means instance.
    (centroids, closest_centroids_ids) = k_means.train(max_iterations)
    # print(centroids)
    data_frame = pd.DataFrame(centroids, columns=columnnames)
    # Sort centroids by the requested feature, then label clusters 'a', 'b', 'c', ...
    dfsort = data_frame.sort_values(by=[orderfeature])
    labels = [chr(i) for i in range(97, 97 + len(centroids))]
    dfsort['L'] = pd.Series(labels, index=dfsort.index)
    dfreturn = dfsort.set_index('L', drop=True)
    # print(dfreturn.to_json(orient="index"))
    return dfreturn.to_json(orient="index")
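A hypothetical round-trip call for the function above; the KMeans class it relies on is the project's own (constructed as KMeans(x_train, num_clusters)), and the records below are made up:

import json

rows = [{"DFA": 0.9 + 0.1 * i, "violmax": 1.2 * i, "maxpeaksqt": 3.4 - 0.2 * i}
        for i in range(8)]  # made-up data; with num_clusters = 4 we need at least 4 rows
print(post_kmeantrain(json.dumps(rows), "DFA,violmax,maxpeaksqt", "DFA"))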
Example #3
    def __init__(self):
        self.kmeans = KMeans()
        super(ClusteringGui, self).__init__()
        uic.loadUi(main_interface_file, self)

        self.browse_btn = self.findChild(QPushButton, 'browse_button')
        self.browse_btn.clicked.connect(self.on_browse_click)
        self.k_selector = self.findChild(QSpinBox, 'k_val_selector')
        self.k_selector.valueChanged.connect(self.on_update_k)
        self.repetitions_selector = self.findChild(QSpinBox, 'k_repetitions_selector')
        self.repetitions_selector.valueChanged.connect(self.on_set_repetitions)
        self.run_btn = self.findChild(QPushButton, 'run_button')
        self.run_btn.clicked.connect(self.on_run_click)
        self.run_btn.setEnabled(False)
        self.step_btn = self.findChild(QPushButton, 'step_button')
        self.step_btn.clicked.connect(self.on_step_click)
        self.step_btn.setEnabled(False)
        self.elbow_btn = self.findChild(QPushButton, 'elbow_chart_button')
        self.elbow_btn.clicked.connect(self.on_show_elbow)
        self.elbow_btn.setEnabled(False)
        self.layout = self.findChild(QVBoxLayout, 'layout')
        self.dimensions_label = self.findChild(QLabel, 'dimensions_label')
        self.dimensions_label.setText("")

        self.show()
Example #4
def second_cluster_k_means(_rows, _comments, _follows, _times):
    tf = TfIDf(_rows, _comments, _follows, _times)
    tf_idf_dict = tf.tf_idf()
    tf_number = tf.get_total_keywords()
    print(sorted(tf_number.items(), key=lambda d: d[1], reverse=True))
    vsm_file_name = 'second_vsm'
    vsm = BuildVsm(_rows, tf_idf_dict)
    scores = vsm.build_vsm(vsm_file_name)
    vsm_file_path = 'vsm集合\\{}\\{}.txt'.format(vsm_file_name, vsm_file_name)

    k_cluster = K_Means(_rows, _comments, _follows, _times)
    data_set = numpy.mat(load_data_set(vsm_file_path))
    cluster_centroids, cluster_assment = k_cluster.k_means(data_set, 2)

    # The first column of the assignment matrix holds each document's cluster label.
    labels = cluster_assment[:, 0]
    labels = [int(i[0]) for i in labels.tolist()]
    classify_file1(labels, 'second_vsm结果', _rows, _follows, _comments, _times,
                   scores)

    # Cluster again with sklearn's KMeans for comparison.
    data_set = numpy.mat(load_data_set(vsm_file_path))
    cluster = KMeans(init='k-means++', n_clusters=2)
    matrix = cluster.fit_predict(data_set)
    print(matrix)
    labels = list(matrix)
    classify_file1(labels, 'second_vsm结果1', _rows, _follows, _comments, _times,
                   scores)
Example #5
def main():
    iris_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/' \
               'iris/iris.data'

    x_col_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
    y_col_name = 'label'
    iris_df = pd.read_csv(iris_url, names=x_col_names + [y_col_name])

    x_data = np.array(iris_df[x_col_names])

    # perform k-means clustering
    k_means = KMeans(n_centers=3, init='k-means++',
                     random_state=np.random.RandomState(0))
    y_pred = k_means.fit_predict(x_data)
    centers = k_means.centers

    # plot
    plot_colors = ['r', 'g', 'b']
    for ci in range(k_means.n_centers):
        plt.scatter(x_data[y_pred == ci, 0], x_data[y_pred == ci, 1],
                    c=plot_colors[ci])

    plt.scatter(centers[:, 0], centers[:, 1], c='y', label='centers')

    plt.title('k-means example on the iris dataset')
    plt.xlabel(x_col_names[0])
    plt.ylabel(x_col_names[1])
    plt.legend()
    plt.show()
Example #7
def squared_clustering_errors(inputs, k):
    clusterer = KMeans(k)
    clusterer.train(inputs)
    means = clusterer.means
    assignments = map(clusterer.classify, inputs)

    return sum(
        squared_distance(input, means[cluster])
        for input, cluster in zip(inputs, assignments))
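A typical use of this helper is an elbow plot over several values of k. A sketch, assuming inputs is already a list of points and matplotlib is available:

import matplotlib.pyplot as plt

ks = list(range(1, 10))  # hypothetical range of cluster counts
errors = [squared_clustering_errors(inputs, k) for k in ks]

plt.plot(ks, errors)
plt.xticks(ks)
plt.xlabel('k')
plt.ylabel('total squared error')
plt.show()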
Example #8
def main():
    """ Main function. """
    # parse args
    parser = argparse.ArgumentParser()
    parser.add_argument('data', help='hw4_nolabel_train.dat')
    parser.add_argument('-t',
                        '--trial',
                        type=int,
                        default=500,
                        help='number of trials (default = 500)')
    parser.add_argument(
        '-o',
        '--output_to_png',
        default=False,
        action='store_true',
        help='Output image to files. (default is display on screen)')
    args = parser.parse_args()

    # get data
    data = get_data(args.data)

    # fit
    k_list = [2, 4, 6, 8, 10]
    avg_list = []
    var_list = []
    for k in k_list:
        err_list = []
        k_means = KMeans(k)
        for _ in range(args.trial):
            k_means.fit(data)
            err_list.append(k_means.calc_err())
        err_list = np.array(err_list)
        avg_list.append(err_list.mean())
        var_list.append(err_list.var())

    # plot
    plt.scatter(k_list, avg_list)
    plt.title('Average of $E_{in}$ vs. $k$')
    plt.xlabel('$k$')
    plt.ylabel('Average of $E_{in}$')
    if args.output_to_png:
        plt.savefig('q_15')
    else:
        plt.show()
    plt.clf()

    # plot
    plt.scatter(k_list, var_list)
    plt.title('Variance of $E_{in}$ vs. $k$')
    plt.xlabel('$k$')
    plt.ylabel('Variance of $E_{in}$')
    if args.output_to_png:
        plt.savefig('q_16')
    else:
        plt.show()
    plt.clf()
Example #9
def main():
    path = 'dog.jpeg'
    A = imread(path)
    A = A.astype(float) / 255.
    img_size = A.shape
    X = A.reshape(img_size[0] * img_size[1], img_size[2])

    for k in [2, 4, 8, 16]:
        algorithm = KMeans(k=k, picture=X)
        algorithm.run_k_means(max_iterations=10)
Example #10
def cluster_paragraphs(paragraphs, num_clusters=2):
    word_lists = make_word_lists(paragraphs)
    word_set = make_word_set(word_lists)
    word_vectors = make_word_vectors(word_set, word_lists)

    paragraph_map = dict(zip(map(str, word_vectors), paragraphs))

    k_means = KMeans(num_clusters, word_vectors)
    k_means.main_loop()
    return translator(k_means.clusters, paragraph_map)
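A hypothetical call, assuming the helpers above (make_word_lists, make_word_set, make_word_vectors, translator) and the project's KMeans are importable, and that translator returns one group of paragraphs per cluster; the paragraphs are made up:

paragraphs = [
    "Cats purr and chase mice around the house.",
    "Dogs bark at strangers and fetch balls.",
    "Kittens and cats nap in warm sunlight.",
    "Puppies and dogs love long walks outside.",
]
for i, cluster in enumerate(cluster_paragraphs(paragraphs, num_clusters=2)):
    print("Cluster %d: %s" % (i + 1, cluster))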
Example #11
    def _initialize_params(self, data):
        km = KMeans(self.k)
        km.fit(data)
        self.dim = data.shape[-1]
        _, self.means = km.predict(data)
        self.means = np.unique(self.means, axis=0)
        self.pis = np.random.uniform(0, 1, (self.k, ))
        self.pis = self.pis / np.sum(self.pis)
        self.covariances = np.array([np.eye(self.dim)] * self.k) * 1e8
        self.gammas = np.zeros((data.shape[0], self.k))
Example #13
def main():
    # prepare sample data
    centers = 3
    X, _ = make_blobs(
        n_samples=150,
        n_features=2,
        centers=centers,
        cluster_std=0.5,
        shuffle=True,
        random_state=0)

    # fit clusterings
    clusterings = [
        KMeans(
            n_clusters=3,
            init='random',
            n_init=10,
            max_iter=300,
            tol=1.0e-4,
            random_state=1),
        KMeans(
            n_clusters=3,
            init='k-means++',
            n_init=1,
            max_iter=300,
            tol=1.0e-4,
            random_state=1)
    ]
    names = ['k-means', 'k-means++']

    for clustering, name in zip(clusterings, names):
        # predict the centroid of each cluster and the label of each data point
        y_pred = clustering.fit_predict(X)
        # plot predicted labels
        for i in range(centers):
            Xi = X[y_pred == i]
            plt.scatter(
                Xi[:, 0], Xi[:, 1],
                marker='o', edgecolor='black', label='cluster {0}'.format(i + 1))
        # plot centroids
        plt.scatter(
            clustering.cluster_centers_[:, 0], clustering.cluster_centers_[:, 1],
            marker='*', edgecolor='black', label='centroids')
        # set plot area
        plt.grid()
        plt.legend()
        plt.title(name)
        plt.tight_layout()
        plt.show()

        # show attributes
        print('inertia: {0}'.format(clustering.inertia_))
        print('iterations: {0}'.format(clustering.n_iter_))
Example #14
def kmeans_segment(img,
                   n_clusters=DEFAULT_N_CLUSTERS,
                   max_iter=K_MEANS_DEFAULT_MAX_ITER,
                   include_spatial=False,
                   visualize=False):
    n = img.shape[0]
    m = img.shape[1]

    if include_spatial:
        xx = np.arange(n)
        yy = np.arange(m)
        X, Y = np.meshgrid(yy, xx)
        img = np.concatenate((Y.reshape(n, m, 1), X.reshape(n, m, 1), img),
                             axis=2)
        print("kmeans_segment(:include_spatial) img.shape = {}".format(
            img.shape))

    # img.shape[-1] is the per-pixel feature count: 5 when include_spatial=True
    # (row, col, R, G, B), otherwise just the 3 color channels (R, G, B)
    img = img.reshape(-1, img.shape[-1])  # 2D array (n*m, features_count)

    segmented_image = KMeans(n_clusters, max_iter).fit(img).reshape(n, m)

    if visualize:
        plt.figure(figsize=(12, 12))
        plt.axis('off')
        plt.imshow(segmented_image)

    return segmented_image
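A hypothetical invocation, assuming matplotlib is available, that DEFAULT_N_CLUSTERS and K_MEANS_DEFAULT_MAX_ITER are defined elsewhere in the module, and that the custom KMeans.fit returns one label per pixel:

import matplotlib.pyplot as plt

img = plt.imread('example.png')[:, :, :3]  # made-up file name; drop any alpha channel
labels = kmeans_segment(img, n_clusters=4, max_iter=20,
                        include_spatial=True, visualize=True)
plt.show()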
Example #15
def main():
    k = 3
    data = [
        1.5, 9.5, 5.4, 1.6, 5.5, 9.3, 1.7, 9.1, 1.3, 2.0, 5.0, 7.0, 7.7, 8.0
    ]
    data = list(zip(data, data))  # materialize: pair each value with itself to form 2-D points
    KMeans(k, data).cluster()
Example #16
    def run_test(self):
        test = KMeans(self.k, self.num_of_iters)
        total_sse = []
        total_sum = 0
        for seed in range(self.max_seeds):
            test.run(self.points, seed)
            sse = test.compute_sse()
            total_sse.append(sse)
            total_sum += sse
        minimal_sse = min(total_sse)
        mean_sse = total_sum / self.max_seeds  # average over all seeds, not a hardcoded 10
        maximal_sse = max(total_sse)
        return [
            f"0-{self.max_seeds - 1}", self.k, self.num_of_iters, minimal_sse,
            mean_sse, maximal_sse
        ]
Example #17
def main():
    image = img.imread("sample_img1.png")
    colors = np.zeros((image.shape[0] * image.shape[1], image.shape[2]))
    for i in range(image.shape[0]):
        for j in range(image.shape[1]):
            colors[i * image.shape[1] + j] = image[i][j]
    termination_condition_threshold = 0.01
    k_array = [2, 4, 8, 16, 32, 64]
    for k in k_array:
        k_means = KMeans(colors, k)
        k_means.start(termination_condition_threshold)
        output = np.zeros((image.shape[0], image.shape[1], image.shape[2]))
        for i in range(image.shape[0]):
            for j in range(image.shape[1]):
                output[i][j] = k_means.clusters.get_center(
                    k_means.data.memberships[i * image.shape[1] + j])
        img.imsave(f"output{k}.png", output)
Example #18
def perform_SFS_feature_selection(model, model_type, num_of_classes, data_set):
    # Indices of the features that have not been chosen yet
    feature_set = list(range(data_set.shape[1]))
    chosen_features = []
    chosen_clusters = []
    base_performance = float("-inf")
    # while there are still features to choose from...
    while len(feature_set) > 0:
        # initialize performance metrics
        best_performance = float("-inf")
        best_clusters = []
        #print "best performance = %f" % best_performance
        # Pick a feature that hasn't been chosen yet and train the model
        for feature in feature_set:
            chosen_features.append(feature)
            # Train model
            if model_type == "Kmeans":
                model = KMeans(num_of_classes)
            elif model_type == "HAC":
                model = HAC(num_of_classes)
            #print "Modeling with %s" % chosen_features
            clusters = model.cluster(data_set)
            # Calculate performance via LDA-like objective function
            current_performance = model.calculate_performance()
            #print "model performance = %f" % current_performance
            # if this combo of features beats the best performance so far
            # take note...
            if current_performance > best_performance:
                best_performance = current_performance
                best_feature = feature
                best_clusters = clusters
                #print "best performance updated to %f" % best_performance
            chosen_features.remove(feature)
        # If best noted performance beats the best performance we've seen
        # so far, add to chosen features
        if best_performance > base_performance:
            base_performance = best_performance
            feature_set.remove(best_feature)
            chosen_features.append(best_feature)
            chosen_clusters = best_clusters
            #print "base performance = %f" % base_performance
        else:
            #print "best performance = %f" % base_performance
            break
    return chosen_features, chosen_clusters
Example #19
def cluster_paragraphs(paragraphs):
    word_lists = make_word_lists(paragraphs)  # 2-D list: one word list per paragraph
    word_lists1 = []
    for i in range(len(word_lists)):
        str1 = " ".join(word_lists[i])
        word_lists1.append(str1)
    # print("word_lists1:", word_lists1)
    word_set = make_word_set(word_lists)  # the set of all words
    vec_df = tfidf(word_lists1)
    word_vectors = make_word_vectors(word_set, word_lists)  # one fixed-length vector per document
    # print("word_vectors:", word_vectors)

    paragraph_map = dict(zip(map(str, word_vectors), paragraphs))

    optimum_k = find_optimum_k(vec_df)
    k_means = KMeans(optimum_k, word_vectors)
    k_means.main_loop()
    return translator(k_means.clusters, paragraph_map)
Example #20
def run_kmeans():
    list_seed = [1, 1, 1, 12, 12, 12]
    list_k = [3, 4, 5, 3, 4, 5]

    print("seed k sl1")

    for index, value_seed in enumerate(list_seed):
        k = list_k[index]
        num_iterations = 10
        input_path = "colors_dataset_ready.txt"
        random_seed = value_seed

        if k <= 1 or num_iterations <= 0:
            print('Please provide correct parameters')
            exit(1)
        if not os.path.exists(input_path):
            print('Input file does not exist')
            exit(1)

        points = load_data(input_path)
        if k >= len(points):
            print('Please set K less than size of dataset')
            exit(1)

        runner = KMeans(k, num_iterations)
        runner.run(points, random_seed)
        print(list_seed[index], end=" ")
        print(list_k[index], end=" ")
        runner.print_results()
Example #21
def run_kmeans():
    # print(len(argv))
    if len(argv) < 4:
        print(
            'Not enough arguments provided. Please provide 3 arguments: K, num_iterations, path_to_input'
        )
        exit(1)
    k = int(argv[1])
    num_iterations = int(argv[2])
    input_path = argv[3]
    if len(argv) == 5:
        random_seed = int(argv[4])
    else:
        random_seed = 0

    if k <= 1 or num_iterations <= 0:
        print('Please provide correct parameters')
        exit(1)
    if not os.path.exists(input_path):
        print('Input file does not exist')
        exit(1)

    points = load_data(input_path)
    if k >= len(points):
        print('Please set K less than size of dataset')
        exit(1)

    runner = KMeans(k, num_iterations)
    runner.run(points, random_seed)
    runner.print_results()
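A quick smoke test for this CLI entry point, patching the argument list in place rather than invoking a shell (in-place mutation works even if the module did from sys import argv); the file name and values are hypothetical:

import sys

sys.argv[:] = ['run_kmeans.py', '3', '10', 'colors_dataset_ready.txt', '12']
run_kmeans()  # K = 3, 10 iterations, random seed 12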
Example #22
def main(input_filepath, output_folder, k):
    """
    Receives the location of the tf-idf scores as a
    command-line Path argument.
    """
    logger = logging.getLogger(__name__)
    logger.info(
        'Training the K-Means clustering algorithm based on the TF-IDF scores')

    # Get the models/tf-idf-scores.csv file
    dataset = pd.read_csv(input_filepath)
    logger.info('Loaded data file ' + input_filepath + ' with ' +
                str(len(dataset)) + ' rows')

    # Drop the first column and take the remaining values as a 2-D array
    x = dataset.drop(dataset.columns[0], axis=1).values
    vector_dict = generate_vector_dict(dataset)

    # Number of clusters and max. number of iterations
    km = KMeans(k=k, max_iterations=500)
    km.fit(x)
    clusters = km.get_clusters(vector_dict)

    # Based on the value of K used, change the destination filename
    filepath_list = (output_folder + MODEL_REPORT_FILENAME).rsplit('.', 1)
    output_filepath = filepath_list[0] + '-' + str(k) + '.' + filepath_list[1]

    # Calculate SSE and MSC
    sse_score = km.get_sse_score()
    logger.info('SSE Score: ' + str(sse_score))
    msc_score = km.get_msc_avg()
    logger.info('MSC Score: ' + str(msc_score))

    # Generate the results report
    generate_report(clusters, sse_score, msc_score, output_filepath)
    logger.info('Created report file on ' + output_filepath)

    # Generate / Update the results table for future plots
    if os.path.isfile(output_folder + PLOT_TABLE_FILENAME):
        # Update the existing file
        dataset = pd.read_csv(output_folder + PLOT_TABLE_FILENAME)
        dataset.set_index('K Size', inplace=True)
        k_means_results = update_plot_results_table(dataset,
                                                    (k, sse_score, msc_score))
    else:
        # Create and update the file
        dataset = create_plot_results_table()
        k_means_results = update_plot_results_table(dataset,
                                                    (k, sse_score, msc_score))
    k_means_results.to_csv(output_folder + PLOT_TABLE_FILENAME,
                           encoding='utf-8')
    logger.info('Updated report table on ' + output_folder +
                PLOT_TABLE_FILENAME)
Example #23
def main():
    data = load_data()
    results = []
    np.random.seed(10)

    # pca_data = pca.pca(data, 2)[0]    #pca from scratch
    # pca_data = pca.pca_s(data, 2)     #pca from sk_learn library

    # code for simple run where k=2
    # k=2
    # random_centroids = np.random.randint(0, 128, k)
    # km = KMeans(k)
    # km.fit(data, random_centroids)

    for k in range(2, 11):
        random_centroids = np.random.randint(0, 128, k)
        km = KMeans(k)
        results.append(km.fit(data, random_centroids))  # raw data
        # results.append(km.fit(pca_data, random_centroids))  # use this line instead for the PCA-reduced data
    plt.plot(list(range(2, 11)), results)  # k on the x-axis, total error on the y-axis
    # plt.show()
    plt.savefig('k_means.png')
Example #24
    def __init__(self, input_file, n_bkts, vocab):
        sents = []
        sent = []
        with open(input_file) as f:
            for line in f.readlines():
                info = line.strip().split()
                if info:
                    assert (len(info) == 11), 'Illegal line: %s' % line
                    word = vocab.word2id(info[1].lower())
                    lemma = vocab.lemma2id(info[2].lower())
                    tag = vocab.tag2id(info[4])
                    head, rel = int(info[6]), vocab.rel2id(info[7])
                    syn_mask = int(info[10])
                    sent.append([word, lemma, tag, head, rel, syn_mask])
                else:
                    sents.append(sent)
                    sent = []

        len_counter = Counter()
        for sent in sents:
            len_counter[len(sent)] += 1
        self._bucket_sizes = KMeans(n_bkts, len_counter).splits
        self._buckets = [[] for i in range(n_bkts)]
        self._buckets_lens = [[] for i in range(n_bkts)]
        len2bkt = {}
        prev_size = -1
        for bkt_idx, size in enumerate(self._bucket_sizes):
            len2bkt.update(
                zip(range(prev_size + 1, size + 1),
                    [bkt_idx] * (size - prev_size)))
            prev_size = size

        self._record = []
        for sent in sents:
            bkt_idx = len2bkt[len(sent)]
            self._buckets[bkt_idx].append(sent)
            self._buckets_lens[bkt_idx].append(len(sent))
            idx = len(self._buckets[bkt_idx]) - 1
            self._record.append((bkt_idx, idx))

        for bkt_idx, (bucket,
                      size) in enumerate(zip(self._buckets,
                                             self._bucket_sizes)):
            self._buckets[bkt_idx] = np.zeros((size, len(bucket), 6),
                                              dtype=np.int32)
            self._buckets_lens[bkt_idx] = np.array(self._buckets_lens[bkt_idx])
            for idx, sent in enumerate(bucket):
                self._buckets[bkt_idx][:len(sent),
                                       idx, :] = np.array(sent, dtype=np.int32)
Example #25
    def fit(self, csr):
        """Apply bisecting k-means"""

        # initialize k-means with k=2 for bisection
        kmeans = KMeans(k=2, pct_change=self.k_means_pct_change,
                        max_iter=self.k_means_max_iter)

        # initialize list of clusters with all points
        clusters = [range(0, csr.shape[0])]

        while len(clusters) < self.k:
            cluster = self.select_next_cluster(clusters)

            # bisect cluster iter times and select both clusters from split with lowest SSE
            lowest_sse = None
            best_split = None
            for i in range(self.n_iters):
                print('Bisecting run # %d/%d, iter # %d/%d' % (len(clusters) + 1,
                                                               self.k - 1, i + 1,
                                                               self.n_iters))

                # split cluster in two using k-means of 2
                bisection = kmeans.fit(csr, cluster)
                split = lambda data, l: [cluster[j] for j, d in enumerate(data) if d == l]
                x, y = split(bisection, 0), split(bisection, 1)

                # calculate total SSE of both clusters and keep the split if it is the lowest so far
                sse_total = self.sse(csr[x, :]) + self.sse(csr[y, :])
                # test for None first so the comparison never runs on an uninitialized value
                if lowest_sse is None or sse_total < lowest_sse:
                    lowest_sse = sse_total
                    best_split = (x, y)

            # add best cluster split to list
            clusters.extend(best_split)

        return self.label_clusters(csr, clusters)
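The bisection criterion above depends on an sse helper; the class presumably defines its own over CSR rows, but a minimal dense sketch of the same quantity is:

import numpy as np

def sse(points):
    # Sum of squared distances from each row to the cluster centroid.
    centroid = points.mean(axis=0)
    return float(((points - centroid) ** 2).sum())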
Example #26
    def __init__(self, input_file, n_bkts, vocab):
        sents = []
        sent = [[Vocab.ROOT, Vocab.ROOT, 0, Vocab.ROOT]]
        with open(input_file) as f:
            for line in f.readlines():
                info = line.strip().split()
                if info:
                    if info[0] == "#":
                        continue
                    assert (len(info) == 10), 'Illegal line: %s' % line
                    word, tag, head, rel = vocab.word2id(
                        info[1].lower()), vocab.tag2id(info[3]), int(
                            info[6]), vocab.rel2id(info[7])
                    sent.append([word, tag, head, rel])
                else:
                    sents.append(sent)
                    sent = [[Vocab.ROOT, Vocab.ROOT, 0, Vocab.ROOT]]

        len_counter = Counter()
        for sent in sents:
            len_counter[len(sent)] += 1
        print("start k-Mean bucketing")
        self._bucket_sizes = KMeans(n_bkts, len_counter).splits
        print("k-Mean finish")
        self._buckets = [[] for i in xrange(n_bkts)]
        len2bkt = {}
        prev_size = -1
        for bkt_idx, size in enumerate(self._bucket_sizes):
            len2bkt.update(
                zip(range(prev_size + 1, size + 1),
                    [bkt_idx] * (size - prev_size)))
            prev_size = size

        self._record = []
        for sent in sents:
            bkt_idx = len2bkt[len(sent)]
            self._buckets[bkt_idx].append(sent)
            idx = len(self._buckets[bkt_idx]) - 1
            self._record.append((bkt_idx, idx))

        for bkt_idx, (bucket,
                      size) in enumerate(zip(self._buckets,
                                             self._bucket_sizes)):
            self._buckets[bkt_idx] = np.zeros((size, len(bucket), 4),
                                              dtype=np.int32)
            for idx, sent in enumerate(bucket):
                self._buckets[bkt_idx][:len(sent),
                                       idx, :] = np.array(sent, dtype=np.int32)
Example #27
def post_kmeanprict(array: str, centermodel: str, featurename: str):
    testdata = pd.read_json(array, orient='index')
    centers = pd.read_json(centermodel, orient='index')
    columnnames = featurename.split(',')
    testnumber = testdata.shape[0]
    # Get features.
    test_train = testdata[columnnames].values.reshape(
        (testnumber, len(columnnames)))
    centeridmodel = centers[columnnames].values.reshape(
        (len(centers), len(columnnames)))
    closest_centroids_ids = KMeans.centroids_find_closest(
        test_train, centeridmodel)
    tag = []
    for i in closest_centroids_ids:
        tag.append(centers.index[int(i[0])])

    testdata['tag'] = pd.Series(tag, index=testdata.index)

    return testdata.to_json(orient="index")
Example #28
def evaluate_model(model, model_type, num_of_classes, candidate_feature_set, data_set):
    '''This method uses the given feature subset to cluster the given data and
    scores performance with an LDA-like objective function.'''
    # Convert the candidate_feature_set representation from
    # f_1, ..., f_d to the list of indices where f_i == 1
    # (for example, [1 0 0 1 0] -> [0 3])
    candidate_feature_set = \
        [idx for idx in range(len(candidate_feature_set)) if candidate_feature_set[idx] == 1]
    if model_type == "Kmeans":
        model = KMeans(num_of_classes)
    elif model_type == "HAC":
        model = HAC(num_of_classes)
    model.cluster(data_set[:, candidate_feature_set])
    return model.calculate_performance()
Example #29
# Datasets to test
tests = [('data_sets/original/glass_data.txt', 7),
         ('data_sets/original/iris_data.txt', 3),
         ('data_sets/original/spam_data.txt', 2)]

for test in tests:
    data_instances = []
    data_file = open(test[0])
    print("Running with %s" % test[0])
    for line in data_file:
        line_split = line.split(',')
        data_instances.append(list(map(float, line_split)))
    data_instances = np.array(data_instances)

    # Run SFS using k-means and HAC
    kmeans_model = KMeans(test[1])
    hac_model = HAC(test[1])

    # Glass dataset
    if "glass" in test[0]:
        kmeans_sfs_glass = np.array([1, 3])
        kmeans_model.cluster(data_instances[:, kmeans_sfs_glass])
        print("Kmeans SFS glass performance = %f" % kmeans_model.calculate_performance())

        kmeans_ga_glass = np.array([0, 1, 2, 3, 4, 5, 6])
        kmeans_model = KMeans(test[1])
        kmeans_model.cluster(data_instances[:, kmeans_ga_glass])
        print("Kmeans GA glass performance = %f" % kmeans_model.calculate_performance())

        hac_sfs_glass = np.array([0])
        hac_model.cluster(data_instances[:, hac_sfs_glass])
Example #30
import numpy as np
import matplotlib.pyplot as plt
from k_means import KMeans

k_means = KMeans(2)
X = np.loadtxt("realdata.txt")[:, 1:]
k_means.fit(X)
labels = k_means.labels_

plt.xlabel('Length')
plt.ylabel('Width')
handles = []
s1 = plt.scatter(X[labels == 0, 0],
                 X[labels == 0, 1],
                 color='r',
                 label="Cluter1",
                 marker='o')
handles.append(s1)
s2 = plt.scatter(X[labels == 1, 0],
                 X[labels == 1, 1],
                 color='k',
                 label="Cluter2",
                 marker='^')
handles.append(s2)

plt.legend(handles=handles)
plt.title('K-means')
plt.show()
Example #31
import numpy as np
from k_means import KMeans
from distance import euclidean
from mean import mean
import pickle

DATA_PATH = r'D:\datasets\mnist\large_dataset\mnist_train.csv'  # raw string keeps the backslashes literal

print('Loading Data')
data_list = []
with open(DATA_PATH, 'r') as f:
    for line in f.readlines():
        observation = np.asfarray(line.split(',')[1:])
        data_list.append(observation / 255)
print('Finished Loading')

print('Fitting Started')
model = KMeans()
clusters = model.fit(data_list, 10, euclidean, mean)
print('Fitting Finished')

print('Saving Clusters to ./clusters.pkl')
with open('./clusters.pkl', 'wb') as f:
    pickle.dump(clusters, f, protocol=pickle.HIGHEST_PROTOCOL)
Example #32
# GIS filters. This table should have central_area = [usable area in square meters].

# read in cell-level data
# table contains site_id, grid_id, i, j, central_area, net_pv_capacity.
with open('cell_central_pv_capacity_original.csv') as csvfile:
    data = list(csv.DictReader(csvfile))
    x, y, area = np.array(list((r["i"], r["j"], r["central_area"]) for r in data), dtype=float).T

# data = csv_to_dict('cell_central_pv_capacity_original.csv')
# i = np.array(data["i"], dtype=float)
# j = np.array(data["j"], dtype=float)
# area = np.array(data["central_area"], dtype=float)

# cluster the cells into 150 projects (somewhat arbitrarily) instead of ~750,
# and use the cluster numbers as new site_id's.
km = KMeans(150, np.c_[x, y], size=0.0001*area)
km.init_centers()
km.find_centers()
# km.plot()
for i in range(len(x)):
    # km.cluster_id is an array of cluster id's, same length as x and y
    data[i]["cluster_id"] = km.cluster_id[i]

# insert the modified data into the database
# note: it is reportedly faster to construct a single 
# insert query with all the values using python's string
# construction operators, since executemany runs numerous 
# separate inserts. However, it's considered more secure to use 
# the database library's template substitution, so we do that.
executemany("""
    INSERT INTO cell_central_pv_capacity
Example #33
class TestKMeans(unittest.TestCase):
    def setUp(self):
        self._kmeans = KMeans(2) # n_clusters

    def test_get_intial_centroids(self):
        data = np.array([[1, 1], [0, 0], [-1, -1], [2, 2]])
        data = self._kmeans._get_initial_centroids(data)
        # random seed chooses the same centroids
        exp_data = np.array([[0, 0], [2, 2]])
        self.assertTrue(np.array_equal(data, exp_data))

        cent1, cent2 = [cluster.centroid for cluster in self._kmeans.clusters]
        exp_cent1 = np.array([1, 1])
        exp_cent2 = np.array([-1, -1])

        self.assertTrue(np.array_equal(cent1, exp_cent1))
        self.assertTrue(np.array_equal(cent2, exp_cent2))

    def test_choose_cluster(self):
        self._kmeans.clusters.append(
            self._kmeans.Cluster(np.array([1, 1]), initial=True))
        self._kmeans.clusters.append(
            self._kmeans.Cluster(np.array([-1, -1]), initial=True))
        self._kmeans._choose_cluster(np.array([1, 0]))
        self._kmeans._choose_cluster(np.array([-1, -2]))
 
        data_points1 = self._kmeans.clusters[0].data_points
        exp1 = np.array([[1, 1], [1, 0]])
        data_points2 = self._kmeans.clusters[1].data_points
        exp2 = np.array([[-1, -1], [-1, -2]])

        self.assertTrue(np.array_equal(data_points1, exp1))
        self.assertTrue(np.array_equal(data_points2, exp2))

    def test_squared_euclidian_dist(self):
        x1, y1 = 0, 0                                                     # distance 0
        x2, y2 = np.array([1, 2, 3]), np.array([3, 2, 1])                 # distance 8
        x3, y3 = np.array([[1, 2], [1, 2]]), np.array([[1, 1], [1, 1]])   # distance 2

        exp1 = 0
        res1 = self._kmeans._squared_euclidian_dist(x1, y1)
        exp2 = 8
        res2 = self._kmeans._squared_euclidian_dist(x2, y2)
        exp3 = 2
        res3 = self._kmeans._squared_euclidian_dist(x3, y3)

        self.assertEqual(res1, exp1)
        self.assertEqual(res2, exp2)
        self.assertEqual(res3, exp3)

    def test_is_finish_success(self):
        self._kmeans.clusters.append(self._kmeans.Cluster(np.array([1, 1])))
        self._kmeans.clusters.append(self._kmeans.Cluster(np.array([-1, -1])))
        self._kmeans.clusters[0].data_points = np.array([[1, 1], [1, 0]])
        self._kmeans.clusters[1].data_points = np.array([[1, 1], [-1, -2]])
        self._kmeans.prev_clusters = deepcopy(self._kmeans.clusters)

        res = self._kmeans._is_finish()
        exp = 1
        self.assertEqual(res, exp)

    def test_is_finish_fail(self):
        self._kmeans.clusters.append(self._kmeans.Cluster(np.array([1, 1])))
        self._kmeans.clusters.append(self._kmeans.Cluster(np.array([-1, -1])))
        self._kmeans.clusters[0].data_points = np.array([[1, 1], [1, 0]])
        self._kmeans.clusters[1].data_points = np.array([[1, 1], [-1, -2]])
        self._kmeans.prev_clusters = deepcopy(self._kmeans.clusters)
        self._kmeans.clusters[0].data_points = np.array([[2, 1], [1, 0]])

        res = self._kmeans._is_finish()
        exp = 0
        self.assertEqual(res, exp)

    def test_update_centroids_and_data(self):
        self._kmeans.clusters.append(self._kmeans.Cluster(np.array([1, 1])))
        self._kmeans.clusters.append(self._kmeans.Cluster(np.array([-1, -1])))
        self._kmeans.clusters[0].data_points = np.array([[1, 1], [1, 0]])
        self._kmeans.clusters[1].data_points = np.array([[1, 1], [-1, -2]])
        self._kmeans.prev_clusters = deepcopy(self._kmeans.clusters)

        data = self._kmeans._update_centroids_and_data()
        res_centroid1 = self._kmeans.clusters[0].centroid
        res_centroid2 = self._kmeans.clusters[1].centroid
        exp_centroid1 = np.array([1., 0.5])
        exp_centroid2 = np.array([0., -0.5])
        exp_data = np.array([[1, 1], [1, 0], [1, 1], [-1, -2]])

        self.assertTrue(np.array_equal(res_centroid1, exp_centroid1))
        self.assertTrue(np.array_equal(res_centroid2, exp_centroid2))
        self.assertTrue(np.array_equal(data, exp_data))
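A minimal way to run this suite, assuming the test case lives in its own module next to the k-means implementation:

import unittest

if __name__ == '__main__':
    unittest.main()  # collects TestKMeans and runs every test_* method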
Example #34
# graphing imports!
import matplotlib.pyplot as plt
import matplotlib.colors as colors

# clustering
from csvReader import CSVReader
from k_means import KMeans

inputFile = "microarraydata.csv"
k = 4


csvReader = CSVReader()
microarrayData = csvReader.read(inputFile)
print(microarrayData)

kmeans = KMeans(verbose=True)
finalClusters = kmeans.kmeans(microarrayData, k)

print "\nFinal set of gene clusters:"
for clusterIdx, cluster in enumerate(finalClusters):
    print "\tCluster %d: %s" % (clusterIdx + 1, ["gene" + str(idx + 1) for gene, idx in cluster])
print ""
Example #35
#!/usr/bin/env python3

import sys
sys.path.append('code')

import numpy as np

from k_means import KMeans

# Pokemon height/weight
data = np.array([[0.4, 6.0],  # Pikachu
                 [0.7, 6.9],  # Bulbasaur
                 [0.6, 8.5],  # Charmander
                 [0.5, 9.0],  # Squirtle
                 [1.2, 36.0], # Slowpoke
                 [1.6, 78.5], # Slowbro
                 [1.1, 90.0], # Seel
                 [1.7, 120.0],# Dewgong
                 [2.2, 210.0],# Dragonite
                 [1.7, 55.4], # Articuno
                 [1.6, 52.6], # Zapdos
                 [2.0, 60.0]] # Moltres
                 )
if __name__ == "__main__":
    k_means = KMeans(2)
    k_means.train(data)
    k_means.report()
Example #36
    # weight = np.sqrt(traj_weight/traj_weight.max())
    # for i, traj in enumerate(oil_price_traj):
    #     # plot each row as a separate series, with appropriate width and alpha
    # #     plt.semilogy(periods, traj, 'k-', linewidth=5*traj_weight[i]/traj_weight.mean(), alpha=.1)
    #     plt.semilogy(periods, traj, 'k-', linewidth=10*weight[i], alpha=weight[i])
    # plt.show()

    # mu, cluster_id = scipy.cluster.vq.kmeans2(
    #     data=np.hstack([oil_prices.T, gas_prices.T]),
    #     k=125,
    #     minit='points'
    # )

    # get a better starting point than scipy kmeans usually provides
    km = KMeans(125, np.hstack([oil_prices.T, gas_prices.T]))
    km.init_centers()   # takes about 60 s for 100,000; roughly linear in #
    mu, cluster_id = scipy.cluster.vq.kmeans2(
        data=np.hstack([oil_prices.T, gas_prices.T]), 
        k=km.mu
    )

    for var in save_vars:
        f = os.path.join(pha_dir, var + '.npy')
        np.save(f, locals()[var])

# process the scenario data
oil_price_traj = mu[:,:len(periods)]    # first half of mu
gas_price_traj = mu[:,-len(periods):]   # second half of mu
traj_weight = np.bincount(cluster_id)/cluster_id.shape[0]
# print(traj_weight)
Example #37
from k_means import KMeans
import scipy.io as sio
import numpy as np
import matplotlib.pyplot as plt

# =================== K-Means Clustering ======================
data = sio.loadmat('data\\ex7data2.mat')

K = 3
num_iters = 10
X = data['X']
initial_centroids = np.matrix([[3,3],
                               [6,2],
                               [8,5]])

kmeans = KMeans(K, num_iters)

idx = kmeans.findClosestCentroids(X, initial_centroids)

kmeans.train_model(X, initial_centroids, True)

# ============= K-Means Clustering on Pixels ===============
data = sio.loadmat('data\\bird_small.mat')

A = data['A']

A = A / 255
m, n, _ = A.shape
X = A.reshape([-1, 3])

K = 16
Example #38
    print("The Final Selected Features are: (features are zero indexed) ")
    print("{}\n".format(selected_features))
    print("The Fisher Score for the clustering is: ")
    print("{}\n".format(best_features["evaluation"]))

    pp = pprint.PrettyPrinter(indent=2, width=400)
    print(
        "For Clustered points, the key in the dictionary represents the cluster each data point belongs to. "
    )
    print("Clustered points: ")
    pp.pprint(full_clusters)


# KMeans experiments
sys.stdout = open('results/GA-Kmeans-iris-results.txt', 'w')
run_ga_kmeans_experiment("data/iris.data.txt", 3, KMeans(3))

sys.stdout = open('results/GA-Kmeans-glass-results.txt', 'w')
run_ga_kmeans_experiment("data/glass.data.txt", 6, KMeans(6))

sys.stdout = open('results/GA-Kmeans-spambase-results.txt', 'w')
run_ga_kmeans_experiment("data/spambase.data.txt",
                         2,
                         KMeans(2),
                         fraction_of_data_used=100)

# HAC experiments
sys.stdout = open('results/GA-HAC-iris-results.txt', 'w')
run_hac_experiment("data/iris.data.txt", 3, HAC(3))

sys.stdout = open('results/GA-HAC-glass-results.txt', 'w')