Example #1
def plot_network(n_clusters, subset_job, subset_edu, no_jobs, no_edu):
    plt.figure(figsize=(6, 8))
    job_kmeans = KMeans(n_clusters=n_clusters)
    job_predict = job_kmeans.fit_predict(subset_job)
    empl_edu_kmean = KMeans(n_clusters=n_clusters)
    empl_predict = empl_edu_kmean.fit_predict(subset_edu)

    cluster_sum_jobs, cluster_sum_employ_edu = [], []

    for i in range(n_clusters):
        cluster_sum_employ_edu.append(
            sum_cluster(empl_predict, i, no_edu) / sum(no_edu))
        cluster_sum_jobs.append(
            sum_cluster(job_predict, i, no_jobs) / sum(no_jobs))

    jobs_centres = job_kmeans.cluster_centers_
    emp_edu_centres = empl_edu_kmean.cluster_centers_
    result, all_coords = min_span_tree(jobs_centres, emp_edu_centres,
                                       cluster_sum_jobs,
                                       cluster_sum_employ_edu)
    city_labels()
    plot_california_counties()
    plot_california()
    def node_xy(idx):
        # Nodes 0..n_clusters-1 are job centres; the rest are employment/education centres.
        if idx < n_clusters:
            return jobs_centres[idx][0], jobs_centres[idx][1]
        return emp_edu_centres[idx - n_clusters][0], emp_edu_centres[idx - n_clusters][1]

    for i in range(len(result)):
        for j in range(len(result[i])):
            if result[i][j] == 0:  # no link between these nodes
                continue

            xi, yi = node_xy(i)
            xj, yj = node_xy(j)
            plt.scatter(xi, yi, edgecolors='b', facecolors='none')
            plt.scatter(xj, yj, edgecolors='b', facecolors='none')
            plt.plot((xi, xj), (yi, yj), 'b-')

    plt.show()
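
This example leans on several project helpers that are never shown (`min_span_tree`, `city_labels`, `plot_california`, `plot_california_counties`, `sum_cluster`). Judging from its call sites, `sum_cluster(predict, i, values)` totals the values belonging to the points assigned to cluster i; a minimal sketch under that assumption:

import numpy as np

def sum_cluster(predict, cluster_idx, values):
    # Total the per-point values (job counts, education scores, ...) of the
    # points whose fit_predict label equals cluster_idx.
    predict = np.asarray(predict)
    values = np.asarray(values)
    return values[predict == cluster_idx].sum()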
Example #2
def main(argv=None):

    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    log.info('loading data')
    data = np.loadtxt(args.data_points)

    if args.root is not None:
        data = np.sqrt(data)

    (k, initial_points) = get_initial_centers(args.clusters, args.start_points)

    log.info('calculate center points')
    kmeans = KMeans(k, initial_points, 1, args.max_iter, copy_x=False)
    predict = kmeans.fit_predict(data)

    log.info('storing results')

    if args.model:
        save_object_to_file(kmeans, args.model)

    with utf8_file_open(args.outfile, 'w') as outfile:

        for i in xrange(predict.shape[0]):
            outfile.write(u'%d\n' % predict[i])

    if args.centroids:
        np.savetxt(args.centroids, kmeans.cluster_centers_)

    log.info('finished')
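
`get_initial_centers` isn't shown here. Judging from how its return values are passed to `KMeans` (positionally, as `n_clusters` and `init` in the older scikit-learn signature), it presumably either loads explicit start points from a file or falls back to k-means++ seeding; a hedged sketch:

import numpy as np

def get_initial_centers(clusters, start_points):
    # If a start-point file is given, use its rows as the initial centers
    # and derive k from their count; otherwise use k-means++ seeding.
    if start_points is not None:
        initial_points = np.atleast_2d(np.loadtxt(start_points))
        return initial_points.shape[0], initial_points
    return clusters, 'k-means++'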
Example #3
def test_KMeansConstrained_parity_digits():

    iris = datasets.load_iris()
    X = iris.data

    k = 8
    random_state = 1
    size_min, size_max = None, None  # no size restrictions, so this should match standard KMeans

    clf_constrained = KMeansConstrained(
        n_clusters=k,
        size_min=size_min,
        size_max=size_max,
        random_state=random_state
    )
    y_constrained = clf_constrained.fit_predict(X)

    clf_kmeans = KMeans(
        n_clusters=k,
        random_state=random_state
    )
    y_kmeans = clf_kmeans.fit_predict(X)

    assert_array_equal(y_constrained, y_kmeans)
    assert_almost_equal(clf_constrained.cluster_centers_, clf_kmeans.cluster_centers_)
    assert_almost_equal(clf_constrained.inertia_, clf_kmeans.inertia_)
Example #4
def main(argv=None):

    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    log.info('loading data')
    data = np.loadtxt(args.data_points)

    if args.root is not None:
        data = np.sqrt(data)

    (k, initial_points) = get_initial_centers(args.clusters, args.start_points)

    log.info('calculate center points')
    kmeans = KMeans(k, initial_points, 1, args.max_iter, copy_x=False)
    predict = kmeans.fit_predict(data)

    log.info('storing results')

    if args.model:
        save_object_to_file(kmeans, args.model)

    with utf8_file_open(args.outfile, 'w') as outfile:

        for i in range(predict.shape[0]):
            outfile.write('%d\n' % predict[i])

    if args.centroids:
        np.savetxt(args.centroids, kmeans.cluster_centers_)

    log.info('finished')
Example #5
def plot_employment_edu_cluster(n_clusters, no_edu, subset_edu, kmeans=None):
    if kmeans is None:
        kmeans = KMeans(n_clusters=n_clusters)
    empl_predict = kmeans.fit_predict(subset_edu)
    plot_california()
    plot_california_counties()
    for i in range(n_clusters):
        members = [j for j in range(len(subset_edu)) if empl_predict[j] == i]
        mean_employment_score = np.mean([no_edu[j] for j in members])
        plt.scatter([subset_edu[j][0] for j in members],
                    [subset_edu[j][1] for j in members],
                    label=f"Mean Employment Score: {mean_employment_score:.5f}",
                    s=4.5)
    plt.legend()
    plt.gca().set_xlabel("Longitude")
    plt.gca().set_ylabel("Latitude")
    plt.xlim((-120, -116))
    plt.ylim((33, 35))
    plt.axis('equal')
    plt.show()
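
`plot_california`, `plot_california_counties`, and `city_labels` are project-specific map helpers that never appear in these snippets. Purely as an illustration, the county backdrop could be drawn from a local shapefile (hypothetical path) with geopandas:

import geopandas as gpd
import matplotlib.pyplot as plt

def plot_california_counties(shapefile="data/CA_Counties.shp"):  # hypothetical path
    # Draw county boundaries onto the current axes as a light backdrop.
    gpd.read_file(shapefile).boundary.plot(ax=plt.gca(), linewidth=0.4, color='gray')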
Example #6
    def k_means_clustering(self, matrix):
        for k in range(3, 10):
            km = KMeans(n_clusters=k)
            self.cluster_number.append(
                [k, silhouette_score(matrix, km.fit_predict(matrix))])

        return self
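
Since the method returns `self`, the collected `[k, silhouette]` pairs can be read back to pick the best k; a small hypothetical follow-up, assuming `analyzer` is an instance of the surrounding class with `cluster_number` initialized to an empty list:

best_k, best_score = max(analyzer.k_means_clustering(matrix).cluster_number,
                         key=lambda pair: pair[1])
print(f"best k = {best_k} (silhouette = {best_score:.3f})")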
Example #7
def perform_cluster(data, params):
    km = KMeans()
    km.set_params(**params)
    vectorizer = TfidfVectorizer()
    print(data[1][0])
    tfidf = vectorizer.fit_transform(data[1])
    labels = km.fit_predict(tfidf)
    result = {i: [] for i in set(labels)}
    for i, l in enumerate(labels):
        result[l].append(data[0][i])
    return result
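
`perform_cluster` expects `data` as a pair of parallel sequences (identifiers in `data[0]`, raw texts in `data[1]`) and forwards `params` to `KMeans.set_params`; a minimal hypothetical call:

ids = ["d1", "d2", "d3", "d4"]
docs = ["cats purr", "dogs bark", "a cat sat", "the dog barked"]
clusters = perform_cluster((ids, docs), {"n_clusters": 2, "random_state": 0})
print(clusters)  # e.g. {0: ['d1', 'd3'], 1: ['d2', 'd4']}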
Example #8
def simulation(n, n_clusters, k_range, dim, runs=100):
    all_data = []
    k_low, k_hi = k_range
    for idx in range(runs):
        data, _ = make_blobs(n_samples=n,
                             n_features=dim,
                             centers=n_clusters,
                             cluster_std=0.1,
                             center_box=(-1.0, 1.0))

        for k in range(k_low, k_hi + 1):
            # Fit a model for each candidate k and record its silhouette score
            model = KMeans(n_clusters=k, random_state=0)
            labels = model.fit_predict(data)
            avg_score = silhouette_score(data, labels)
            all_data.append([n, n_clusters, k, dim, avg_score])

    df = pd.DataFrame(all_data,
                      columns=['n', 'n_clusters', 'k', 'dim', 'avg_score'])
    return df
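
One way to sanity-check the silhouette criterion with `simulation` is to average the score per candidate k and see whether the true `n_clusters` wins; a small hypothetical run:

df = simulation(n=300, n_clusters=4, k_range=(2, 8), dim=2, runs=10)
# For tight, well-separated blobs the true k should score highest.
print(df.groupby('k')['avg_score'].mean())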
Example #9
def bisection(max_k: int, data: np.ndarray) -> tree_node:
    current_k = 1
    data_centroid = np.mean(data, 0)
    root = tree_node(0, data_centroid)
    root_sse = sum_square_error(data_centroid, data)
    next_split_order = 1
    next_node_id = 1
    queue = PriorityQueue()
    tie_breaker = 0  # secondary sort key: prevents comparing tree_node objects when SSEs tie
    queue.put((-1.0 * root_sse, tie_breaker, root, data))

    # print(f"rootsse {root.sse}")
    while current_k < max_k:
        _, _, leaf_to_split, split_data = queue.get()
        # print(f"leaf_to_split sse {leaf_to_split.sse}")
        leaf_to_split.split_order = next_split_order
        next_split_order += 1
        k = KMeans(2)
        labels = np.array(k.fit_predict(split_data), dtype=np.float32)
        labels = labels.reshape([len(labels), 1])

        left_idx = np.asanyarray([i for i in range(split_data.shape[0]) if labels[i] == 0])
        left_data = split_data[left_idx, :]
        left_child = tree_node(next_node_id, np.mean(left_data, 0))
        next_node_id += 1
        leaf_to_split.left_child = left_child
        tie_breaker += 1
        queue.put((-1.0 * sum_square_error(left_child.centroid, left_data), tie_breaker, left_child, left_data))
        # print(f"left_child sse {left_child.sse}")

        right_idx = np.asanyarray([i for i in range(split_data.shape[0]) if labels[i] == 1])
        right_data = split_data[right_idx, :]
        right_child = tree_node(next_node_id, np.mean(right_data, 0))
        next_node_id += 1
        leaf_to_split.right_child = right_child
        tie_breaker += 1
        queue.put((-1.0 * sum_square_error(right_child.centroid, right_data), tie_breaker, right_child, right_data))
        # print(f"right_child sse {right_child.sse}")

        current_k += 1  # it is only one leaf node more

    _assign_leaf_ids(root)

    return root
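
`tree_node`, `sum_square_error`, and `_assign_leaf_ids` are defined elsewhere in that project. A minimal sketch of the first two, consistent with how they are used above (the constructor takes an id and a centroid; children and `split_order` are attached afterwards):

import numpy as np

class tree_node:
    def __init__(self, node_id, centroid):
        self.node_id = node_id
        self.centroid = centroid
        self.split_order = None
        self.left_child = None
        self.right_child = None

def sum_square_error(centroid, data):
    # Sum of squared Euclidean distances from each point to the centroid.
    return float(np.sum((data - centroid) ** 2))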
Example #10
def plot_job_cluster(n_clusters, no_jobs, subset, kmeans=None):
    if kmeans is None:
        kmeans = KMeans(n_clusters=n_clusters)
    job_predict = kmeans.fit_predict(subset)
    plot_california_counties()
    for i in range(n_clusters):
        members = [j for j in range(len(subset)) if job_predict[j] == i]
        mean_jobs = np.mean([no_jobs[j] for j in members])
        plt.scatter([subset[j][0] for j in members],
                    [subset[j][1] for j in members],
                    label=f"Mean No. Jobs: {mean_jobs:.0f}",
                    s=4.5)

    # city_labels()
    plt.legend()
    plt.gca().set_xlabel("Longitude")
    plt.gca().set_ylabel("Latitude")
    plt.xlim((-120, -116))
    plt.ylim((33, 35))
    plt.axis('equal')
    plt.show()
Example #11
def test_KMeansConstrained_parity_digits():
    iris = datasets.load_iris()
    X = iris.data

    k = 8
    random_state = 1
    size_min, size_max = None, None  # no size restrictions, so this should match standard KMeans

    clf_constrained = KMeansConstrained(size_min=size_min,
                                        size_max=size_max,
                                        n_clusters=k,
                                        random_state=random_state,
                                        init='k-means++',
                                        n_init=10,
                                        max_iter=300,
                                        tol=1e-4)
    y_constrained = clf_constrained.fit_predict(X)

    # TODO: testing pins scikit-learn to v0.19 because of a discrepancy in scikit-learn v0.22: https://github.com/scikit-learn/scikit-learn/issues/16623
    clf_kmeans = KMeans(n_clusters=k,
                        random_state=random_state,
                        init='k-means++',
                        n_init=10,
                        max_iter=300,
                        tol=1e-4)
    y_kmeans = clf_kmeans.fit_predict(X)

    # Each cluster should have the same number of datapoints assigned to it
    constrained_ndp = pd.Series(y_constrained).value_counts().values
    kmeans_ndp = pd.Series(y_kmeans).value_counts().values

    assert_almost_equal(constrained_ndp, kmeans_ndp)

    # Sort the cluster coordinates (otherwise in a random order)
    constrained_cluster_centers = sort_coordinates(
        clf_constrained.cluster_centers_)
    kmean_cluster_centers = sort_coordinates(clf_kmeans.cluster_centers_)

    assert_almost_equal(constrained_cluster_centers, kmean_cluster_centers)
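
`sort_coordinates` isn't shown; since it only needs to put both sets of centers into a deterministic order before they are compared, a lexicographic row sort is one plausible sketch:

import numpy as np

def sort_coordinates(centers):
    # Sort rows lexicographically (first coordinate as primary key) so the
    # centers from two runs line up position by position.
    centers = np.asarray(centers)
    return centers[np.lexsort(centers.T[::-1])]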
Example #12
def loadlabel(filename):  # header reconstructed from the call sites below
    with open(filename) as fr:
        arraylines = fr.readlines()
    label = []
    for line in arraylines:
        label.extend(line.split(" "))

    return label


regionslabel = loadlabel("../imagelabel/0103468.regions.txt")
layerslabel = loadlabel("../imagelabel/0103468.layers.txt")
surfaceslabel = loadlabel("../imagelabel/0103468.surfaces.txt")
imgData, row, col = loadData('../slic_segment/0103468.jpg')  # load the data

km = KMeans(n_clusters=5)
# cluster the pixels to obtain each pixel's class label
originallabel = km.fit_predict(imgData)
label = originallabel.reshape([row, col])
# create a new grayscale image to hold the clustering result
pic_new = image.new("L", (row, col))
# write a gray value into the image according to each pixel's cluster
for i in range(row):
    for j in range(col):
        pic_new.putpixel((i, j), int(255 / (label[i][j] + 1)))  # 255 avoids wrapping past the 8-bit range
pbmlabel = []
pbmlabel.append(originallabel)
result, pbmValue = computePBM(imgData, pbmlabel)
print('PBM value: %s' % pbmValue)

regionslabel = normalized_mutual_info_score(regionslabel, originallabel)
layerslabel = normalized_mutual_info_score(layerslabel, originallabel)
surfaceslabel = normalized_mutual_info_score(surfaceslabel, originallabel)
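
`loadData` and `computePBM` come from elsewhere in this project; `loadData` presumably flattens the image into one normalized RGB row per pixel and returns the image dimensions, roughly:

import numpy as np
from PIL import Image as image  # matches the image.new(...) call above

def loadData(file_path):
    # One row of normalized RGB values per pixel, plus the image dimensions.
    img = image.open(file_path)
    m, n = img.size
    data = [[v / 256.0 for v in img.getpixel((i, j))[:3]]
            for i in range(m) for j in range(n)]
    return np.array(data), m, n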
Example #13
def main():

    """CONFIGURATION"""
    num_clusters = 5  # number of clusters
    random = False  # if True, randomly assign clusters to the states with equal probability; if False, actually compute the clusters
    working_dir = "/home/jmaxk/proj/geoloc/cluster/fb1/"  # working directory with one file per class; each file contains the results of the linguistic ethnography tool

    """END CONFIGURATION"""

    if random:
        saveFiles = getSaveFiles(working_dir + 'results/random')
    else:
        saveFiles = getSaveFiles(working_dir + 'results/real')

    clusterFile = saveFiles[0]
    mapFile = saveFiles[1]
    featureIndeces = dict()
    classIndeces = []
    counter = 0
    vecs = []


    # Turn each file into a vector to be clustered.
    for root, dirs, files in os.walk(working_dir):
        for f in files:
            fullpath = os.path.join(root, f)
            if os.path.splitext(fullpath)[1] == '.txt':
                with open(fullpath) as fp:
                    lines = fp.readlines()
                    vec = [0.0]*(len(lines) + 1)
                    for line in lines:
                        featVals = line.split(' ')
                        key = featVals[0]
                        val = featVals[1]
                        if key not in featureIndeces:
                            featureIndeces[key] = counter
                            counter = counter + 1
                        index = featureIndeces.get(key)
                        vec[index] = float(val)
                    vecs.append(vec)
                    abbr = os.path.basename(fullpath).split(".")[0]

                    #we only want to save actual states
                    if us.states.lookup(abbr) is not None:
                        st = (str(us.states.lookup(abbr).name))
                        classIndeces.append(st)

        #transform data into numpy array
        mylist = []
        for item in vecs:
            mylist.append(numpy.array(item))
        data = numpy.array(mylist)

        #cluster with kmeans, and save the clusters
        km = KMeans(n_clusters=num_clusters, init='k-means++', max_iter=100, n_init=10,
                verbose=False)
        raw_results = km.fit_predict(data)
        results = dict(zip(classIndeces, raw_results))
        saveClusters(data, km, clusterFile)  # this doesn't work with random

    # save the map
    if random:
        random_results = dict()
        for key in results:
            random_results[key] = randint(0, num_clusters - 1)  # randint is inclusive on both ends
        colors = genColors(random_results)
        saveMap(random_results,colors, mapFile)
    else:
        colors = genColors(results)
        saveMap(results,colors, mapFile)
Example #14
### you'll want to change this line to
### for f1, f2, _ in finance_features:
### (as it's currently written, the line below assumes 2 features)
# for f1, f2 in finance_features:
#   plt.scatter( f1, f2 )
# plt.show()

for f1, f2, f3 in finance_features:  # Clustering with 3 Features
    plt.scatter(f1, f2)
plt.show()

### cluster here; create predictions of the cluster labels
### for the data and store them to a list called pred

myClassifier = KMeans(n_clusters=2)
pred = myClassifier.fit_predict(finance_features)
print("type(pred) - {}\n".format(type(pred)))
print("pred - {}\n".format(pred))

### rename the "name" parameter when you change the number of features
### so that the figure gets saved to a different file
try:
    Draw(pred,
         finance_features,
         poi,
         mark_poi=False,
         name="clusters.pdf",
         f1_name=feature_1,
         f2_name=feature_2)
except NameError:
    print("no predictions object named pred found, no clusters to plot")
Example #15
def benchmark_kmeans_constrained(n_X, d, n_cluster, seed):  # hypothetical name; signature reconstructed from the variables this snippet uses
    np.random.seed(seed=seed)
    X = np.random.rand(n_X, d)
    clf = KMeansConstrained(n_cluster,
                            size_min=None,
                            size_max=None,
                            init='k-means++',
                            n_init=10,
                            max_iter=300,
                            tol=1e-4,
                            verbose=False,
                            random_state=seed,
                            copy_x=True,
                            n_jobs=1)
    y = clf.fit_predict(X)
    # time = timeit('y = clf.fit_predict(X)', number=1, globals=globals())


from sklearn import datasets
from sklearn.cluster import KMeans
import pandas as pd

iris = datasets.load_iris()
X = iris.data
k = 8
random_state = 1

clf_kmeans = KMeans(n_clusters=k, random_state=random_state, algorithm='full')
y = clf_kmeans.fit_predict(X)

# Count number of data points for each cluster and sort
ndp = pd.Series(y).value_counts().values