Example #1
# -*- coding: utf-8 -*-
"""
Created on Sun Feb 14 23:29:50 2021

@author: 김도형
"""

import pandas as pd
import matplotlib.pyplot as plt

uci_path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/\
00292/Wholesale%20customers%20data.csv'
df = pd.read_csv(uci_path, header=0)

X = df.iloc[:,:]

from sklearn import preprocessing
X = preprocessing.StandardScaler().fit(X).transform(X)

from sklearn import cluster
kmeans = cluster.KMeans(init='k-means++', n_clusters=5, n_init=10)
kmeans.fit(X)

cluster_label=kmeans.labels_

df['Cluster'] = cluster_label

print(df)
Example #2
# 1. Randomly assign k initial center values (centroids).
# 2. For each data point, compute the distance to each centroid and assign the point to the cluster of the nearest centroid.
# 3. Move each centroid to the mean of the data points belonging to its cluster.
# 4. Repeat until the cluster assignments no longer change.
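
# A minimal NumPy sketch of the four steps above (Lloyd's algorithm), added for
# illustration; it is not part of the original snippet and omits the k-means++
# seeding that sklearn's KMeans uses by default.
import numpy as np

def simple_kmeans(X, k, n_iter=100, seed=0):
    rng = np.random.default_rng(seed)
    # 1. pick k initial centers at random from the data points
    centers = X[rng.choice(len(X), size=k, replace=False)]
    for _ in range(n_iter):
        # 2. assign each point to its nearest center
        dists = np.linalg.norm(X[:, None, :] - centers[None, :, :], axis=2)
        labels = dists.argmin(axis=1)
        # 3. move each center to the mean of the points assigned to it
        #    (assumes no cluster ends up empty)
        new_centers = np.array([X[labels == j].mean(axis=0) for j in range(k)])
        # 4. stop when the centers, and hence the assignments, no longer change
        if np.allclose(new_centers, centers):
            break
        centers = new_centers
    return labels, centers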

from sklearn import cluster
from sklearn import datasets
import numpy as np

from mpl_toolkits.mplot3d import Axes3D
from sklearn import metrics

iris = datasets.load_iris()
X = iris.data[:, 0:2]

kmeans = cluster.KMeans(n_clusters=3, random_state=0).fit(X)
print("Clusters: ", kmeans.labels_)

# note: cluster ids are arbitrary, so MSE against the targets is only a rough sanity check
print(metrics.mean_squared_error(kmeans.labels_, iris.target))

X = iris.data
Y = iris.target

print(X)
print()
print(Y)

estimator = [('k=8', cluster.KMeans(n_clusters=8)),
            ('k=3', cluster.KMeans(n_clusters=3)),
            ('k=3(r)', cluster.KMeans(n_clusters=3, n_init=1, init='random'))]
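
# The excerpt ends with this estimator list; a minimal sketch (an assumption,
# not part of the original code) of how such a list is typically used: fit each
# configuration on the iris features and compare the resulting inertia.
for name, est in estimator:
    est.fit(X)
    print(name, 'inertia:', est.inertia_)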
Example #3
# Imports and argument parsing assumed from usage below (the excerpt starts
# after they were defined): gdal/gdal_array, numpy, matplotlib and sklearn.cluster.
import argparse
import numpy as np
import matplotlib.pyplot as plt
from osgeo import gdal, gdal_array
from sklearn import cluster

ap = argparse.ArgumentParser()
ap.add_argument("-i", "--image", required=True, help="path to the input raster image")
args = vars(ap.parse_args())

# Tell GDAL to throw Python exceptions, and register all drivers
gdal.UseExceptions()
gdal.AllRegister()

# Read in raster image
img_ds = gdal.Open(args["image"], gdal.GA_ReadOnly)

band = img_ds.GetRasterBand(4)

img = band.ReadAsArray()

X = img.reshape((-1, 1))

k_means = cluster.KMeans(n_clusters=8)
k_means.fit(X)

X_cluster = k_means.labels_
X_cluster = X_cluster.reshape(img.shape)

plt.figure(figsize=(20, 20))
plt.imshow(X_cluster, cmap="hsv")
plt.show()

# Read in raster image
img_ds = gdal.Open(args["image"], gdal.GA_ReadOnly)

img = np.zeros(
    (img_ds.RasterYSize, img_ds.RasterXSize, img_ds.RasterCount),
    gdal_array.GDALTypeCodeToNumericTypeCode(img_ds.GetRasterBand(1).DataType))
Example #4
df['track'].apply(add_more_swim_data)

##

track = df.loc['7A96', 'track'].copy()

segs = track[:-1]

# variables to cluster on:
from sklearn import cluster

if 0:
    # standardize
    state = segs.loc[:, ['swim_hdg_rel', 'swim_speed', 'tnum_m']].values
    state = (state - state.mean(axis=0)) / state.std(axis=0)
    kmeans = cluster.KMeans(n_clusters=5).fit(state)
    labels = kmeans.labels_
if 1:
    # standardize
    state = segs.loc[:, ['swim_hdg_rel', 'swim_speed', 'tnum_m']].values
    state = (state - state.mean(axis=0)) / state.std(axis=0)
    spectral = cluster.SpectralClustering(n_clusters=6).fit(state)
    labels = spectral.labels_

num = 20
plt.figure(num).clf()
fig, (ax_geo, ax_swim) = plt.subplots(2, 1, num=num)

ax_geo.scatter(segs['x_m'], segs['y_m'], 20, labels, cmap='jet')
ax_swim.scatter(segs['swim_x'], segs['swim_y'], 20, labels, cmap='jet')
Example #5
        for n in range(nrows - 1):
            y[n][l] = float(y[n][l].strip()) / float(max)
    # print(y)
    return y, first_col


if __name__ == '__main__':
    # read the data
    print("Reading data...")
    X, first_col = excel()
    # clustering
    print("Starting clustering...")
    n_clusters = 10

    km = cluster.KMeans(n_clusters=n_clusters,
                        init='k-means++',
                        max_iter=1,
                        n_init=1)
    km.fit(X)

    # Clustering done; print the cluster label for each row of data
    # for i, j in enumerate(km.labels_):
    #     if(i%100 == 0):
    #         print(i, j)

    # n_clusters clusters in total
    data = []
    for i in range(n_clusters):
        text = ""
        for j, k in enumerate(km.labels_):
            if (i == k):
                text = text + str(first_col[j + 1])
Example #6
csv = np.genfromtxt('output.csv', delimiter=",")[1:]

a = np.apply_along_axis(check_condition, 1, csv)
a = np.where(a == True)[0]
nonzero_rows = csv[a, :]
avg_synapse = np.mean(nonzero_rows[:, -1])
xyz_only = nonzero_rows[:, [0, 1, 2]]

if filter_less_than_avg:
    filter_avg_synapse = np.apply_along_axis(synapse_filt, 1, nonzero_rows,
                                             avg_synapse)
    a = np.where(filter_avg_synapse == True)[0]
    nonzero_filtered = nonzero_rows[a, :]
    xyz_only = nonzero_filtered[:, [0, 1, 2]]

kmeans_algo = cluster.KMeans(n_clusters=n_clusters)
clusters = kmeans_algo.fit_predict(xyz_only)
centers = kmeans_algo.cluster_centers_
print(centers)

# randomly sample
perm = np.random.permutation(range(1, len(xyz_only)))
xyz_only = xyz_only[perm[:samples]]
clusters = clusters[perm[:samples]]

# get range for graphing
x_min = np.amin(xyz_only[:, 0])
x_max = np.amax(xyz_only[:, 0])
y_max = np.amax(xyz_only[:, 1])
y_min = np.amin(xyz_only[:, 1])
z_min = np.amin(xyz_only[:, 2])
Example #7
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt

from sklearn import cluster
from scipy.misc import face
 

face = face(gray=True)


n_clusters = 5
np.random.seed(0)

X = face.reshape((-1, 1))  # We need an (n_sample, n_feature) array
k_means = cluster.KMeans(n_clusters=n_clusters, n_init=4)
k_means.fit(X)
values = k_means.cluster_centers_.squeeze()
labels = k_means.labels_

# create an array from labels and values
face_compressed = np.choose(labels, values)
face_compressed.shape = face.shape

vmin = face.min()
vmax = face.max()

# original face
plt.figure(1, figsize=(3, 2.2))
plt.imshow(face, cmap='gray', vmin=vmin, vmax=256)
Example #8
    for node in G:
        for neighbor in G.neighbors(node):
            edge_mat[images_list.index(node)][images_list.index(neighbor)] = 1
        edge_mat[images_list.index(node)][images_list.index(node)] = 1

    return edge_mat


img_img_graph = None
with open('pickles/cache/graph-k-10-20181123-165012.pkl', 'rb') as f:
    img_img_graph = pickle.load(f)
with open('pickles/pre-processed/images_list.pkl', 'rb') as f:
    images_list = pickle.load(f)
edge_matrix = graph_to_edge_matrix(img_img_graph, images_list)
k_clusters = 10
results = []
algorithms = {}
algorithms['kmeans'] = cluster.KMeans(n_clusters=k_clusters, n_init=1)
for model in algorithms.values():
    model.fit(edge_matrix)
    results.extend(model.labels_)
clusters = {}
for cluster_id in set(results):
    clusters[cluster_id] = [
        i for i, x in enumerate(results) if x == cluster_id
    ]
# [main_list[x] for x in indexes]
for cluster_id in clusters.keys():
    visualize_images("Cluster id " + str(cluster_id),
                     [images_list[x] for x in clusters[cluster_id]])
Example #9
"""
demo08_kmeans.py  k-means clustering
"""
import numpy as np
import sklearn.cluster as sc
import matplotlib.pyplot as mp

x = np.loadtxt('../ml_data/multiple3.txt', delimiter=',')
# build the clustering model
model = sc.KMeans(n_clusters=4)
model.fit(x)
# cluster label assigned to each sample: 0/1/2/3
pred_y = model.labels_
# all cluster center samples
centers = model.cluster_centers_
print(centers)
# draw the classification boundaries
n = 500
l, r = x[:, 0].min() - 1, x[:, 0].max() + 1
b, t = x[:, 1].min() - 1, x[:, 1].max() + 1
grid_x = np.meshgrid(np.linspace(l, r, n), np.linspace(b, t, n))
flat_x = np.column_stack((grid_x[0].ravel(), grid_x[1].ravel()))
flat_y = model.predict(flat_x)
grid_y = flat_y.reshape(grid_x[0].shape)

mp.figure('K-Means Cluster', facecolor='lightgray')
mp.title('K-Means Cluster', fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
mp.pcolormesh(grid_x[0], grid_x[1], grid_y, cmap='gray')
Example #10
def perform_clustering():
    ################################################################################################

    ##	Connect to DB and select data

    ################################################################################################

    # Connection string to connect to SQL Server named instance.
    conn_str = 'Driver=SQL Server;Server=VC5-SOPHIA;Database=tpcxbb_1gb;Trusted_Connection=True;'

    input_query = '''SELECT
    ss_customer_sk AS customer,
    ROUND(COALESCE(returns_count / NULLIF(1.0*orders_count, 0), 0), 7) AS orderRatio,
    ROUND(COALESCE(returns_items / NULLIF(1.0*orders_items, 0), 0), 7) AS itemsRatio,
    ROUND(COALESCE(returns_money / NULLIF(1.0*orders_money, 0), 0), 7) AS monetaryRatio,
    COALESCE(returns_count, 0) AS frequency
    FROM
    (
      SELECT
        ss_customer_sk,
        -- return order ratio
        COUNT(distinct(ss_ticket_number)) AS orders_count,
        -- return ss_item_sk ratio
        COUNT(ss_item_sk) AS orders_items,
        -- return monetary amount ratio
        SUM( ss_net_paid ) AS orders_money
      FROM store_sales s
      GROUP BY ss_customer_sk
    ) orders
    LEFT OUTER JOIN
    (
      SELECT
        sr_customer_sk,
        -- return order ratio
        count(distinct(sr_ticket_number)) as returns_count,
        -- return ss_item_sk ratio
        COUNT(sr_item_sk) as returns_items,
        -- return monetary amount ratio
        SUM( sr_return_amt ) AS returns_money
    FROM store_returns
    GROUP BY sr_customer_sk ) returned ON ss_customer_sk=sr_customer_sk'''

    # Define the columns we wish to import.
    column_info = {
        "customer": {
            "type": "integer"
        },
        "orderRatio": {
            "type": "float"
        },
        "itemsRatio": {
            "type": "float"
        },
        "frequency": {
            "type": "integer"
        }
    }

    data_source = revoscale.RxSqlServerData(sql_query=input_query,
                                            column_info=column_info,
                                            connection_string=conn_str)
    revoscale.RxInSqlServer(connection_string=conn_str,
                            num_tasks=1,
                            auto_cleanup=False)
    # import data source and convert to pandas dataframe.
    customer_data = pd.DataFrame(revoscale.rx_import(data_source))
    #print("Data frame:", customer_data.head(n=5))

    cdata = customer_data
    n_clusters = 4

    means_cluster = sk_cluster.KMeans(n_clusters=n_clusters, random_state=111)
    columns = ["orderRatio", "itemsRatio", "monetaryRatio", "frequency"]
    est = means_cluster.fit(customer_data[columns])
    clusters = est.labels_
    customer_data['cluster'] = clusters

    # Print some data about the clusters:

    # For each cluster, count the members.
    for c in range(n_clusters):
        cluster_members = customer_data[customer_data['cluster'] == c][:]
        print('Cluster{}(n={}):'.format(c, len(cluster_members)))
        print('-' * 17)

    # Print mean values per cluster.
    print(customer_data.groupby(['cluster']).mean())
Example #11
plot_dendrogram(dist)

##############################################################################
##############################################################################

num_clusters = int(
    input('How many clusters would you like?    '))  # check dendrogram

##############################################################################
##############################################################################

#### K-means clustering ####
#num_clusters = len(sample['author'].unique()) # set number of clusters to number of unique authors

km = cluster.KMeans(n_clusters=num_clusters)
km.fit(tfidf_matrix)
clusters = km.labels_.tolist()  # list of clusters

# create doc attribute df including cluster assignment
revisions = {
    'time': sample.index,
    'author': sample['author'].tolist(),
    'text': texts,
    'cluster': clusters
}
frame = pd.DataFrame(revisions,
                     index=[clusters],
                     columns=['time', 'author', 'cluster', 'text'])

print('Cluster value counts:')
Example #12
plt.scatter(x[:,0], x[:,1], c=y_pred, cmap='viridis', marker='^', s=200, edgecolor='k')
plt.axis([0, 12, 0, 15])
plt.show()


# #### Applying the K-means model to real data

# In[7]:


#p282
from sklearn import cluster, datasets
iris = datasets.load_iris()
X_iris = iris.data
y_iris = iris.target
k_means = cluster.KMeans(n_clusters=3)
k_means.fit(X_iris)
print(k_means.labels_[::10])
print(y_iris[::10])


# In[8]:


from sklearn import cluster, datasets
import matplotlib.pyplot as plt
iris = datasets.load_iris()
X = iris.data[:, 2:]
y = iris.target
X1 = iris.data[[1, 50, 100], 2:]
y1 = iris.target[[1, 50, 100]]
Example #13
    vectorizer.fit(text_corpora)
    vector = vectorizer.transform(text_corpora)
    return vector
stopword_list = list(set(nl.stopwords.words('english')))
r_df = pd.read_csv("python4.csv",encoding = "ISO-8859-1")
print(r_df)


text_corpora = [s.translate(str.maketrans("","","0123456789")) for s in r_df.loc[:,"scraptweets"]]
words_data = [nt.word_tokenize(s.lower()) for s in text_corpora]
words_data = [[ ps.stem(word) for word in sent if word not in stopword_list ] for sent in words_data  ]
sent_data  = [" ".join(sent) for sent in words_data]

vector = generate_tfidf(sent_data)

kmeans_obj = km.KMeans(n_clusters = 5, max_iter=100)
clusters = kmeans_obj.fit(vector)

r_df["label"]=clusters.labels_  
print("cluster 1")

r_df.loc[r_df["label"]==0]
print(r_df.loc[r_df["label"]==1])
r_df.to_csv("Clustered_tweet2.txt",index=False)

file = open('Clustered_tweet1.txt', encoding="utf8",)
a= file.read()
stopword_list = list(set(nl.stopwords.words('english')))

wordCount = {}
for word in a.lower().split():
Example #14
from sklearn import datasets
from sklearn import cluster
import matplotlib.pyplot as plt

data, label = datasets.make_blobs(n_samples=500, n_features=2, centers=5)

e = cluster.KMeans(n_clusters=5)
e.fit(data)

print(e.labels_)
print(e.cluster_centers_)

plt.scatter(data[:, 0], data[:, 1], marker="o", c=e.labels_, edgecolors="k")
plt.scatter(e.cluster_centers_[:,0], e.cluster_centers_[:,1], marker="x")

plt.show()
Example #15
                                 columns=df_cluster.columns)

# type(df_std)

# Creating clusters
from sklearn import cluster
from sklearn.metrics import silhouette_score

# Using the silhouette score to understand the right number of clusters.
# The higher the silhouette score the better, so usually the k that maximizes it should be selected.
# Read the link below for interpretation:
# http://stackoverflow.com/questions/23687247/efficient-k-means-evaluation-with-silhouette-score-in-sklearn
# Use this only as a guideline, as this method has its limitations; business judgement takes precedence.

for k in range(2, 11):
    kmeans = cluster.KMeans(n_clusters=k)
    kmeans.fit(df_cluster_scaled)
    label = kmeans.labels_
    sil_coeff = silhouette_score(df_cluster_scaled, label, metric='euclidean')
    print("k={}, The Silhouette Coefficient is {}".format(k, sil_coeff))

# Number of clusters
k = 6
kmeans = cluster.KMeans(n_clusters=k)
kmeans.fit(df_cluster_scaled)  # fitting cluster

# Scoring and analyzing cluster

# Caution: ideally a training sample should be separated out for all of this analysis.
# Not doing so here because there are only 800 customers; analyzing on the test sample.
# Assign a cluster to each row in the ORIGINAL data.
Example #16
    X_train, X_validation, Y_train, Y_validation = train_test_split(
        X, Y, test_size=0.3, random_state=seed[i])

    knn = KNeighborsClassifier(n_neighbors=3)
    knn.fit(X_train, Y_train)
    predictions = knn.predict(X_validation)
    acc1 = accuracy_score(Y_validation, predictions)
    total.append(acc1)

print("varinace: ", np.var(total))
print("accuracy: ", np.mean(total))

#-------------------------------------------------K-means-----------------------------------------------

dataset = data.iloc[:, 0:4].values
kmeans = cluster.KMeans(n_clusters=3).fit_predict(dataset)

plt.scatter(dataset[kmeans == 0, 0],
            dataset[kmeans == 0, 1],
            s=100,
            c='red',
            label='Iris-setosa')
plt.scatter(dataset[kmeans == 1, 0],
            dataset[kmeans == 1, 1],
            s=100,
            c='blue',
            label='Iris-versicolour')
plt.scatter(dataset[kmeans == 2, 0],
            dataset[kmeans == 2, 1],
            s=100,
            c='green',
Example #17
# Imports assumed from usage below (only tqdm was imported in the excerpt).
import numpy as np
from sklearn import cluster as skc
from sklearn import datasets as data
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm

dataset = data.load_iris()

# %%
# Define the parameters for training and testing
test_size = 0.5
ncenters = 3
n_testes = 1000
f1_list = []

for _ in tqdm(range(n_testes)):
    # Split the dataset into training and test parts
    data_train, data_test, _, label_test = train_test_split(
        dataset.data, dataset.target, test_size=test_size)

    # Fit the K-means clustering algorithm
    kmean_obj = skc.KMeans(n_clusters=ncenters, max_iter=10000)
    kmean_obj.fit(data_train)

    # Use the fitted centers to predict on the test data
    predicted = kmean_obj.predict(data_test)

    f1_list.append(f1_score(label_test, predicted, average='weighted'))

f1_mean = np.mean(f1_list)
f1_error = np.std(f1_list) / np.sqrt(len(f1_list))
print('\nOver {:d} runs:'
      '\nF1 Score = {:.04f} +/- {:.04f}' \
      .format(n_testes, f1_mean, f1_error))
Example #18
def kmeans_clust(som, n_clusters=8):
    print("Performing K-means clustering to SOM trained data...")
    cl_labels = clust.KMeans(n_clusters=n_clusters, random_state=tfpinit.km_seed).fit_predict(som.codebook.matrix)

    return cl_labels
Example #19
def PCA_graph(INPUT_FILE, DATASET_LABEL):
    """uses the PCA plink does and generates plot and uses k-mean cluster to determine outliers to remove"""
    def SuperPop(x):
        if x in ["GBR", "CEU", "TSI", "FIN", "IBS"]:
            return "EUR"
        elif x in ["CHB", "JPT", "CHS", "CDX", "KHV"]:
            return "EAS"
        elif x in ["YRI", "LWK", "GWD", "MSL", "ESN", "ASW", "ACB"]:
            return "AFR"
        elif x in ["MXL", "PUR", "CLM", "PEL"]:
            return "AMR"
        elif x in ["GIH", "PJL", "BEB", "STU", "ITU"]:
            return "SAS"
        else:
            return "Samples"

    ## Starting to handle big data so bringing in Pandas
    raw = pd.read_csv(INPUT_FILE, sep=" ", header=None)
    ## put 1000g data into superpopulation groups and define dataset
    clean = (raw[list(raw.columns[:4])])
    clean.columns = ['FAM_ID', 'ID', 'C1', 'C2']
    clean.set_index(['FAM_ID'], inplace=True)
    ## setting up super population codes to map colours for graph
    clean["POP"] = clean.ID.apply(SuperPop)
    groups = clean.groupby('POP')
    ## Plotting
    fig, ax = plt.subplots()
    ax.margins(0.1)
    for name, group in groups:
        ax.plot(group.C1, group.C2, marker='o', linestyle='', ms=4, label=name)
    ax.legend(numpoints=1, loc='best')
    plt.xlabel('Component 1')
    plt.ylabel('Component 2')
    plt.suptitle("PCA on " + DATASET_LABEL, weight='bold')
    fig.savefig(DATASET_LABEL + ".PCA_results.pdf")
    plt.close()
    ## k-means clustering to find outliers
    find_out = clean[['C1', 'C2']].copy()
    k_means = cluster.KMeans(n_clusters=5, )
    k_means.fit(find_out)
    centroids = k_means.cluster_centers_
    labels = k_means.labels_
    results = pd.DataFrame([clean.index, labels]).T
    results.columns = ["FAM_ID", "k_group"]
    results["ID"] = clean[["ID"]].copy()
    results.set_index(['FAM_ID'], inplace=True)
    output_label = (DATASET_LABEL + ".PCA_kmeans.txt")
    ## Display samples that are not Europeans in dataset
    merge_df = pd.merge(clean, results, right_index=True, left_index=True)
    merge_df['k_group'] = merge_df['k_group'].astype(int)
    test = merge_df.loc[merge_df['POP'] == "EUR", ['k_group']].apply(np.median)
    Euro_group = int(test)
    #print ("European cluster is :" + str(Euro_group))
    your_samples = merge_df.loc[merge_df['POP'] == "Samples", ['k_group']]
    your_samples['check'] = np.where(your_samples['k_group'] == Euro_group,
                                     'good', 'bad')
    bad_ids = your_samples[your_samples['check'] == 'bad']
    after = (clean[~clean.index.isin(bad_ids.index)])
    count = len(bad_ids.index.get_level_values(0))
    #print (str(count) + " Samples fall outside the European cluster ")
    after_groups = after.groupby('POP')
    ### Plotting with outliers removed
    fig, ax = plt.subplots()
    ax.margins(0.1)
    for name, group in after_groups:
        ax.plot(group.C1,
                group.C2,
                marker='o',
                linestyle='',
                ms=4,
                label=name)
    ax.legend(numpoints=1, loc='best')
    plt.xlabel('Component 1')
    plt.ylabel('Component 2')
    plt.suptitle("Outliers removed PCA on " + DATASET_LABEL + " - " +
                 str(count) + " Samples were removed",
                 weight='bold')
    #print ("Graph saved as " + DATASET_LABEL + ".PCA_results.pdf")
    #print ("Outliers removed Graph saved as " + DATASET_LABEL + ".outliers_removed_PCA_results.pdf")
    fig.savefig(DATASET_LABEL + ".outliers_removed_PCA_results.pdf")
    output_id = (DATASET_LABEL + ".outliers.txt")
    #print ("bad IDs exported to text file : " + output_id)
    bad_ids.to_csv(output_id, sep="\t", header=None)
    plt.close()
Example #20
def clusteringmap_category(ax,sm,n_clusters,dataset,colorcategory,labels, savepath):
    """
    Description:
    This function is used to output maps that prints colors on dots based
    on their properties
    """
    categories = dataset[colorcategory] #if colorcategory is one col of the dataset
    cmap = plt.get_cmap("tab20") #cmap for background
    n_palette = 20  # number of different colors in this color palette
    color_list = [cmap((i % n_palette)/n_palette) for i in range(n_clusters)]
    msz = sm.codebook.mapsize
    proj = sm.project_data(sm.data_raw)
    coord = sm.bmu_ind_to_xy(proj)

    fig, ax = plt.subplots(1, 1, figsize=(30,30))

    cl_labels = clust.KMeans(n_clusters=n_clusters,random_state=555).fit_predict(sm.codebook.matrix)

    # fill each rectangular unit area with cluster color
    #  and draw line segment to the border of cluster
    norm = mpl.colors.Normalize(vmin=0, vmax=n_palette, clip=True)
    #ax.pcolormesh(cl_labels.reshape(msz[0], msz[1]).T % n_palette,
                #cmap=cmap, norm=norm, edgecolors='face', #xinyuewang, make the background to white
                #lw=0.5, alpha=0.5) # config for each grid

    ax.scatter(coord[:, 0]+0.5, coord[:, 1]+0.5, c='k', marker='o')
    ax.axis('off')

    categoryname = list(dataset.groupby(colorcategory).count().index)
    categories_int = categories.apply(categoryname.index)

    N = len(categoryname)
    cmap_labels = plt.cm.gist_ncar
    # extract all colors from the .jet map
    cmaplist = [cmap_labels(i) for i in range(cmap_labels.N)]
    # create the new map
    cmap_labels = cmap_labels.from_list('Custom cmap', cmaplist, cmap_labels.N)
    # define the bins and normalize
    bounds = np.linspace(0,N,N+1)
    norm_labels = mpl.colors.BoundaryNorm(bounds, cmap_labels.N)

    scat = ax.scatter(coord[:, 0]+0.5, coord[:, 1]+0.5, c=categories_int,s=30,cmap=cmap_labels,norm=norm_labels)# s is the size of projection dot
    cbar = plt.colorbar(scat, spacing='proportional',ticks=bounds)
    cbar.ax.get_yaxis().set_ticks([])

    for j, lab in enumerate(categoryname):
        cbar.ax.text(1, (2 * j + 1) / (2*(len(categoryname))), lab, ha='left', va='center', fontsize=30)
    cbar.ax.get_yaxis().labelpad = 15
    # cbar.ax.set_ylabel('# of contacts', rotation=270)
    ax.axis('off')


    for label, x, y in zip(labels, coord[:, 0], coord[:, 1]):
        x += 0.2
        y += 0.2
        # "+ 0.1" means shift of label location to upperright direction

        # randomize the location of the label
        #   not to be overwrapped with each other
        x += 0.1 * np.random.randn()
        y += 0.3 * np.random.randn()

        # wrap of label for chemical compound
        #label = str_wrap(label)

        ax.text(x+0.4, y+0.4, label, horizontalalignment='left', verticalalignment='bottom',rotation=30, fontsize=12, weight='semibold')
    # cl_labels = som.cluster(n_clusters)
    cl_labels = clust.KMeans(n_clusters = n_clusters, random_state = 555).fit_predict(sm.codebook.matrix)

    for i in range(len(cl_labels)):
        rect_x = [i // msz[1], i // msz[1],
                i // msz[1] + 1, i // msz[1] + 1]
        rect_y = [i % msz[1], i % msz[1] + 1,
                i % msz[1] + 1, i % msz[1]]

        if i % msz[1] + 1 < msz[1]:  # top border
            if cl_labels[i] != cl_labels[i+1]:
                ax.plot([rect_x[1], rect_x[2]],
                        [rect_y[1], rect_y[2]], 'k-', lw=10)  # boundary linewidth (originally 2.5)

        if i + msz[1] < len(cl_labels):  # right border
            if cl_labels[i] != cl_labels[i+msz[1]]:
                ax.plot([rect_x[2], rect_x[3]],
                        [rect_y[2], rect_y[3]], 'k-', lw=10)#2.5

    plt.savefig(savepath)
    return cl_labels
Example #21
def eval_batch(x_train, y_train, x_test, y_test, classifier, components,
               no_clusters, dimensionality):

    cluster_finder = cluster.KMeans(n_clusters=no_clusters)

    if classifier == 'mbk':
        cluster_finder = MiniBatchKMeans(init='k-means++',
                                         n_clusters=no_clusters,
                                         batch_size=32,
                                         max_no_improvement=10,
                                         verbose=0)
        cluster_finder.fit(x_train)
        cddd = str(cluster_finder.score)
        clll = str(cluster_finder)
        log_str = str(
            components) + 'score' + cddd + 'algo=' + clll + 'comp=' + str(
                components) + dimensionality
        labels = cluster_finder.labels_

    else:
        cluster_finder = cluster.KMeans(n_clusters=no_clusters)
        cluster_finder.fit(x_train)
        cddd = str(cluster_finder.score)
        clll = str(cluster_finder)
        log_str = str(
            components) + 'score' + cddd + 'algo=' + clll + 'comp=' + str(
                components) + dimensionality
        labels = cluster_finder.labels_

    clustered_x = []
    clustered_y = []

    for c in range(0, no_clusters):
        clustered_x.append([])
        clustered_y.append([])

    for i, item in enumerate(x_train):
        item = item.reshape(1, -1)
        predicted = cluster_finder.predict(item)
        clustered_x[predicted[0]].append(item)
        clustered_y[predicted[0]].append(y_train[i])

    # here testing
    for q, qtem in enumerate(x_test):
        qtem = qtem.reshape(1, -1)
        predicted = cluster_finder.predict(qtem)[0]
        print('predicted=', predicted)
        #		closest = find_closest(qtem, clustered_x[predicted], clustered_y[predicted], 'mutual_info_score')
        #		closest = find_closest(qtem, clustered_x[predicted], clustered_y[predicted], 'euclidean')
        closest = find_closest(qtem, clustered_x[predicted],
                               clustered_y[predicted], 'cosine')
        print('closest=', closest)
        generated = decode_sequence(closest[1])
        print('generated=', generated)
        actual = decode_sequence(y_test[q])
        print('actual=', actual)
        import nltk

        BLEUscore = nltk.translate.bleu_score.sentence_bleu([generated],
                                                            actual)

        log = log_str + 'bleu-4=' + str(BLEUscore)
        rouge = Rouge()
        scores = rouge.get_scores(generated, actual)
        log = log_str + str(scores)
        log_file.writelines(log + "\n")
        log_file.writelines("---------------------" + '\n')
Example #22
def extract_fribbles(stimulus_directory):
    """returns fribbles""" 
    
    # load answer key 
    answer_key = extract_answer_key(stimulus_directory)
    # return list of trial names 
    extract_list = list( answer_key['oddity'])
    # set diameter of images (determined manually)  
    d = int(200/2)
    # identify experiment folder 
    task_folders = [i for i in os.listdir(stimulus_directory) if 'pdf' not in i]
    # import rotation keys 
    rotation = import_rotation_keys(stimulus_directory)
    # stimulus information for loading
    i_folder = [i for i in task_folders if 'oddity' in i][0]
    # identify all files in directory
    files = os.listdir(os.path.join(stimulus_directory, i_folder))
    # initialize kmeans segmentation protocol -- 7 images per screen 
    k_means = cluster.KMeans(n_clusters=7, n_init=4)

    oddity_images = {}
    for i_file in files:
        
        # create human readable filename 
        i_number = i_file[i_file.find('_')+1:len(i_file)-4]
        
        if i_number in extract_list:
            # load image
            i_image = np.array(Image.open(os.path.join(stimulus_directory, i_folder, i_file)))

            ## kmeans segmentation protocol ##
            # binarize layer of image
            binary = i_image[:,:,1] != 255
            # collect the coordinates of every non-blank pixel in the binarized image
            points = [[i,j] for i in  range(binary.shape[0]) for j in range(binary.shape[1]) if binary[i,j]]
            # cluster non-blank locations to determine object center of masses
            k_means.fit(points)
            # determine mapping between cluster order and image order
            order = np.array([0 for i in range(7)])
            # determine whether image is in the top row 
            top_row = k_means.cluster_centers_[:,0] < 400
            # extract all top row images             
            order[top_row] = k_means.cluster_centers_[:,1][top_row].argsort().argsort()
            # extract all bottom row images
            order[top_row==0] = k_means.cluster_centers_[:,1][top_row==0].argsort().argsort() + 4
            # sort 
            order = order.argsort()
            ## segmentation complete        ##

            # iterate over all objects in image
            i_slide = []
            for i_segmented_object in range(len(order)):
                
                # identify center of mass for each object 
                x, y = k_means.cluster_centers_[order[i_segmented_object],:]
                # select an area around the center of mass defined above
                i_object =  i_image[int(x-d):int(x+d), int(y-d):int(y+d)]
                # add segmented image 
                i_slide.append(i_object)

            # resize and append to ambiguity type
            oddity_images[i_number] = [ imresize(i_slide[i], (224, 224)) for i in range(len(i_slide)) ]

    return oddity_images
Example #23
def _cluster(self, data, targets):
    clusterer = cluster.KMeans(
        n_clusters=len(set(targets.tolist()))).fit(data)
    return metrics.mutual_info_score(targets, clusterer.labels_)
Example #24
def extract_and_rotate_novel_images(stimulus_directory):
    """return novel images in their canonical orientation"""
    
    # load answer key 
    answer_key = extract_answer_key(stimulus_directory)
    # identify task folders 
    task_folders = [i for i in os.listdir(stimulus_directory) if 'pdf' not in i]
    # import rotation keys 
    rotation = import_rotation_keys(stimulus_directory)
    # stimulus information for loading
    i_folder = [i for i in task_folders if 'novel' in i][0]
    # all files 
    files = os.listdir(os.path.join(stimulus_directory, i_folder))
    # initialize image segmentation protocol--four objects per stimulus screen
    k_means = cluster.KMeans(n_clusters=4, n_init=4)
    # initialize data storage 
    novel_images = {'high':{}, 'low':{}}
    # set diameter of images (determined manually) 
    d = int(350/2)

    for i_file in files:
        
        # determine ambiguity level of this trial
        if 'LOW' in i_file: amb = 'low'
        if 'HIG' in i_file: amb = 'high'
        # define human readable filename
        i_number = i_file[i_file.find('_')+1:len(i_file)-4]
        
        # only select those images that we have rotation keys for 
        if (i_file in rotation['novel'][amb].keys()) * (i_number in answer_key['novel_%s'%amb].keys()):

            # load rotations necessary for image
            image_rotations = rotation['novel'][amb][ i_file ]
            # load image
            i_image = np.array(Image.open(os.path.join(stimulus_directory, i_folder, i_file)))

            ##### begin kmeans segmentation protocol #####
            # binarize layer of image
            io = i_image[:,:,1] != 255
            # collect the coordinates of every non-blank pixel in the binarized image
            points = [[i,j] for i in  range(io.shape[0]) for j in range(io.shape[1]) if io[i,j]]
            # cluster non-blank locations to determine object center of masses
            k_means.fit(points)
            # determine mapping between cluster order and image order
            order = determine_order_of_clusters(k_means.cluster_centers_, i_image)
            ##### end segmentation protocol #######

            # iterate over all objects in image
            i_slide = []
            for i_segmented_object in range(len(order)):
                
                # identify centroids of clusters 
                x, y = k_means.cluster_centers_[order[i_segmented_object],:]
                # select an area around the center of mass defined above
                i_object =  i_image[int(x-d):int(x+d), int(y-d):int(y+d)]
                # rotate the selected area into its canonical orientation
                i_object_rotated = np.rot90(i_object, k=image_rotations[i_segmented_object])
                # add to list
                i_slide.append(i_object_rotated)

            # resize images
            i_slide = [imresize(i_slide[i],(224, 224)) for i in range(len(i_slide))]
            # append to ambiguity type
            novel_images[amb][i_number] = i_slide

    return novel_images
Example #25
    correct = (mapped_preds == targets).sum()
    total = len(targets)
    acc = correct / (total + eps)

    cm = metrics.confusion_matrix(targets, mapped_preds)
    return loss, acc, cm


df = pd.read_csv("ATNTFaceImages.txt", header=None)
df_matrix = df.to_numpy()
inputs = df_matrix[1:].T
targets = df_matrix[0].T

k = 40
model = cluster.KMeans(k)
model.fit(inputs)
loss, acc, cm = kmeans_metric(model, inputs, targets)
print(f"Loss: {loss:.5f} Acc: {acc:.5f}")
print("confusion matrix:")
print(cm)

df = pd.read_csv("HandWrittenLetters.txt", header=None)
df_matrix = df.to_numpy()
inputs = df_matrix[1:].T
targets = df_matrix[0].T

k = 26
model = cluster.KMeans(k)
model.fit(inputs)
loss, acc, cm = kmeans_metric(model, inputs, targets)
Example #26
def clasterize_n(data, n):
    kmean = cluster.KMeans(n_clusters=n, init='k-means++', random_state=241)
    X = preprocessing.scale(list(map(lambda el: el['time'], data)))
    kmean.fit(X)
    return kmean.labels_
Example #27
        df.loc[county, 'MHI'] = liquor.loc[county, 'Median_Household_Income']
    return df

# Sets a custom color palette for clusters
colors = ['r','b','y','g','c','m']
def getColors(labels):
    return [colors[l] for l in labels]


# In[251]:


# Setup kmeans
klean = lean[['VODKA', 'WHISKEY']] 
kaveraged = averaged[['VODKA', 'WHISKEY']] 
kmeans = sk.KMeans(n_clusters=3, random_state=0).fit(klean)

# Plot
fig,ax = plt.subplots(1)
plt.scatter(klean['VODKA'], klean['WHISKEY'], color=getColors(kmeans.labels_), label=kmeans.labels_)
plt.title('Vodka and Whisky Percentage of Sales - Median by County, Filtered, 3 Clusters')
plt.xlabel('Vodka Percentage of Sales - Median')
plt.ylabel('Whisky Percentage of Sales - Median')
# Add rectangle
rect = patches.Rectangle((-0.02,-0.02), 0.04, 0.04, linewidth=1, edgecolor='r', facecolor='none')
ax.add_patch(rect)
plt.show()
fig.savefig('plots/Lean3Clusters.png')


# In[252]:
Example #28
def get_kmeans(clusters):
    return cluster.KMeans(n_clusters=clusters,
                          init='k-means++',
                          random_state=241,
                          tol=0.1)
Example #29
def estim_class_model(features,
                      nb_classes,
                      estim_model='GMM',
                      pca_coef=None,
                      use_scaler=True,
                      max_iter=99):
    """ create pipeline (scaler, PCA, model) over several options how
    to cluster samples and fit it on data

    :param ndarray features:
    :param int nb_classes: number of expected classes
    :param float pca_coef: range (0, 1) or None
    :param bool use_scaler: whether use a scaler
    :param str estim_model: used model
    :param int max_iter:
    :return:

    >>> np.random.seed(0)
    >>> fts = np.row_stack([np.random.random((50, 3)) - 1,
    ...                     np.random.random((50, 3)) + 1])
    >>> mm = estim_class_model(fts, 2)
    >>> mm.predict_proba(fts).shape
    (100, 2)
    >>> mm = estim_class_model(fts, 2, estim_model='GMM_kmeans',
    ...                         pca_coef=0.95, max_iter=3)
    >>> mm.predict_proba(fts).shape
    (100, 2)
    >>> mm = estim_class_model(fts, 2, estim_model='GMM_Otsu', max_iter=3)
    >>> mm.predict_proba(fts).shape
    (100, 2)
    >>> mm = estim_class_model(fts, 2, estim_model='kmeans_quantiles',
    ...                         use_scaler=False, max_iter=3)
    >>> mm.predict_proba(fts).shape
    (100, 2)
    >>> mm = estim_class_model(fts, 2, estim_model='BGM', max_iter=3)
    >>> mm.predict_proba(fts).shape
    (100, 2)
    >>> mm = estim_class_model(fts, 2, estim_model='Otsu', max_iter=3)
    >>> mm.predict_proba(fts).shape
    (100, 2)
    """
    components = []
    if use_scaler:
        components += [('std_scaler', preprocessing.StandardScaler())]
    if pca_coef is not None:
        components += [('reduce_dim', decomposition.PCA(pca_coef))]

    nb_inits = max(1, int(np.sqrt(max_iter)))
    # http://scikit-learn.org/stable/modules/generated/sklearn.mixture.GMM.html
    mm = mixture.GaussianMixture(n_components=nb_classes,
                                 covariance_type='full',
                                 n_init=nb_inits,
                                 max_iter=max_iter)

    # split the model name and the initialisation to use
    if '_' in estim_model:
        init_type = estim_model.split('_')[-1]
        estim_model = estim_model.split('_')[0]
    else:
        init_type = ''

    y = None
    if estim_model == 'GMM':
        # model = estim_class_model_gmm(features, nb_classes)
        if init_type == 'kmeans':
            mm.set_params(n_init=1)
            # http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
            kmeans = cluster.KMeans(n_clusters=nb_classes,
                                    init='k-means++',
                                    n_jobs=-1)
            y = kmeans.fit_predict(features)
        elif init_type == 'Otsu':
            mm.set_params(n_init=1)
            y = compute_multivarian_otsu(features)

    elif estim_model == 'kmeans':
        # http://scikit-learn.org/stable/modules/generated/sklearn.mixture.GMM.html
        mm.set_params(max_iter=1)
        init_type = 'quantiles' if init_type == 'quantiles' else 'k-means++'
        _, y = estim_class_model_kmeans(features,
                                        nb_classes,
                                        init_type=init_type,
                                        max_iter=max_iter)

        logging.info('computing the probability of each sample for every component')

    elif estim_model == 'BGM':
        mm = mixture.BayesianGaussianMixture(n_components=nb_classes,
                                             covariance_type='full',
                                             n_init=nb_inits,
                                             max_iter=max_iter)

    elif estim_model == 'Otsu' and nb_classes == 2:
        mm.set_params(max_iter=1, n_init=1)
        y = compute_multivarian_otsu(features)

    components += [('model', mm)]
    # compose the pipeline
    model = pipeline.Pipeline(components)

    if y is not None:
        # fit with examples
        model.fit(features, y)
    else:
        # fit from scratch
        model.fit(features)
    return model
Example #30
def cluster_weights(weights, n_clusters):
    from sklearn import cluster
    kmeans = cluster.KMeans(n_clusters=n_clusters).fit(weights.reshape(
        (-1, 1)))
    return kmeans.labels_.reshape(weights.shape), np.around(
        kmeans.cluster_centers_).astype(np.int32)
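
# Hypothetical usage sketch (not part of the original snippet): quantize an
# integer weight matrix to 4 shared values and rebuild it from the rounded
# cluster centers returned above.
import numpy as np
weights = np.random.randint(-128, 128, size=(8, 8))
labels, centers = cluster_weights(weights, n_clusters=4)
quantized = centers.squeeze(-1)[labels]  # each weight replaced by its cluster's center
print(quantized.shape, np.unique(quantized))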