Example #1
def find_optimalCluster(self):  # Find the optimal number of clusters
    # Iterate over k to collect the clustering cost
    cost = {}
    for k in range(2, 10):
        kproto = KPrototypes(n_clusters=k, random_state=75)
        kproto.fit_predict(self.df_model, categorical=[0, 1, 2])
        cost[k] = kproto.cost_

    # Visualize the elbow plot
    sns.pointplot(x=list(cost.keys()), y=list(cost.values()))
    plt.show()
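Example #1 is a class method; it assumes self.df_model (first three columns categorical, hence categorical=[0, 1, 2]) and the imports below, shown here as a sketch of the context the snippet leaves out:

import seaborn as sns
import matplotlib.pyplot as plt
from kmodes.kprototypes import KPrototypes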
Example #2
File: main.py Project: dgcnz/k-all
def exec_kprototypes(df, choices_obj):

    print("Whitening data...", end='', flush=True)
    for header in choices_obj['numerical']:
        df[header + "_scaled"] = whiten(df[header])
    print("Done.")
    nums_scaled = [header + "_scaled" for header in choices_obj['numerical']]
    cats_not_scaled = [header for header in choices_obj['categorical']]

    X = pd.concat(
        [df[nums_scaled].astype(float), df[cats_not_scaled].astype(str)],
        axis=1)
    k = int(input("Number of clusters:\n > "))
    kproto = KPrototypes(n_clusters=k, init='Cao', verbose=2)

    df['cluster_labels'] = kproto.fit_predict(
        X.values,
        categorical=list(
            range(len(X.columns) - len(cats_not_scaled), len(X.columns))))
    if (len(nums_scaled) >= 2):
        # Plot clusters
        print("Only showing 2 dimensions of data (picking first two headers)")
        sns.scatterplot(x=nums_scaled[0],
                        y=nums_scaled[1],
                        hue='cluster_labels',
                        data=df)
        plt.show()
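Example #2 reads its column choices from choices_obj, which (judging from the function body) maps 'numerical' and 'categorical' to lists of column headers, and relies on whiten, presumably scipy.cluster.vq.whiten. A hypothetical call might look like this (file and column names are illustrative):

import pandas as pd
from scipy.cluster.vq import whiten  # assumed source of whiten

df = pd.read_csv('data.csv')  # hypothetical input file
choices_obj = {
    'numerical': ['age', 'income'],
    'categorical': ['gender', 'region'],
}
exec_kprototypes(df, choices_obj)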
Example #3
def kprototypesCluster(features: np.ndarray, catCols: list, nClust: int):

    # Continuous feature columns are assumed to already be cast to float
    model = KPrototypes(n_clusters=nClust, verbose=2)
    clusters = model.fit_predict(features, categorical=catCols)

    return model
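The comment in Example #3 mentions casting continuous features to float, but the body leaves that to the caller; a sketch of the assumed preparation (file name and column indices are illustrative):

import numpy as np

features = np.genfromtxt('data.csv', dtype=object, delimiter=',')  # hypothetical file
for c in [1, 2]:  # illustrative indices of the continuous columns
    features[:, c] = features[:, c].astype(float)
model = kprototypesCluster(features, catCols=[0], nClust=3)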
Example #4
    def kproto(self, K=20, N=int(1e5), MN=4, T=10, type='cao', save=True):
        data = self.to_numpy()

        M = data.shape[1]
        # MN = 22
        if type == 'huang':
            model = KPrototypes(n_clusters=K,
                                init='Huang',
                                n_init=1,
                                verbose=1)
        elif type == 'cao':
            model = KPrototypes(n_clusters=K,
                                init='Cao',
                                verbose=2,
                                max_iter=10000)
        else:
            raise ValueError("type must be 'huang' or 'cao'")

        clusters = model.fit_predict(
            data,
            categorical=[0, 3, 6, 8] if self.cl_type == 'prop' else [
                0, 2, 3, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
                24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35
            ])
        if save:
            self.save(model, 'Clustering_kproto_model')
        return np.array(model.cluster_centroids_[0]), np.array(
            model.cluster_centroids_[1]), np.array(clusters)
Example #5
    def predict(self):
        with open(self.data_processed, 'rb') as f:
            self.dataset = pickle.load(f)
        with open(self.label_file, 'rb') as f:
            self.label = pickle.load(f)

        # self.y_pred = KMeans(n_clusters=5, random_state=9).fit_predict(self.dataset)
        # np.savetxt(self.cluster_result, np.hstack(self.y_pred, self.dataset) , delimiter=',')
        # score = metrics.calinski_harabaz_score(self.dataset, self.y_pred)
        # print(score)
        kproto = KPrototypes(n_clusters=5, init='Cao', verbose=2)
        clusters = kproto.fit_predict(self.dataset, categorical=[1])

        temp = np.loadtxt(fname=self.data_cleaned, dtype=object, delimiter=',')
        room_identity = temp[1:, :3]

        self.result = np.column_stack((room_identity, self.dataset, clusters))

        print(kproto.cluster_centroids_)
        # Print training statistics
        print(kproto.cost_)
        print(kproto.n_iter_)

        with open(self.result_binary, 'wb') as f:
            pickle.dump(self.result, f)
        with open('kproto_res', 'wb') as f:
            pickle.dump(kproto, f)

        with open(self.cluster_result, 'w') as f:
            re = self.result.tolist()
            for line in re:
                f.write("\t".join(list(map(str, line))) + '\n')

        for s, c in zip(self.label, clusters):
            print("Room identity: {}, cluster:{}".format(s, c))
Example #6
def kprototypes_cluster(df, n_clusters, category, hover_text):
    datadf = df.loc[:, df.columns != hover_text]
    kmodes_instance = KPrototypes(n_clusters=n_clusters, init='Cao', verbose=2)
    clusters = kmodes_instance.fit_predict(datadf, categorical=category)
    data_array = np.array(datadf.to_numpy().tolist())
    col_len = len(datadf.columns)
    if (col_len == 2):
        clus = scat2d(data_array, clusters, hover_text, df)
        return clus
    else:
        clus = scat3d(data_array, clusters, hover_text, df)
        return clus
Example #7
def KPrototypes_cluster(input_data, k_clusters):
    from kmodes.kprototypes import KPrototypes
    # normalize the numerical columns
    normalized = preprocessing.StandardScaler()
    input_data[input_data.select_dtypes(
        include=['float', 'integer']).columns] = normalized.fit_transform(
            input_data[input_data.select_dtypes(
                include=['float', 'integer']).columns])
    input_data = input_data.to_numpy()  # .as_matrix() was removed in pandas 1.0

    kproto = KPrototypes(n_clusters=k_clusters, init='Cao', verbose=2)
    clus_kmeans_fit = kproto.fit_predict(input_data,
                                         categorical=[0, 1, 2, 3, 4, 5, 6, 7])
    return (clus_kmeans_fit)
Example #8
def plot_costs(X, min_k, max_k):
    """Plots sse for values of k between min_k and max_k
    Args:
    - X - feature matrix
    - min_k, max_k - smallest and largest k to plot sse for
    return: list of costs
    """
    k_values = range(min_k, max_k + 1)
    costs = []
    for k in k_values:
        kp = KPrototypes(n_clusters=k,
                         init='Cao',
                         n_init=22,
                         verbose=0,
                         random_state=4,
                         n_jobs=4)
        kp.fit_predict(X, categorical=[1, 2, 3])
        costs.append(kp.cost_)
    plt.plot(k_values, costs)
    plt.xlabel('k')
    plt.ylabel('costs')
    plt.savefig("../image/kprototype_costs.png")  # save before show(), which clears the figure
    plt.show()
    return costs
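A hypothetical call, assuming a feature matrix X whose columns 1-3 are categorical (matching the hard-coded categorical=[1, 2, 3]):

costs = plot_costs(X, min_k=2, max_k=8)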
Example #9
File: exp.py Project: pzhang101/nopticon
def cluster(summ, agg_classes=None):
    """
    Clusters summary info using DBSCAN if agg_classes is provided it uses K-Prototypes
    """
    all_prop = None
    prop = {}
    ranks = []
    for flow, edge in summ.get_flowedges():
        rank = round(summ.get_edge_rank(flow, edge), 2)
        if rank < 0.5:
            continue
        # else:
        # print(flow, edge, rank)

        policy = nopticon.ReachabilityPolicy({
            'flow': flow,
            'source': edge[0],
            'target': edge[1]
        })
        prop[policy] = len(ranks)
        if agg_classes is not None:
            ranks.append([rank, agg_classes[edge[0]], agg_classes[edge[1]]])
        else:
            ranks.append([rank])

    if agg_classes is not None:
        kproto = KPrototypes(n_clusters=3, init='Huang')
        # np.matrix is deprecated; a plain 2-D array works here
        clust = kproto.fit_predict(np.asarray(ranks), categorical=[1, 2])
    else:
        agg = KMeans(n_clusters=2, n_jobs=2)
        clust = agg.fit(ranks).labels_

    assert len(clust) == len(ranks)
    means = {}
    high = None
    for k in set(clust):
        kranks = [ranks[idx][0] for idx in prop.values() if clust[idx] == k]
        means[k] = sum(kranks) / len(kranks)
        if high is None or means[k] > means[high]:
            high = k

    for p, idx in prop.items():
        if clust[idx] == high:
            # print("\tHIGH:", ranks[idx], p)
            summ.mark_cluster_accepted(p.flow(), p.edge())
        # else:
        # print("\tlow:", ranks[idx], p)
    return
Example #10
def kprototypes_compute_metrics_for_every_cluster_number(
        clusters_range_lower_bound,
        clusters_range_upper_bound,
        dataset,
        distance_algorithm,
        init_Cao_or_Huang_for_kprototypes,
        list_categorical_features_indices_for_kprototypes,
        print_optimum_metrics=True):

    kprototypes_list_metrics = []

    for num_of_clusters in range(clusters_range_lower_bound,
                                 clusters_range_upper_bound):

        kprototypes = KPrototypes(n_clusters=int(num_of_clusters),
                                  init=str(init_Cao_or_Huang_for_kprototypes),
                                  n_init=50,
                                  verbose=0)
        predictions = kprototypes.fit_predict(
            dataset,
            categorical=list_categorical_features_indices_for_kprototypes)
        centers = kprototypes.cluster_centroids_
        cost_function = kprototypes.cost_
        num_iterations = kprototypes.n_iter_  # iterations run, not parallel jobs
        error_metric = cost_function
        silhouette = silhouette_score(dataset, predictions,
                                      metric=distance_algorithm)

        kprototypes_list_metrics.append({
            'clusters': num_of_clusters,
            'silhouette': silhouette,
            'error': error_metric,
            'num_iterations': num_iterations
        })
        if print_optimum_metrics is True:
            print(
                "For n_clusters = {}, silhouette score is {}, cluster error is {}, "
                "n_iter is {}".format(num_of_clusters, silhouette, error_metric,
                                      num_iterations))

    return kprototypes_list_metrics
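Since each entry of the returned list is a dict keyed by 'clusters', 'silhouette', 'error', and 'num_iterations', a caller could pick the cluster count with the highest silhouette score, e.g. (arguments are illustrative, and dataset is assumed to be defined):

metrics_list = kprototypes_compute_metrics_for_every_cluster_number(
    2, 10, dataset, 'euclidean', 'Cao', [0, 1])
best = max(metrics_list, key=lambda m: m['silhouette'])
print(best['clusters'], best['silhouette'])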
Example #11
    def agruparDados(self, file):

        style.use("ggplot")
        caminho = 'C:/Users/Teste/Desktop/10 semestre/tcc2/Arquivos de Logs/Arquivos de Logs/Ameaças/Novos/trainThreats.csv'
        colors = [
            'b', 'orange', 'g', 'r', 'c', 'm', 'y', 'k', 'Brown', 'ForestGreen'
        ]
        # Data points with their publisher name, category score, category name, place name
        #category = np.genfromtxt(caminho, dtype=str, delimiter=',', skip_header=1)[:, 9]  # category
        #severity = np.genfromtxt(caminho, dtype=str, delimiter=',', skip_header=1)[:, 8]  # severity
        X = np.genfromtxt(caminho, dtype=object, delimiter=',',
                          skip_header=1)[:, 1:]

        kproto = KPrototypes(n_clusters=4, init='Cao', verbose=2)
        clusters = kproto.fit_predict(
            X, categorical=[0, 1, 2, 3, 4, 7, 8, 9, 10, 11, 12, 13, 14])

        file['Clusters'] = clusters

        # Print cluster centroids of the trained model.
        print(kproto.cluster_centroids_)
        # Print training statistics
        print(kproto.cost_)
        print(kproto.n_iter_)
        print(kproto.gamma)
        '''plt.scatter(X[clusters == 0, 8], X[clusters == 0, 9], c='purple', alpha=0.5, s=150,  label='Cluster 0')
        plt.scatter(X[clusters == 1, 8], X[clusters == 1, 9], c='black', alpha=0.5, s=150,  label='Cluster 1')
        plt.scatter(X[clusters == 2, 8], X[clusters == 2, 9], c='red', alpha=0.5, s=150,  label='Cluster 2')
        plt.scatter(X[clusters == 3, 8], X[clusters == 3, 9], c='green', alpha=0.5, s=150,  label='Cluster 3')
        plt.scatter(X[clusters == 4, 8], X[clusters == 4, 9], c='blue', alpha=0.5, s=100, label='Cluster 4')
        plt.scatter(X[clusters == 5, 8], X[clusters == 5, 9], c='yellow', alpha=0.5, s=100, label='Cluster 5')
        plt.xlabel('Severity')
        plt.ylabel('Category')
        plt.legend()
        plt.show()'''

        self.lerXML(file)
Example #12
 
X = df.iloc[:, 1:5]
X.columns = ['a','b','c','d']
X.head()
 
min_max_scaler = preprocessing.MinMaxScaler() 
bcd = X.iloc[:,1:4]
x_scaled = min_max_scaler.fit_transform(bcd)
X_scaled = pd.DataFrame(x_scaled,columns=bcd.columns)
X = pd.concat([df['a'],X_scaled], axis=1)
 
X_matrix = X.values
cost = []
for num_clusters in list(range(1,5)):
    kproto = KPrototypes(n_clusters=num_clusters, init='Cao')
    kproto.fit_predict(X_matrix, categorical=[0])
    cost.append(kproto.cost_)
    
plt.plot(range(1, 5), cost)  # x-axis: number of clusters
pd.DataFrame(cost)
 
kproto = KPrototypes(n_clusters=1, init='Cao')
clusters = kproto.fit_predict(X_matrix, categorical=[0])
print('====== Centroids ======')
print(kproto.cluster_centroids_)
print()
print('====== Cost ======')
print(kproto.cost_)
 
centroids = pd.concat([pd.DataFrame(kproto.cluster_centroids_[1]),pd.DataFrame(kproto.cluster_centroids_[0])], axis=1)
centroids
Example #13
s = (nc_data_2015.dtypes == 'object')
object_cols = list(s[s].index)
# Append the boolean features:
object_cols.extend(['search_conducted', 'contraband_found', 'is_arrested', 'drugs_related_stop'])
print(object_cols)

# Create a data frame copy to label encode all categorical features:
nc_encoded = nc_data_2015.copy()
label_encoder = LabelEncoder()
for col in object_cols:
    nc_encoded[col] = label_encoder.fit_transform(nc_encoded[col])
    
# Create a subset using only age, gender, and race and use it to perform KPrototypes clustering:
X = nc_encoded.iloc[:, [1,2,3]]
kp = KPrototypes(n_clusters=3, init="Cao", n_init=1, verbose=1)
cluster_labels = kp.fit_predict(X, categorical=[0, 2])
X['Cluster'] = cluster_labels

# Create a count plot showing clusters related to feature values:
plt.figure(figsize=(16,10))
sns.countplot(x='Cluster', hue='driver_race_raw', data=X)
plt.legend(title='Driver Race', labels=['Asian', 'Black Hispanic', 'Black', 'Other', 'Unknown Hispanic', 'Unknown Non-Hispanic', 'White Hispanic', 'White'])
plt.xlabel("Cluster")
plt.ylabel("Count")
plt.title("Driver Race Count Per Cluster")
plt.show()

# Create a count plot showing clusters related to feature values:
plt.figure(figsize=(16,10))
sns.countplot(x='Cluster', hue='driver_gender', data=X, palette='mako')
plt.legend(title='Driver Gender', labels=['Female', 'Male'])
Example #14
import numpy as np
import pandas as pd
from kmodes.kprototypes import KPrototypes
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns


# standardizing data
columns_to_normalize = ['RFM_Score', 'Age']
rfm_encoded[columns_to_normalize] = rfm_encoded[columns_to_normalize].apply(lambda x: (x - x.mean()) / np.std(x))

matrix = rfm_encoded.to_numpy()  # .as_matrix() was removed in pandas 1.0

# Running K-Prototype clustering
kproto = KPrototypes(n_clusters=3, init='Cao')
clusters = kproto.fit_predict(matrix, categorical=[2])

print(kproto.cluster_centroids_)
print(kproto.cost_)

rfm_encoded['cluster_id'] = clusters

# add cluster_id column to rfm data frame for better understanding
rfm_table['cluster_number']=rfm_encoded['cluster_id'].values

#Checking cluster count
cluster_count = pd.DataFrame(rfm_encoded['cluster_id'].value_counts())
print(cluster_count)

sns.barplot(x=cluster_count.index, y=cluster_count['cluster_id'])
Example #15
#colunas_cat_ = ["grau","NATUREZA","Porte","Estratificacao","RM_OU_RIDE","Atuacao_Vara"]
colunas_cat_ = ["Porte", "Estratificacao", "RM_OU_RIDE", "Atuacao_Vara"]
#colunas_cat = [0, 1, 3, 4, 5, 6]
colunas_cat = [1, 2, 3, 4]

dados = dados.filter(items=colunas)
dados['duracao_dias'] = pd.to_numeric(dados['duracao_dias'])
print(dados.head(10))
print(dados.dtypes)

#km = KModes(n_clusters=6, init='Huang', n_init=10, verbose=1)

km = KPrototypes(n_clusters=6, init='Huang', n_init=2, verbose=1)

print(dados.shape[1])

dados_temp = dados[colunas_cat_]

print(dados_temp)

clusters = km.fit_predict(dados, categorical=colunas_cat)

#clusters = km.fit_predict(dados)

# Print the cluster centroids
print('Centroids')
print(km.cluster_centroids_)

print('Clusters')
print(clusters)
Example #16
#!/usr/bin/env python

import numpy as np
from kmodes.kprototypes import KPrototypes

# stocks with their market caps, sectors and countries
syms = np.genfromtxt('stocks.csv', dtype=str, delimiter=',')[:, 0]
X = np.genfromtxt('stocks.csv', dtype=object, delimiter=',')[:, 1:]
X[:, 0] = X[:, 0].astype(float)

kproto = KPrototypes(n_clusters=4, init='Cao', verbose=2)
clusters = kproto.fit_predict(X, categorical=[1, 2])

# Print cluster centroids of the trained model.
print(kproto.cluster_centroids_)
# Print training statistics
print(kproto.cost_)
print(kproto.n_iter_)

for s, c in zip(syms, clusters):
    print(f"Symbol: {s}, cluster:{c}")
Example #17
#!/usr/bin/env python
import numpy as np
from kmodes.kprototypes import KPrototypes
import pandas as pd

# stocks with their market caps, sectors and countries
syms = np.genfromtxt('stocks.csv', dtype=str, delimiter=',')[:, 0]
X = np.genfromtxt('stocks.csv', dtype=object, delimiter=',')[:, 1:]

X[:, 0] = X[:, 0].astype(float)
kproto = KPrototypes(n_clusters=3, init='Cao', verbose=8)
clusters = kproto.fit_predict(
    X, categorical=[1, 2])  #TC: define categorical variables here

# Print cluster centroids of the trained model.
print("\nCluster centroid")
print(kproto.cluster_centroids_)
# Print training statistics
print("\nCost")
print(kproto.cost_)
print("\nNumber of iterations")
print(kproto.n_iter_)
"""for s, c in zip(syms, clusters):
    print("Symbol: {}, cluster:{}".format(s, c))"""

print("\nClustering result")
df = pd.DataFrame(zip(syms, clusters))
df.columns = ["Symbol", "Cluster"]
print(df)
Example #18
import pandas as pd
import numpy as np
from kmodes.kprototypes import KPrototypes
import sys

df = pd.read_csv('dataset.txt', sep=";")

df_array = df.values

df_array[:, 2] = df_array[:, 2].astype(float)

kproto = KPrototypes(n_clusters=3, verbose=2, max_iter=20)
clusters = kproto.fit_predict(df_array, categorical=[0, 1, 3, 4])

df['cluster'] = clusters  # fit_predict already returns one label per row

c0 = df[df['cluster'] == 0]
c1 = df[df['cluster'] == 1]
c2 = df[df['cluster'] == 2]

# c0 = df[df['cluster']== 0].applymap(lambda s:s.lower() if type(s) == str else s)
# c1 = df[df['cluster']== 1].applymap(lambda s:s.lower() if type(s) == str else s)
# c2 = df[df['cluster']== 2].applymap(lambda s:s.lower() if type(s) == str else s)
# # print(c0['cost range'])
# # print(c1,c2,c0)
# # print("In py")
Example #19
def cluster_clients(k=None,
                    save_centroids=True,
                    save_clusters=True,
                    explain_centroids=True):
    '''
    Runs k-prototype clustering algorithm on preprocessed dataset
    :param k: Desired number of clusters
    :param save_centroids: Boolean indicating whether to save cluster centroids
    :param save_clusters: Boolean indicating whether to save client cluster assignments
    :param explain_centroids: Boolean indicating whether to compute LIME explanations for cluster centroids
    :return: A KPrototypes object that describes the best clustering of all the runs
    '''
    cfg = yaml.full_load(open(os.getcwd() + "/config.yml", 'r'))

    # Load preprocessed client data
    try:
        df = pd.read_csv(cfg['PATHS']['PROCESSED_DATA'])
    except FileNotFoundError:
        print("No file found at " + cfg['PATHS']['PROCESSED_DATA'] +
              ". Run preprocessing script before running this script.")
        return
    client_ids = df.pop('ClientID').tolist()
    if cfg['TRAIN']['MODEL_DEF'] == 'hifis_rnn_mlp':
        dates = df.pop('Date').tolist()
    df.drop('GroundTruth', axis=1, inplace=True)
    X = np.array(df)

    # Load feature info
    try:
        data_info = yaml.full_load(open(cfg['PATHS']['DATA_INFO'], 'r'))
    except FileNotFoundError:
        print("No file found at " + cfg['PATHS']['DATA_INFO'] +
              ". Run preprocessing script before running this script.")
        return

    # Get list of categorical feature indices
    noncat_feat_idxs = [
        df.columns.get_loc(c) for c in data_info['NON_CAT_FEATURES'] if c in df
    ]
    cat_feat_idxs = [
        i for i in range(len(df.columns)) if i not in noncat_feat_idxs
    ]

    # Normalize noncategorical features
    X_noncat = X[:, noncat_feat_idxs]
    std_scaler = StandardScaler().fit(X_noncat)
    X_noncat = std_scaler.transform(X_noncat)
    X[:, noncat_feat_idxs] = X_noncat

    # Run k-prototypes algorithm on all clients and obtain cluster assignment (range [1, K]) for each client
    if k is None:
        k = cfg['K-PROTOTYPES']['K']
    k_prototypes = KPrototypes(n_clusters=k,
                               verbose=1,
                               n_init=cfg['K-PROTOTYPES']['N_RUNS'],
                               n_jobs=cfg['K-PROTOTYPES']['N_JOBS'],
                               init='Cao',
                               num_dissim=euclidean_dissim,
                               cat_dissim=matching_dissim)
    client_clusters = k_prototypes.fit_predict(X, categorical=cat_feat_idxs)
    k_prototypes.samples = X
    k_prototypes.labels = client_clusters
    k_prototypes.dist = lambda x0, x1: \
        k_prototypes.num_dissim(np.expand_dims(x0[noncat_feat_idxs], axis=0), np.expand_dims(x1[noncat_feat_idxs], axis=0)) + \
            k_prototypes.gamma * k_prototypes.cat_dissim(np.expand_dims(x0[cat_feat_idxs], axis=0), np.expand_dims(x1[cat_feat_idxs], axis=0))
    client_clusters += 1  # Enforce that cluster labels are integer range of [1, K]
    if cfg['TRAIN']['MODEL_DEF'] == 'hifis_rnn_mlp':
        clusters_df = pd.DataFrame({
            'ClientID': client_ids,
            'Date': dates,
            'Cluster Membership': client_clusters
        })
        clusters_df.set_index(['ClientID', 'Date'])
    else:
        clusters_df = pd.DataFrame({
            'ClientID': client_ids,
            'Cluster Membership': client_clusters
        })
        clusters_df.set_index('ClientID')

    # Get centroids of clusters
    cluster_centroids = np.zeros((k_prototypes.cluster_centroids_[0].shape[0],
                                  k_prototypes.cluster_centroids_[0].shape[1] +
                                  k_prototypes.cluster_centroids_[1].shape[1]))
    cluster_centroids[:, noncat_feat_idxs] = k_prototypes.cluster_centroids_[
        0]  # Numerical features
    cluster_centroids[:, cat_feat_idxs] = k_prototypes.cluster_centroids_[
        1]  # Categorical features
    #cluster_centroids = np.concatenate((k_prototypes.cluster_centroids_[0], k_prototypes.cluster_centroids_[1]), axis=1)

    # Scale noncategorical features of the centroids back to original range
    centroid_noncat_feats = cluster_centroids[:, noncat_feat_idxs]
    centroid_noncat_feats = std_scaler.inverse_transform(centroid_noncat_feats)
    cluster_centroids[:, noncat_feat_idxs] = centroid_noncat_feats

    # Create a DataFrame of cluster centroids
    cluster_centroids = np.rint(
        cluster_centroids)  # Round centroids to nearest int
    centroids_df = pd.DataFrame(cluster_centroids, columns=list(df.columns))
    for i in range(len(data_info['SV_CAT_FEATURE_IDXS'])):
        idx = data_info['SV_CAT_FEATURE_IDXS'][i]
        ordinal_encoded_vals = cluster_centroids[:, idx].astype(int)
        original_vals = [
            data_info['SV_CAT_VALUES'][idx][v] for v in ordinal_encoded_vals
        ]
        centroids_df[data_info['SV_CAT_FEATURES'][i]] = original_vals
    cluster_num_series = pd.Series(np.arange(1,
                                             cluster_centroids.shape[0] + 1))
    centroids_df.insert(0, 'Cluster', cluster_num_series)

    # Get fraction of clients in each cluster
    cluster_freqs = np.bincount(client_clusters) / float(
        client_clusters.shape[0])
    centroids_df.insert(1, '% of Clients', cluster_freqs[1:] * 100)

    # Load objects necessary for prediction and explanations
    try:
        scaler_ct = load(cfg['PATHS']['SCALER_COL_TRANSFORMER'])
        ohe_ct_sv = load(cfg['PATHS']['OHE_COL_TRANSFORMER_SV'])
        explainer = dill.load(open(cfg['PATHS']['LIME_EXPLAINER'], 'rb'))
        model = load_model(cfg['PATHS']['MODEL_TO_LOAD'], compile=False)
    except FileNotFoundError as not_found_err:
        print(
            'File "' + not_found_err.filename +
            '" was not found. Ensure you have trained a model and run LIME before running this script.'
        )
        return

    # Add model's prediction of centroids (classes and prediction probabilities) to the DataFrame
    predicted_classes = []
    prediction_probs = []
    print("Obtaining model's predictions for cluster centroids.")
    for i in tqdm(range(len(cluster_centroids))):
        x = np.expand_dims(cluster_centroids[i], axis=0)
        y = np.squeeze(predict_instance(x, model, ohe_ct_sv, scaler_ct).T,
                       axis=1)  # Predict centroid
        prediction = 1 if y[1] >= cfg['PREDICTION'][
            'THRESHOLD'] else 0  # Model's classification
        predicted_class = cfg['PREDICTION']['CLASS_NAMES'][prediction]
        predicted_classes.append(predicted_class)
        prediction_probs.append(y[1] * 100)  # Include as a percentage
    centroids_df.insert(centroids_df.shape[1],
                        'At risk of chronic homelessness',
                        pd.Series(predicted_classes))
    centroids_df.insert(centroids_df.shape[1],
                        'Probability of chronic homelessness [%]',
                        pd.Series(prediction_probs))

    # Predict and explain the cluster centroids
    if explain_centroids:
        model_def = cfg['TRAIN']['MODEL_DEF'].upper()
        NUM_SAMPLES = cfg['LIME'][model_def]['NUM_SAMPLES']
        NUM_FEATURES = cfg['LIME'][model_def]['NUM_FEATURES']
        exp_rows = []
        explanations = []
        print('Creating explanations for cluster centroids.')
        for i in tqdm(range(cluster_centroids.shape[0])):
            row = []
            exp = predict_and_explain(cluster_centroids[i], model, explainer,
                                      ohe_ct_sv, scaler_ct, NUM_FEATURES,
                                      NUM_SAMPLES)
            explanations.append(exp)
            exp_tuples = exp.as_list()
            for exp_tuple in exp_tuples:
                row.extend(list(exp_tuple))
            if len(exp_tuples) < NUM_FEATURES:
                row.extend([''] * (2 * (NUM_FEATURES - len(exp_tuples)))
                           )  # Fill with empty space if explanation too small
            exp_rows.append(row)
        exp_col_names = []
        for i in range(NUM_FEATURES):
            exp_col_names.extend(
                ['Explanation ' + str(i + 1), 'Weight ' + str(i + 1)])
        exp_df = pd.DataFrame(exp_rows, columns=exp_col_names)
        centroids_df = pd.concat(
            [centroids_df, exp_df], axis=1,
            sort=False)  # Concatenate client features and explanations

        # Visualize clusters' LIME explanations
        predictions = centroids_df[[
            'At risk of chronic homelessness',
            'Probability of chronic homelessness [%]'
        ]].to_numpy()
        visualize_cluster_explanations(
            explanations, predictions, cluster_freqs,
            'Explanations for k-prototypes clusters',
            cfg['PATHS']['IMAGES'] + 'centroid_explanations_')

    # Save centroid features and explanations to spreadsheet
    if save_centroids:
        centroids_df.to_csv(cfg['PATHS']['K-PROTOTYPES_CENTROIDS'] +
                            datetime.now().strftime("%Y%m%d-%H%M%S") + '.csv',
                            index_label=False,
                            index=False)

    if save_clusters:
        clusters_df.to_csv(cfg['PATHS']['K-PROTOTYPES_CLUSTERS'] +
                           datetime.now().strftime("%Y%m%d-%H%M%S") + '.csv',
                           index_label=False,
                           index=False)
    return k_prototypes
Example #20
# load the data
md_df = pd.read_csv('markdown_group.csv')
no_md_df = pd.read_csv('no_markdown_group.csv')

# clear the first two columns in both groups
md_df = md_df.drop(['Unnamed: 0', 'nb_id'], axis=1)
no_md_df = no_md_df.drop(['Unnamed: 0', 'nb_id'], axis=1)

# perform k-prototypes clustering on markdown cell group
costs_md = []
K = range(1, 11)
for k in K:
    print("clustering with " + str(k) + " clusters")
    kproto = KPrototypes(n_clusters=k, init='Cao', n_jobs=4, verbose=0)
    clusters = kproto.fit_predict(
        md_df, categorical=[0, 1, 2, 3, 9, 10, 11, 13, 15, 19, 20, 22, 24])
    costs_md.append(kproto.cost_)

# save the costs plot
plt.plot(K, costs_md, 'bx-')
plt.xlabel('k')
plt.ylabel('cost')
plt.title('Cost Graph for Optimal k for Markdown Cell Group')
plt.savefig('figures/10-markdown-kproto.png')

# perform k-prototypes clustering on no markdown cell group
costs_no_md = []
K = range(1, 11)
for k in K:
    print("clustering with " + str(k) + " clusters")
    kproto = KPrototypes(n_clusters=k, init='Cao', n_jobs=4, verbose=0)
Example #21
std = StandardScaler()
for i in num:
    df_copy[i] = std.fit_transform(df_copy[i].values.reshape(-1, 1))

# taking indexes of categorical column
cat_columns_index = [
    df_copy.columns.get_loc(c) for c in cat.columns if c in df_copy.columns
]

# K-Prototypes
from kmodes.kprototypes import KPrototypes

X = df_copy.values
kproto = KPrototypes(n_clusters=12)
clusters = kproto.fit_predict(X, categorical=cat_columns_index)

# adding clusters to data
df_copy['cluster'] = clusters

# creating segments
seg1 = df_copy[df_copy['cluster'] == 0].sort_values(['age'],
                                                    axis=0,
                                                    ascending=True)
seg2 = df_copy[df_copy['cluster'] == 1].sort_values(['age'],
                                                    axis=0,
                                                    ascending=True)
seg3 = df_copy[df_copy['cluster'] == 2].sort_values(['age'],
                                                    axis=0,
                                                    ascending=True)
seg4 = df_copy[df_copy['cluster'] == 3].sort_values(['age'],
Example #22
print("Columns with categorical data")
print(c_data)

d = {}
k = 0
for i in customer.columns:
    d[i] = k
    k = k + 1

customer.rename(columns=d, inplace=True)

c_list = []
cluster_dict = {}
for i in range(1, 11):
    kproto = KPrototypes(n_clusters=i, init='Cao', verbose=2)
    clusters = kproto.fit_predict(customer, categorical=ind)
    cluster_dict[i] = clusters
    c_list.append(kproto.cost_)
    print("------------------------------------------------------")

sns.lineplot(x=range(1, len(c_list) + 1), y=c_list)  # k values start at 1

y = c_list
x = range(1, len(y) + 1)
kn = KneeLocator(x, y, curve='convex', direction='decreasing')
print("Number of clusters : ", kn.knee)

final_cluster = cluster_dict[kn.knee]
cd = {}
for i in range(kn.knee):
    cd[i] = []
Example #23
    encoding='utf-8',
    index=True)

# Estimation: K-Prototypes #
# Testing Number of Clusters #
K_MAX = 29
centroids_huang = []
centroids_cao = []
labels_huang = []
labels_cao = []
gamma_huang = []
gamma_cao = []
KK = range(1, K_MAX + 1)
for k in KK:
    km = KPrototypes(n_clusters=k, init='Huang', n_init=10, verbose=1)
    km.fit_predict(df_norm.values, categorical=[39, 40, 41])
    centroids_huang.append(km.cluster_centroids_)
    labels_huang.append(km.labels_)
    gamma_huang.append(km.gamma)
    km = KPrototypes(n_clusters=k, init='Cao', n_init=10, verbose=1)
    km.fit_predict(df_norm.values, categorical=[39, 40, 41])
    centroids_cao.append(km.cluster_centroids_)
    labels_cao.append(km.labels_)
    gamma_cao.append(km.gamma)
D_k_huang = [
    gower_distance_tocentroid(df_norm, cent, 1) for cent in centroids_huang
]
D_k_cao = [
    gower_distance_tocentroid(df_norm, cent, 1) for cent in centroids_cao
]
# axis=0: Horizontal. axis=1: Vertical
Example #24
# Merge the dataframes
df_model = df_encode.merge(df_standar,
                           left_index=True,
                           right_index=True,
                           how='left')

from kmodes.kprototypes import KPrototypes
import matplotlib.pyplot as plt
import seaborn as sns

# Iterate to obtain the cost for each k
cost = {}
for k in range(2, 10):
    kproto = KPrototypes(n_clusters=k, random_state=75)
    kproto.fit_predict(df_model, categorical=[0, 1, 2])
    cost[k] = kproto.cost_

# Visualize the elbow plot
sns.pointplot(x=list(cost.keys()), y=list(cost.values()))
plt.show()

# Save the model with 5 clusters, chosen from the elbow plot
import pickle

kproto = KPrototypes(n_clusters=5, random_state=75)
kproto = kproto.fit(df_model, categorical=[0, 1, 2])
pickle.dump(kproto, open('best_cluster.pkl', 'wb'))

# Assign each customer to a segment
clusters = kproto.predict(df_model, categorical=[0, 1, 2])
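The pickled model can be reloaded later to segment new customers, assuming the new data is encoded and scaled exactly like df_model; a minimal sketch:

import pickle

with open('best_cluster.pkl', 'rb') as f:
    kproto = pickle.load(f)
segments = kproto.predict(df_model, categorical=[0, 1, 2])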
Example #25
#!/usr/bin/env python

import numpy as np
from kmodes.kprototypes import KPrototypes

# people data; the first column of X is numerical and column 2 is categorical
syms = np.genfromtxt('people.csv', dtype=str, delimiter=',')[:, 0]
X = np.genfromtxt('people.csv', dtype=object, delimiter=',')[:, 1:]
X[:, 0] = X[:, 0].astype(float)

# per-sample weights: upweight the third and fourth rows
weights = [1] * 4
weights[2] = 100
weights[3] = 100

kproto = KPrototypes(n_clusters=3, init='Cao', verbose=2)
clusters = kproto.fit_predict(X, categorical=[2], sample_weight=weights)

# Print cluster centroids of the trained model.
print(kproto.cluster_centroids_)
# Print training statistics
print(kproto.cost_)
print(kproto.n_iter_)

for s, c in zip(syms, clusters):
    print(f"Symbol: {s}, cluster:{c}")
Example #26
        data=data,
        num_numerical=num_numerical_features,
        num_category=num_category_features,
        max_iters=10,
        mode=2)
    print("K_Means算法的Calinski-Harabaz Index值为:{}".format(
        metrics.calinski_harabasz_score(data, label_2)))
    label_3, center_numerical_3, center_category_3 = K_Prototypes(
        random_seed=2020,
        n=N,
        data=data,
        num_numerical=num_numerical_features,
        num_category=num_category_features,
        max_iters=10,
        mode=1)
    print("K_Modes算法的Calinski-Harabaz Index值为:{}".format(
        metrics.calinski_harabasz_score(data, label_3)))
    kp = KPrototypes(n_clusters=5,
                     init='Huang',
                     n_init=1,
                     verbose=True,
                     n_jobs=4,
                     random_state=2020)
    KPrototypes_results = kp.fit_predict(
        data,
        categorical=list(
            range(num_numerical_features,
                  num_numerical_features + num_category_features - 1)))
    print("K_Prototypes算法包的Calinski-Harabaz Index值为:{}".format(
        metrics.calinski_harabasz_score(data, KPrototypes_results)))
Example #27
#       model parameters
#
evalu = data_cats.copy()
evalu.drop(['GoodForKids'], axis=1, inplace=True)
evaluate_clusters(evalu, 50)
init = 'Huang'  # init can be 'Cao', 'Huang' or 'random'
n_clusters = 20  # how many clusters (hyper parameter)
max_iter = 100  # default 100

#       get the model
#
kproto = KPrototypes(n_clusters=n_clusters, init=init, verbose=2)
#
#       fit/predict
#
clusters = kproto.fit_predict(data_cats_matrix,
                              categorical=categoricals_indicies)
#
#       combine dataframe entries with resultant cluster_id
#
proto_cluster_assignments = zip(data_cats_matrix, clusters)

#       Instantiate dataframe to house new cluster data
#
cluster_df = pd.DataFrame(columns=('GoodForKids', 'stars',
                                   'RestaurantsPriceRange2', 'latitude',
                                   'longitude', 'Fac1', 'Fac2', 'cluster_id'))
#
#       load arrays back into a dataframe
#
for array in proto_cluster_assignments:
    cluster_df = cluster_df.append(
Example #28
def cluster_clients(k=None, save_centroids=True, save_clusters=True):
    '''
    Runs k-prototypes clustering algorithm on preprocessed dataset
    :param k: Desired number of clusters
    :param save_centroids: Boolean indicating whether to save cluster centroids
    :param save_clusters: Boolean indicating whether to save client cluster assignments
    :return: A KPrototypes object that describes the best clustering of all the runs
    '''
    cfg = yaml.full_load(open(os.getcwd() + "/config.yml", 'r'))

    # Load preprocessed client data
    try:
        client_df = pd.read_csv(cfg['PATHS']['CLIENT_DATA'])
    except FileNotFoundError:
        print("No file found at " + cfg['PATHS']['CLIENT_DATA'] + ". Running preprocessing of client data.")
        raw_df = load_raw_data(cfg)
        client_df = prepare_for_clustering(cfg, raw_df,  save_df=False)
    excluded_feats = cfg['K-PROTOTYPES']['FEATS_TO_EXCLUDE']
    client_df.drop(excluded_feats, axis=1, inplace=True)   # Features we don't want to see in clustering
    client_feats_df = client_df.copy()
    client_ids = client_df.pop('CONTRACT_ACCOUNT').tolist()
    cat_feats = [f for f in cfg['DATA']['CATEGORICAL_FEATS'] if f not in excluded_feats]
    bool_feats = [f for f in cfg['DATA']['BOOLEAN_FEATS'] if f not in excluded_feats]
    ordinal_encoder = OrdinalEncoder()
    client_df[cat_feats] = ordinal_encoder.fit_transform(client_df[cat_feats])
    X = np.array(client_df)

    # Get list of categorical feature indices. Boolean feats are considered categorical for clustering
    cat_feat_idxs = [client_df.columns.get_loc(c) for c in cat_feats + bool_feats if c in client_df]
    numcl_feat_idxs = [i for i in range(len(client_df.columns)) if i not in cat_feat_idxs]

    # Normalize noncategorical features
    X_noncat = X[:, numcl_feat_idxs]
    std_scaler = StandardScaler().fit(X_noncat)
    X_noncat = std_scaler.transform(X_noncat)
    X[:, numcl_feat_idxs] = X_noncat

    # Run k-prototypes algorithm on all clients and obtain cluster assignment (range [1, K]) for each client
    if k is None:
        k = cfg['K-PROTOTYPES']['K']
    k_prototypes = KPrototypes(n_clusters=k, verbose=1, n_init=cfg['K-PROTOTYPES']['N_RUNS'],
                               n_jobs=cfg['K-PROTOTYPES']['N_JOBS'], init='Cao', num_dissim=euclidean_dissim,
                               cat_dissim=matching_dissim)
    client_clusters = k_prototypes.fit_predict(X, categorical=cat_feat_idxs)
    k_prototypes.samples = X
    k_prototypes.labels = client_clusters
    k_prototypes.dist = lambda x0, x1: \
        k_prototypes.num_dissim(np.expand_dims(x0[numcl_feat_idxs], axis=0),
                                np.expand_dims(x1[numcl_feat_idxs], axis=0)) + \
        k_prototypes.gamma * k_prototypes.cat_dissim(np.expand_dims(x0[cat_feat_idxs], axis=0),
                                                     np.expand_dims(x1[cat_feat_idxs], axis=0))
    client_clusters += 1  # Enforce that cluster labels are integer range of [1, K]
    clusters_df = pd.DataFrame({'CONTRACT_ACCOUNT': client_ids, 'Cluster Membership': client_clusters})
    clusters_df = clusters_df.merge(client_feats_df, on='CONTRACT_ACCOUNT', how='left')
    clusters_df.set_index('CONTRACT_ACCOUNT')

    # Get centroids of clusters
    cluster_centroids = np.empty((k_prototypes.cluster_centroids_[0].shape[0],
                                  k_prototypes.cluster_centroids_[0].shape[1] +
                                  k_prototypes.cluster_centroids_[1].shape[1]))
    cluster_centroids[:, numcl_feat_idxs] = k_prototypes.cluster_centroids_[0]  # Numerical features
    cluster_centroids[:, cat_feat_idxs] = k_prototypes.cluster_centroids_[1]  # Categorical features

    # Scale noncategorical features of the centroids back to original range
    centroid_noncat_feats = cluster_centroids[:, numcl_feat_idxs]
    centroid_noncat_feats = std_scaler.inverse_transform(centroid_noncat_feats)
    cluster_centroids[:, numcl_feat_idxs] = centroid_noncat_feats

    # Create a DataFrame of cluster centroids
    centroids_df = pd.DataFrame(cluster_centroids, columns=list(client_df.columns))
    for i in range(len(cat_feats)):
        ordinal_dict = {j: ordinal_encoder.categories_[i][j] for j in range(len(ordinal_encoder.categories_[i]))}
        centroids_df[cat_feats[i]] = centroids_df[cat_feats[i]].map(ordinal_dict)
    centroids_df[bool_feats] = centroids_df[bool_feats].round()
    cluster_num_series = pd.Series(np.arange(1, cluster_centroids.shape[0] + 1))
    centroids_df.insert(0, 'Cluster', cluster_num_series)

    # Get fraction of clients in each cluster
    cluster_freqs = np.bincount(client_clusters) / float(client_clusters.shape[0])
    centroids_df.insert(1, '% of Clients', cluster_freqs[1:] * 100)

    # Save centroid features and cluster assignments to spreadsheet
    if save_centroids:
        centroids_df.to_csv(cfg['PATHS']['K-PROTOTYPES_CENTROIDS'] + datetime.now().strftime("%Y%m%d-%H%M%S") + '.csv',
                            index_label=False, index=False)
    if save_clusters:
        clusters_df.to_csv(cfg['PATHS']['K-PROTOTYPES_CLUSTERS'] + datetime.now().strftime("%Y%m%d-%H%M%S") + '.csv',
                           index_label=False, index=False)
    return k_prototypes
Example #29
import numpy as np
from sklearn import datasets

from kmodes.kprototypes import KPrototypes

iris = datasets.load_iris()

data = np.c_[iris['data'], iris['target']]

kp = KPrototypes(n_clusters=3, init='Huang', n_init=1, verbose=True)
kp.fit_predict(data, categorical=[4])

print(kp.cluster_centroids_)
print(kp.labels_)
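KPrototypes weighs categorical against numerical dissimilarity with a gamma factor that kmodes estimates from the spread of the numerical columns when not supplied (Example #11 prints kproto.gamma); it can also be fixed explicitly. A sketch reusing data from Example #29, with an arbitrary illustrative value:

# force a specific numerical/categorical trade-off instead of the estimated gamma
kp_weighted = KPrototypes(n_clusters=3, init='Huang', n_init=1, gamma=0.5)
kp_weighted.fit_predict(data, categorical=[4])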
Example #30
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from kmodes.kprototypes import KPrototypes
import pickle

dataset = pd.read_csv('model_data.csv')

matrix = dataset.to_numpy()

kproto = KPrototypes(n_clusters=5, init='Cao')
kproto.fit_predict(matrix,
                   categorical=[
                       6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
                       21, 22, 23, 24
                   ])

pickle.dump(kproto, open('model.pkl', 'wb+'))
Example #31
def infer_reachability(summaries, settings):
    all_prop = None
    inferences_per_summary = [set() for _ in summaries]
    for i, summary in enumerate(summaries):
        prop = {}
        ranks = []
        for flow in summary.get_flows():
            for edge in summary.get_edges(flow):
                rank = round(summary.get_edge_rank(flow, edge),
                             settings.precision)
                if rank > float(settings.threshold):
                    policy = nopticon.ReachabilityPolicy({
                        'flow': flow,
                        'source': edge[0],
                        'target': edge[1]
                    })
                    prop[policy] = len(ranks)
                    if settings.equiv_classes:
                        # TODO: genericize class
                        ranks.append([rank, edge[0][0], edge[1][0]])
                    else:
                        ranks.append([rank])

        if settings.equiv_classes:
            kproto = KPrototypes(n_clusters=2, init='Cao')
            # np.matrix is deprecated; a plain 2-D array works here
            clust = kproto.fit_predict(np.asarray(ranks), categorical=[1, 2])
        else:
            agg = AgglomerativeClustering(n_clusters=2, linkage="ward")
            clust = agg.fit(ranks).labels_

        fig = plt.figure()
        ax = plt.subplot()

        colors = ['green', 'red', 'blue', 'purple', 'cyan', 'orange']

        means = {}
        high = None
        for k in set(clust):
            kranks = [
                ranks[idx][0] for idx in prop.values() if clust[idx] == k
            ]
            means[k] = sum(kranks) / len(kranks)
            if high is None or means[k] > means[high]:
                high = k

        props_to_isect = set(
            [p for p, idx in prop.items() if clust[idx] == high])
        inferences_per_summary[i] = props_to_isect
        if all_prop is None:
            all_prop = props_to_isect
        else:
            all_prop = all_prop.intersection(props_to_isect)

        # exp_colors = ['green' for _ in ranks]
        # for (f,s,t), idx in prop.items():
        #     if t[0] != 'l':
        #         exp_colors[idx] = 'red'

        #         clust_colors = [colors[l] if l >= 0 else "black" for l in clust.labels_]

        #         for k in range(0,2):
        #             ax.scatter(clust.labels_, [r for rs in ranks for r in rs],
        #                        c=exp_colors)

        # fig.savefig("cluster.png")

    return (all_prop, inferences_per_summary)