Example #1
File: main.py  Project: dgcnz/k-all
def exec_kprototypes(df, choices_obj):

    print("Whitening data...", end='', flush=True)
    for header in choices_obj['numerical']:
        df[header + "_scaled"] = whiten(df[header])
    print("Done.")
    nums_scaled = [header + "_scaled" for header in choices_obj['numerical']]
    cats_not_scaled = list(choices_obj['categorical'])

    X = pd.concat(
        [df[nums_scaled].astype(float), df[cats_not_scaled].astype(str)],
        axis=1)
    k = int(input("Number of clusters:\n > "))
    kproto = KPrototypes(n_clusters=k, init='Cao', verbose=2)

    df['cluster_labels'] = kproto.fit_predict(
        X.values,
        categorical=list(
            range(len(X.columns) - len(cats_not_scaled), len(X.columns))))
    if len(nums_scaled) >= 2:
        # Plot clusters
        print("Only showing 2 dimensions of data (picking first two headers)")
        sns.scatterplot(x=nums_scaled[0],
                        y=nums_scaled[1],
                        hue='cluster_labels',
                        data=df)
        plt.show()
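A minimal usage sketch for the function above (hypothetical data and column names; assumes the snippet's own imports: pandas, seaborn, matplotlib, scipy.cluster.vq.whiten, and KPrototypes):

import pandas as pd

df = pd.DataFrame({
    'age': [23, 45, 31, 52, 28, 47],
    'income': [40000.0, 85000.0, 52000.0, 91000.0, 43000.0, 78000.0],
    'segment': ['a', 'b', 'a', 'b', 'a', 'b'],
})
choices_obj = {'numerical': ['age', 'income'], 'categorical': ['segment']}
exec_kprototypes(df, choices_obj)  # prompts interactively for the number of clusters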
Example #2
File: gco.py  Project: acardosoj/ml
    def k_prototypes_fitness(self, individual):

        self.individual = individual
        df_cluster = self.X.copy()
        if self.add_target:
            self.individual = [1] + self.individual

        # Check whether this individual was already evaluated (up to the 2nd decimal)
        ind_curr = [round(float(y), 2) for y in individual]
        for x in self.results:
            ind_test_norm = [round(float(y), 2) for y in x[:-1]]
            if ind_test_norm == ind_curr:
                print('already calculated')
                return float(x[-1]),

        # Apply the individual's weights to the numerical features
        for i in self.numerical_index:
            df_cluster.iloc[:, i] = self.individual[i] * df_cluster.iloc[:, i]
        random.seed(10)
        kproto = KPrototypes(n_clusters=self.cluster_param, cat_dissim=self.matching_dissim_weighted,  # num_dissim=euclidean_dissim_weighted,
                             max_iter=5, verbose=0, gamma=1, n_init=1, init='random', random_state=10)
        kproto.fit(df_cluster.values, categorical=self.categorical_index)
        ftnss = self.calculate_fitness(kproto.labels_, kproto)

        self.save_scoring(self.individual, ftnss, kproto)
        self.results.append(self.individual + [ftnss])

        return ftnss,
Example #3
    def predict(self):
        with open(self.data_processed, 'rb') as f:
            self.dataset = pickle.load(f)
        with open(self.label_file, 'rb') as f:
            self.label = pickle.load(f)

        # self.y_pred = KMeans(n_clusters=5, random_state=9).fit_predict(self.dataset)
        # np.savetxt(self.cluster_result, np.column_stack((self.y_pred, self.dataset)), delimiter=',')
        # score = metrics.calinski_harabasz_score(self.dataset, self.y_pred)
        # print(score)
        kproto = KPrototypes(n_clusters=5, init='Cao', verbose=2)
        clusters = kproto.fit_predict(self.dataset, categorical=[1])

        temp = np.loadtxt(fname=self.data_cleaned, dtype=object, delimiter=',')
        room_identity = temp[1:, :3]

        self.result = np.column_stack((room_identity, self.dataset, clusters))

        print(kproto.cluster_centroids_)
        # Print training statistics
        print(kproto.cost_)
        print(kproto.n_iter_)

        with open(self.result_binary, 'wb') as f:
            pickle.dump(self.result, f)
        with open('kproto_res', 'wb') as f:
            pickle.dump(kproto, f)

        with open(self.cluster_result, 'w') as f:
            re = self.result.tolist()
            for line in re:
                f.write("\t".join(list(map(str, line))) + '\n')

        for s, c in zip(self.label, clusters):
            print("Room identity: {}, cluster:{}".format(s, c))
Example #4
def get_knee_results(data, cluster_lims, cores, categorical):

    knee_results = []
    cluster_range = range(*cluster_lims)
    for n_clusters in tqdm(cluster_range):

        kp = KPrototypes(n_clusters, init="cao", random_state=0, n_jobs=cores)
        kp.fit(data[cols], categorical=categorical)

        knee_results.append(kp.cost_)

    kl = KneeLocator(
        cluster_range,
        knee_results,
        curve="convex",  # older kneed releases named these curve_nature / curve_direction
        direction="decreasing",
    )

    n_clusters = kl.knee

    with open(OUT_DIR / "n_clusters.txt", "w") as f:
        f.write(str(n_clusters))

    knee_results = pd.Series(index=cluster_range, data=knee_results)
    knee_results.to_csv(OUT_DIR / "knee_results.csv", header=False)

    return n_clusters
Example #5
def kprototypesCluster(features: np.ndarray, catCols: list, nClust: int):

    # catCols holds the indices of the categorical columns;
    # continuous columns are expected to already be float
    model = KPrototypes(n_clusters=nClust, verbose=2)
    model.fit_predict(features, categorical=catCols)

    return model
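A hedged usage sketch (purely hypothetical array; the string column at index 2 is passed via catCols):

import numpy as np

features = np.array([
    [1.2, 3.4, 'red'],
    [0.8, 2.9, 'blue'],
    [5.1, 7.7, 'red'],
    [4.9, 8.1, 'blue'],
], dtype=object)
model = kprototypesCluster(features, catCols=[2], nClust=2)
print(model.cluster_centroids_)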
Example #6
    def kproto(self, K=20, N=int(1e5), MN=4, T=10, type='cao', save=True):
        data = self.to_numpy()

        M = data.shape[1]
        # MN = 22
        if type == 'huang':
            model = KPrototypes(n_clusters=K,
                                init='Huang',
                                n_init=1,
                                verbose=1)
        elif type == 'cao':
            model = KPrototypes(n_clusters=K,
                                init='Cao',
                                verbose=2,
                                max_iter=10000)
        else:
            raise ValueError("type must be 'huang' or 'cao'")

        clusters = model.fit_predict(
            data,
            categorical=[0, 3, 6, 8] if self.cl_type == 'prop' else [
                0, 2, 3, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
                24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35
            ])
        if save:
            self.save(model, 'Clustering_kproto_model')
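        # note: newer kmodes releases (>= 0.11) return cluster_centroids_ as a single
        # combined array, so the [0]/[1] indexing below assumes an older version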
        return np.array(model.cluster_centroids_[0]), np.array(
            model.cluster_centroids_[1]), np.array(clusters)
Example #7
    def making_model(self):
        kproto = KPrototypes(n_clusters=5, random_state=75)
        kproto = kproto.fit(self.df_model, categorical=[0, 1, 2])

        # Save the fitted model
        pickle.dump(kproto, open('cluster.pkl', 'wb'))

        self.kproto = kproto
Example #8
def get_labels(data, n_clusters, cores, categorical):

    kp = KPrototypes(n_clusters,
                     init="matching",  # note: kmodes documents 'Huang', 'Cao', and 'random' as init options
                     n_init=50,
                     random_state=0,
                     n_jobs=cores)
    kp.fit(data[cols], categorical=categorical)
    print(kp.cost_)

    return kp.labels_
Example #9
    def find_optimalCluster(self):  # Find the optimal number of clusters
        # Iterate over k to collect the cost values
        cost = {}
        for k in range(2, 10):
            kproto = KPrototypes(n_clusters=k, random_state=75)
            kproto.fit_predict(self.df_model, categorical=[0, 1, 2])
            cost[k] = kproto.cost_

        # Visualize the elbow plot
        sns.pointplot(x=list(cost.keys()), y=list(cost.values()))
        plt.show()
Example #10
def kprototypes_cluster(df, n_clusters, category, hover_text):
    datadf = df.loc[:, df.columns != hover_text]
    kmodes_instance = KPrototypes(n_clusters=n_clusters, init='Cao', verbose=2)
    clusters = kmodes_instance.fit_predict(datadf, categorical=category)
    data_array = np.array(datadf.to_numpy().tolist())
    col_len = len(datadf.columns)
    if col_len == 2:
        return scat2d(data_array, clusters, hover_text, df)
    else:
        return scat3d(data_array, clusters, hover_text, df)
Example #11
    def kproto(self):  # TODO- solve clustering issue with PCA + K-means
        cluster_data = self.data
        opt_k = self.silouhette_analysis(cluster_data, prototype=True)

        kp = KPrototypes(n_clusters=opt_k)
        kp.fit(cluster_data, categorical=self.categorical_features)
        labels = kp.predict(cluster_data,
                            categorical=self.categorical_features)

        cluster_data['labels'] = labels
        self.data_clustered = cluster_data

        return cluster_data
Example #12
def KPrototypes_cluster(input_data, k_clusters):
    from kmodes.kprototypes import KPrototypes
    # Normalize the numerical columns
    normalized = preprocessing.StandardScaler()
    input_data[input_data.select_dtypes(
        include=['float', 'integer']).columns] = normalized.fit_transform(
            input_data[input_data.select_dtypes(
                include=['float', 'integer']).columns])
    input_data = input_data.to_numpy()  # DataFrame.as_matrix() was removed in pandas 1.0

    kproto = KPrototypes(n_clusters=k_clusters, init='Cao', verbose=2)
    clus_kmeans_fit = kproto.fit_predict(input_data,
                                         categorical=[0, 1, 2, 3, 4, 5, 6, 7])
    return clus_kmeans_fit
Example #13
File: exp.py  Project: pzhang101/nopticon
def cluster(summ, agg_classes=None):
    """
    Clusters summary info using DBSCAN if agg_classes is provided it uses K-Prototypes
    """
    all_prop = None
    prop = {}
    ranks = []
    for flow, edge in summ.get_flowedges():
        rank = round(summ.get_edge_rank(flow, edge), 2)
        if rank < 0.5:
            continue
        # else:
        # print(flow, edge, rank)

        policy = nopticon.ReachabilityPolicy({
            'flow': flow,
            'source': edge[0],
            'target': edge[1]
        })
        prop[policy] = len(ranks)
        if agg_classes is not None:
            ranks.append([rank, agg_classes[edge[0]], agg_classes[edge[1]]])
        else:
            ranks.append([rank])

    if agg_classes is not None:
        kproto = KPrototypes(n_clusters=3, init='Huang')
        clust = kproto.fit_predict(np.asarray(ranks), categorical=[1, 2])
    else:
        km = KMeans(n_clusters=2)  # n_jobs was removed in scikit-learn 1.0
        clust = km.fit(ranks).labels_

    assert len(clust) == len(ranks)
    means = {}
    high = None
    for k in set(clust):
        kranks = [ranks[idx][0] for idx in prop.values() if clust[idx] == k]
        means[k] = sum(kranks) / len(kranks)
        if high is None or means[k] > means[high]:
            high = k

    for p, idx in prop.items():
        if clust[idx] == high:
            # print("\tHIGH:", ranks[idx], p)
            summ.mark_cluster_accepted(p.flow(), p.edge())
        # else:
        # print("\tlow:", ranks[idx], p)
    return
Example #14
    def get_clusters(self,
                     df,
                     var_list,
                     k_values,
                     map_sa_districts,
                     path_out,
                     cat_list=[]):

        for k in k_values:
            # k prototype
            KPro_model = KPro(n_clusters=k)
            #df_geo.loc[:, columns4] = preprocessing.normalize(df_geo.loc[:, columns4].values)
            KPro_fit = KPro_model.fit(X=df[var_list], categorical=cat_list)
            df['KPrototype cluster labels'] = KPro_fit.labels_

            # plot
            self.plot_clusters(k, df, var_list, map_sa_districts, path_out)

        return df
Example #15
def kprototypes_compute_metrics_for_every_cluster_number(
        clusters_range_lower_bound,
        clusters_range_upper_bound,
        dataset,
        distance_algorithm,
        init_Cao_or_Huang_for_kprototypes,
        list_categorical_features_indeces_for_kprototypes,
        print_optimum_metrics=True):

    kprototypes_list_metrics = []

    for num_of_clusters in range(clusters_range_lower_bound,
                                 clusters_range_upper_bound):

        kprototypes = KPrototypes(n_clusters=int(num_of_clusters),
                                  init=str(init_Cao_or_Huang_for_kprototypes),
                                  n_init=50,
                                  verbose=0)
        predictions = kprototypes.fit_predict(
            dataset,
            categorical=list_categorical_features_indeces_for_kprototypes)
        centers = kprototypes.cluster_centroids_
        cost_function = kprototypes.cost_
        n_iters = kprototypes.n_iter_  # iterations until convergence (not parallel jobs)
        error_metric = cost_function
        silhouette = silhouette_score(dataset, predictions, metric=distance_algorithm)

        kprototypes_list_metrics.append({
            'clusters': num_of_clusters,
            'silhouette': silhouette,
            'error': error_metric,
            'n_iters': n_iters
        })
        if print_optimum_metrics is True:
            print(
                "For n_clusters = {}, silhouette score is {}, cluster_errors is {}, "
                "n_iters {}".format(num_of_clusters, silhouette, error_metric,
                                    n_iters))

    return kprototypes_list_metrics
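A hedged usage sketch (X is a hypothetical, fully numeric array whose last column is a label-encoded categorical feature; the metric name follows scikit-learn's silhouette_score):

results = kprototypes_compute_metrics_for_every_cluster_number(
    2, 8, X, 'euclidean', 'Cao', [X.shape[1] - 1],
    print_optimum_metrics=False)
best = max(results, key=lambda m: m['silhouette'])  # pick k with the best silhouette
print(best['clusters'])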
Example #16
def ClusterCreation(request,*args):
    global kproto

    #Example of clustering with random data
    '''
    # random categorical data
    data = np.array([
            [0,'a',4],
            [1,'e',3],
            [6,'ffed',15],
            [5,'fdfd',16]
            ])

    kproto = KPrototypes(n_clusters=2, init='Cao', verbose=2)
    clusters = kproto.fit(data, categorical=[1])
    # Create CSV with cluster statistics
    clusterStatisticsCSV(kproto)
    for argument in args:
        if argument is not None:
            return
    '''
    # Get data from database
    rows = get_training_data()
    # Cast to a numpy array
    rows_array = np.array(rows)
    # Split the data into variables and ids
    data_array = rows_array[:, 1:]  # keep only the variables that can cluster the client
    ids_array = rows_array[:, 0]  # store the ids in a separate array

    #Clustering
    kproto = KPrototypes(n_clusters=3, init='Cao', verbose=2)
    #clusters = kproto.fit(data_array, categorical=[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26])
    clusters = kproto.fit(data_array,categorical=[1, 2, 3, 4])
    # Create CSV with cluster statistics
    clusterStatisticsCSV(kproto)
    for argument in args:
        if argument is not None:
            return

    return HttpResponse('Clustering completed and CSV report generated')
Example #17
File: gco.py  Project: acardosoj/ml
    def return_best_cluster(self,df_cluster,cluster_param):
        if self.cluster_method == 'kprototypes':

            # Apply the individual's weights to the numerical features
            for i in self.numerical_index:
                df_cluster.iloc[:, i] = self.individual[i] * df_cluster.iloc[:, i]

            if os.path.exists(self.folder + 'cluster_init.json'):
                with open(self.folder + 'cluster_init.json') as f:
                    cluster_init = json.load(f)
                ftnss = 100000  # sentinel; any real fitness should be lower
                winner_model = None
                for init in cluster_init:
                    init = [np.array(init[0]), np.array(init[1])]
                    kproto = KPrototypes(n_clusters=cluster_param, cat_dissim=self.matching_dissim_weighted,  # num_dissim=euclidean_dissim_weighted,
                                    max_iter=5, verbose=1, gamma=1, n_init=1, init=init)
                    kproto.fit(df_cluster.values, categorical=self.categorical_index)

                    x = pd.DataFrame([])
                    x['cluster'] = kproto.labels_
                    x['target'] = self.target
                    df_grouped = x.groupby(['cluster'])['target'].max() - x.groupby(['cluster'])['target'].min()
                    curr_ftnss = df_grouped.values.sum()
                    print(curr_ftnss)

                    # Keep the model with the lowest fitness so far
                    if curr_ftnss < ftnss:
                        ftnss = curr_ftnss
                        winner_model = kproto
            else:
                kproto = KPrototypes(n_clusters=self.cluster_param, cat_dissim=self.matching_dissim_weighted, #num_dissim=euclidean_dissim_weighted, 
                                max_iter=5, verbose=1, gamma=1,n_init=1, init = 'Cao')
                kproto.fit(df_cluster.values,categorical = self.categorical_index)            
                curr_ftnss = self.calculate_fitness(kproto.labels_)                
                winner_model = kproto                     
            
            dump(winner_model,self.folder+'best_model.joblib')
            self.df['cluster'] = winner_model.labels_
            return winner_model
        
        elif self.cluster_method == 'hdbscan':
            clusterer = hdb.HDBSCAN(min_cluster_size=cluster_param, prediction_data=True)
            clusterer.fit(df_cluster)    
            dump(clusterer,self.folder+'best_model.joblib')
            self.df['cluster'] = clusterer.labels_
            return clusterer   
Example #18
    def agruparDados(self, file):

        style.use("ggplot")
        caminho = 'C:/Users/Teste/Desktop/10 semestre/tcc2/Arquivos de Logs/Arquivos de Logs/Ameaças/Novos/trainThreats.csv'
        colors = [
            'b', 'orange', 'g', 'r', 'c', 'm', 'y', 'k', 'Brown', 'ForestGreen'
        ]
        # Data columns include the threat's category and severity, among others
        #category = np.genfromtxt(caminho, dtype=str, delimiter=',', skip_header=1)[:, 9]  # category
        #severity = np.genfromtxt(caminho, dtype=str, delimiter=',', skip_header=1)[:, 8]  # severity
        X = np.genfromtxt(caminho, dtype=object, delimiter=',',
                          skip_header=1)[:, 1:]

        kproto = KPrototypes(n_clusters=4, init='Cao', verbose=2)
        clusters = kproto.fit_predict(
            X, categorical=[0, 1, 2, 3, 4, 7, 8, 9, 10, 11, 12, 13, 14])

        file['Clusters'] = clusters

        # Print cluster centroids of the trained model.
        print(kproto.cluster_centroids_)
        # Print training statistics
        print(kproto.cost_)
        print(kproto.n_iter_)
        print(kproto.gamma)
        '''plt.scatter(X[clusters == 0, 8], X[clusters == 0, 9], c='purple', alpha=0.5, s=150,  label='Cluster 0')
        plt.scatter(X[clusters == 1, 8], X[clusters == 1, 9], c='black', alpha=0.5, s=150,  label='Cluster 1')
        plt.scatter(X[clusters == 2, 8], X[clusters == 2, 9], c='red', alpha=0.5, s=150,  label='Cluster 2')
        plt.scatter(X[clusters == 3, 8], X[clusters == 3, 9], c='green', alpha=0.5, s=150,  label='Cluster 3')
        plt.scatter(X[clusters == 4, 8], X[clusters == 4, 9], c='blue', alpha=0.5, s=100, label='Cluster 4')
        plt.scatter(X[clusters == 5, 8], X[clusters == 5, 9], c='yellow', alpha=0.5, s=100, label='Cluster 5')
        plt.xlabel('Severity')
        plt.ylabel('Category')
        plt.legend()
        plt.show()'''

        self.lerXML(file)
Example #19
    def get_labels(self, data, reprocess=False):
        data_original = data
        data = self._preprocessed_data(data)

        categorical_indices = get_categorical_indices(data)
        if not categorical_indices:
            return self._fallback_algorithm(data_original)

        if self.model is None or reprocess:
            data = encode_nominal_parameters(data)
            data = normalized_dataset(data, categorical_indices)

            initial_centers = self._get_initial_centers(
                data, categorical_indices)
            self.model = KPrototypes(n_clusters=self.cluster_number,
                                     max_iter=1000,
                                     init=initial_centers,
                                     n_init=10,
                                     gamma=self.categorical_weight,
                                     num_dissim=dissimilarity_python.euclidean,
                                     n_jobs=1)
            data = data.values
            self.model.fit(data, categorical=categorical_indices)
            self.labels = self.model.predict(data,
                                             categorical=categorical_indices)
            self.centers = self.model.cluster_centroids_
            centers = self.centers[0]
            for index, cat_index in enumerate(categorical_indices):
                centers = np.insert(centers,
                                    cat_index,
                                    values=self.centers[1].transpose()[index],
                                    axis=1)
            self.centers = centers
        else:
            self.labels = self.model.predict(data)

        return self.labels
Example #20
def plot_costs(X, min_k, max_k):
    """Plots sse for values of k between min_k and max_k
    Args:
    - X - feature matrix
    - min_k, max_k - smallest and largest k to plot sse for
    return: list of costs
    """
    k_values = range(min_k, max_k + 1)
    costs = []
    for k in k_values:
        kp = KPrototypes(n_clusters=k,
                         init='Cao',
                         n_init=22,
                         verbose=0,
                         random_state=4,
                         n_jobs=4)
        kp.fit_predict(X, categorical=[1, 2, 3])
        costs.append(kp.cost_)
    plt.plot(k_values, costs)
    plt.xlabel('k')
    plt.ylabel('costs')
    plt.savefig("../image/kprototype_costs.png")  # save before show(): show() clears the figure
    plt.show()
    return costs
Example #21
def clusterization(data, clusters, method):
    if method == 'kmeans':
        model = KMeans(n_clusters=clusters, init='random', algorithm='full')
        model.fit(data)
        clustering_labels = model.predict(data)
    elif method == 'agglomerative':
        linkage = ('ward', 'average', 'complete', 'single')
        model = AgglomerativeClustering(linkage=linkage[0], n_clusters=clusters)
        model.fit(data)
        clustering_labels = model.labels_
    elif method == 'fuzzy':
        cntr, u, u0, d, jm, p, fpc = fuzz.cluster.cmeans(data.T, clusters, 2, error=0.005, maxiter=10000, init=None)
        clustering_labels = np.argmax(u, axis=0)
    elif method == 'kprototypes':
        clustering_labels = KPrototypes(n_clusters=clusters, init='random', gamma=0.1, n_init=1).fit_predict(data, categorical=[8])
    else:
        print("The supported methods are: kmeans, agglomerative, fuzzy and kprototypes.")
        clustering_labels = None
    return clustering_labels
Example #22
def create_elbowgraph(n, df, type="kmeans", categorical=[0]):
    if type == "kmeans":
        clusters = []
        for i in range(1, n):
            kmeans = KMeans(n_clusters=i, random_state=1).fit(df)
            clusters.append(kmeans.inertia_)
            print("Calculated kmeans with " + str(i) + " clusters")

    elif type == "kproto":
        clusters = []
        for i in range(1, n):
            kproto = KPrototypes(n_clusters=i, init='random',
                                 random_state=1).fit(df,
                                                     categorical=categorical)
            clusters.append(kproto.cost_)
            print("Calculated kproto with " + str(i) + " clusters")

    plt.plot(range(1, n), clusters, 'go--')
    plt.title("Elbow graph")
    plt.xlabel("Number of clusters")
    plt.ylabel("Within-cluster sum of squares (inertia)")
Example #23
import numpy as np
from sklearn import datasets

from kmodes.kprototypes import KPrototypes

iris = datasets.load_iris()

data = np.c_[iris['data'], iris['target']]

kp = KPrototypes(n_clusters=3, init='Huang', n_init=1, verbose=True)
kp.fit_predict(data, categorical=[4])

print(kp.cluster_centroids_)
print(kp.labels_)
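Since the categorical column here is the iris target itself, a hedged sanity check (not part of the original) is to compare the clustering against that target:

from sklearn.metrics import adjusted_rand_score
print(adjusted_rand_score(iris['target'], kp.labels_))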
Example #24
        data=data,
        num_numerical=num_numerical_features,
        num_category=num_category_features,
        max_iters=10,
        mode=2)
    print("K_Means算法的Calinski-Harabaz Index值为:{}".format(
        metrics.calinski_harabasz_score(data, label_2)))
    label_3, center_numerical_3, center_category_3 = K_Prototypes(
        random_seed=2020,
        n=N,
        data=data,
        num_numerical=num_numerical_features,
        num_category=num_category_features,
        max_iters=10,
        mode=1)
    print("K_Modes算法的Calinski-Harabaz Index值为:{}".format(
        metrics.calinski_harabasz_score(data, label_3)))
    kp = KPrototypes(n_clusters=5,
                     init='Huang',
                     n_init=1,
                     verbose=True,
                     n_jobs=4,
                     random_state=2020)
    KPrototypes_results = kp.fit_predict(
        data,
        categorical=list(
            range(num_numerical_features,
                  num_numerical_features + num_category_features - 1)))
    print("K_Prototypes算法包的Calinski-Harabaz Index值为:{}".format(
        metrics.calinski_harabasz_score(data, KPrototypes_results)))
Example #25
def cluster_clients(k=None, save_centroids=True, save_clusters=True):
    '''
    Runs k-prototypes clustering algorithm on preprocessed dataset
    :param k: Desired number of clusters
    :param save_centroids: Boolean indicating whether to save cluster centroids
    :param save_clusters: Boolean indicating whether to save client cluster assignments
    :return: A KPrototypes object that describes the best clustering of all the runs
    '''
    cfg = yaml.full_load(open(os.getcwd() + "/config.yml", 'r'))

    # Load preprocessed client data
    try:
        client_df = pd.read_csv(cfg['PATHS']['CLIENT_DATA'])
    except FileNotFoundError:
        print("No file found at " + cfg['PATHS']['CLIENT_DATA'] + ". Running preprocessing of client data.")
        raw_df = load_raw_data(cfg)
        client_df = prepare_for_clustering(cfg, raw_df,  save_df=False)
    excluded_feats = cfg['K-PROTOTYPES']['FEATS_TO_EXCLUDE']
    client_df.drop(excluded_feats, axis=1, inplace=True)   # Features we don't want to see in clustering
    client_feats_df = client_df.copy()
    client_ids = client_df.pop('CONTRACT_ACCOUNT').tolist()
    cat_feats = [f for f in cfg['DATA']['CATEGORICAL_FEATS'] if f not in excluded_feats]
    bool_feats = [f for f in cfg['DATA']['BOOLEAN_FEATS'] if f not in excluded_feats]
    ordinal_encoder = OrdinalEncoder()
    client_df[cat_feats] = ordinal_encoder.fit_transform(client_df[cat_feats])
    X = np.array(client_df)

    # Get list of categorical feature indices. Boolean feats are considered categorical for clustering
    cat_feat_idxs = [client_df.columns.get_loc(c) for c in cat_feats + bool_feats if c in client_df]
    numcl_feat_idxs = [i for i in range(len(client_df.columns)) if i not in cat_feat_idxs]

    # Normalize noncategorical features
    X_noncat = X[:, numcl_feat_idxs]
    std_scaler = StandardScaler().fit(X_noncat)
    X_noncat = std_scaler.transform(X_noncat)
    X[:, numcl_feat_idxs] = X_noncat

    # Run k-prototypes algorithm on all clients and obtain cluster assignment (range [1, K]) for each client
    if k is None:
        k = cfg['K-PROTOTYPES']['K']
    k_prototypes = KPrototypes(n_clusters=k, verbose=1, n_init=cfg['K-PROTOTYPES']['N_RUNS'],
                               n_jobs=cfg['K-PROTOTYPES']['N_JOBS'], init='Cao', num_dissim=euclidean_dissim,
                               cat_dissim=matching_dissim)
    client_clusters = k_prototypes.fit_predict(X, categorical=cat_feat_idxs)
    k_prototypes.samples = X
    k_prototypes.labels = client_clusters
    k_prototypes.dist = lambda x0, x1: \
        k_prototypes.num_dissim(np.expand_dims(x0[numcl_feat_idxs], axis=0),
                                np.expand_dims(x1[numcl_feat_idxs], axis=0)) + \
        k_prototypes.gamma * k_prototypes.cat_dissim(np.expand_dims(x0[cat_feat_idxs], axis=0),
                                                     np.expand_dims(x1[cat_feat_idxs], axis=0))
    client_clusters += 1  # Enforce that cluster labels are integer range of [1, K]
    clusters_df = pd.DataFrame({'CONTRACT_ACCOUNT': client_ids, 'Cluster Membership': client_clusters})
    clusters_df = clusters_df.merge(client_feats_df, on='CONTRACT_ACCOUNT', how='left')
    clusters_df.set_index('CONTRACT_ACCOUNT')  # note: result is discarded, so CONTRACT_ACCOUNT stays a column for the CSV export

    # Get centroids of clusters
    cluster_centroids = np.empty((k_prototypes.cluster_centroids_[0].shape[0],
                                  k_prototypes.cluster_centroids_[0].shape[1] +
                                  k_prototypes.cluster_centroids_[1].shape[1]))
    cluster_centroids[:, numcl_feat_idxs] = k_prototypes.cluster_centroids_[0]  # Numerical features
    cluster_centroids[:, cat_feat_idxs] = k_prototypes.cluster_centroids_[1]  # Categorical features

    # Scale noncategorical features of the centroids back to original range
    centroid_noncat_feats = cluster_centroids[:, numcl_feat_idxs]
    centroid_noncat_feats = std_scaler.inverse_transform(centroid_noncat_feats)
    cluster_centroids[:, numcl_feat_idxs] = centroid_noncat_feats

    # Create a DataFrame of cluster centroids
    centroids_df = pd.DataFrame(cluster_centroids, columns=list(client_df.columns))
    for i in range(len(cat_feats)):
        ordinal_dict = {j: ordinal_encoder.categories_[i][j] for j in range(len(ordinal_encoder.categories_[i]))}
        centroids_df[cat_feats[i]] = centroids_df[cat_feats[i]].map(ordinal_dict)
    centroids_df[bool_feats] = centroids_df[bool_feats].round()
    cluster_num_series = pd.Series(np.arange(1, cluster_centroids.shape[0] + 1))
    centroids_df.insert(0, 'Cluster', cluster_num_series)

    # Get fraction of clients in each cluster
    cluster_freqs = np.bincount(client_clusters) / float(client_clusters.shape[0])
    centroids_df.insert(1, '% of Clients', cluster_freqs[1:] * 100)

    # Save centroid features and cluster assignments to spreadsheet
    if save_centroids:
        centroids_df.to_csv(cfg['PATHS']['K-PROTOTYPES_CENTROIDS'] + datetime.now().strftime("%Y%m%d-%H%M%S") + '.csv',
                            index_label=False, index=False)
    if save_clusters:
        clusters_df.to_csv(cfg['PATHS']['K-PROTOTYPES_CLUSTERS'] + datetime.now().strftime("%Y%m%d-%H%M%S") + '.csv',
                           index_label=False, index=False)
    return k_prototypes
Example #26
#!/usr/bin/env python

import numpy as np
from kmodes.kprototypes import KPrototypes

# stocks with their market caps, sectors and countries
syms = np.genfromtxt('stocks.csv', dtype=str, delimiter=',')[:, 0]
X = np.genfromtxt('stocks.csv', dtype=object, delimiter=',')[:, 1:]
X[:, 0] = X[:, 0].astype(float)

kproto = KPrototypes(n_clusters=4, init='Cao', verbose=2)
clusters = kproto.fit_predict(X, categorical=[1, 2])

# Print cluster centroids of the trained model.
print(kproto.cluster_centroids_)
# Print training statistics
print(kproto.cost_)
print(kproto.n_iter_)

for s, c in zip(syms, clusters):
    print(f"Symbol: {s}, cluster:{c}")
Example #27
def cao():
    KPrototypes(n_clusters=K, init='Cao', verbose=2)\
        .fit_predict(data, categorical=list(range(M - MN, M)))
Example #28
def huang():
    KPrototypes(n_clusters=K, init='Huang', n_init=1, verbose=2)\
        .fit_predict(data, categorical=list(range(M - MN, M)))
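Both one-liners above rely on module-level globals; a minimal sketch of the assumed setup (names and values are hypothetical, mirroring example #6's use of M and MN):

import numpy as np
from kmodes.kprototypes import KPrototypes

K = 3  # number of clusters (assumed)
data = np.genfromtxt('data.csv', dtype=object, delimiter=',')  # hypothetical input file
M = data.shape[1]  # total number of columns
MN = 2  # number of trailing categorical columns (assumed)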
Example #29
#!/usr/bin/env python
import numpy as np
from kmodes.kprototypes import KPrototypes
import pandas as pd

# stocks with their market caps, sectors and countries
syms = np.genfromtxt('stocks.csv', dtype=str, delimiter=',')[:, 0]
X = np.genfromtxt('stocks.csv', dtype=object, delimiter=',')[:, 1:]

X[:, 0] = X[:, 0].astype(float)
kproto = KPrototypes(n_clusters=3, init='Cao', verbose=8)
clusters = kproto.fit_predict(
    X, categorical=[1, 2])  #TC: define categorical variables here

# Print cluster centroids of the trained model.
print("\nCluster centroid")
print(kproto.cluster_centroids_)
# Print training statistics
print("\nCost")
print(kproto.cost_)
print("\nNumber of iterations")
print(kproto.n_iter_)
"""for s, c in zip(syms, clusters):
    print("Symbol: {}, cluster:{}".format(s, c))"""

print("\nClustering result")
df = pd.DataFrame(zip(syms, clusters))
df.columns = ["Symbol", "Cluster"]
print(df)
Example #30
# (snippet begins mid-script; `ms` is presumably a fitted MeanShift instance)
ms.fit(df_cust_num_norm)
labels = ms.labels_
cluster_centers = ms.cluster_centers_
df["Labels"] = ms.predict(df_cust_num_norm)
cols = customer_related_num + ["labels"]
cc_mshift = df[cols].groupby("labels").mean()
sizes = df["labels"].value_counts()

######## Categorical ###########
### 1. Approach: K-Prototype with categorical and numerical Features
scaler = StandardScaler()
cust_norm = scaler.fit_transform(df[customer_related_num])
df_num_norm = pd.DataFrame(cust_norm, columns=customer_related_num)
df_cust_norm = df_num_norm.join(df[customer_related_cat])
# create_elbowgraph(10, df_cust_norm, "kproto", [4,5,6,7,8] )
kproto = KPrototypes(n_clusters=3, init='random', random_state=1)
model = kproto.fit(df_cust_norm, categorical=[4, 5, 6, 7, 8, 9])
# Inverse Normalization for Interpretation
cc_kproto_num = pd.DataFrame(
    scaler.inverse_transform(X=model.cluster_centroids_[0]))
cc_kproto = pd.concat(
    [cc_kproto_num, pd.DataFrame(model.cluster_centroids_[1])], axis=1)
cc_kproto.columns = customer_related

###### 2. Approach: Categorical Kmodes ########
kmodes = KModes(n_clusters=4)
temp_kmodes = kmodes.fit_predict(df[customer_related_cat])
kmcc = pd.DataFrame(kmodes.cluster_centroids_, columns=customer_related_cat)

df["cat_cluster"] = temp_kmodes
Example #31
# (snippet begins mid-loop: label-encode each categorical column)
    df_encode[col] = LabelEncoder().fit_transform(df_encode[col])

# Merge the dataframes
df_model = df_encode.merge(df_standar,
                           left_index=True,
                           right_index=True,
                           how='left')

from kmodes.kprototypes import KPrototypes
import matplotlib.pyplot as plt
import seaborn as sns

# Iterate to obtain the cost values
cost = {}
for k in range(2, 10):
    kproto = KPrototypes(n_clusters=k, random_state=75)
    kproto.fit_predict(df_model, categorical=[0, 1, 2])
    cost[k] = kproto.cost_

# Visualize the elbow plot
sns.pointplot(x=list(cost.keys()), y=list(cost.values()))
plt.show()

# Save the model with 5 clusters, chosen from the elbow plot
import pickle

kproto = KPrototypes(n_clusters=5, random_state=75)
kproto = kproto.fit(df_model, categorical=[0, 1, 2])
pickle.dump(kproto, open('best_cluster.pkl', 'wb'))

# Determine each customer's segment
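The snippet ends mid-step; a hedged continuation of the final comment, assuming the same df_model and categorical indices as above:

df_model['cluster'] = kproto.predict(df_model, categorical=[0, 1, 2])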